### Importing Libraries

In [141]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

#### Reading csv

In [142]:
df=pd.read_csv("sd.csv")

In [143]:
df.head()

Unnamed: 0.1,Unnamed: 0,Fees,DegreeYears,Churn,Gender_Female,Gender_Male,Education_Good,Education_Poor,Infrastructure_Good,Infrastructure_Poor,PlacementSupport_Good,PlacementSupport_Poor,FacultySupport_Good,FacultySupport_Poor,Events_Good,Events_Poor,Sport Facility_Good,Sport Facility_Poor
0,0,52885,4,0,1,0,0,1,1,0,0,1,0,1,0,1,0,1
1,1,13000,4,0,0,1,1,0,0,1,1,0,0,1,0,1,0,1
2,2,52885,4,1,0,1,1,0,1,0,0,1,0,1,0,1,0,1
3,3,13000,4,0,0,1,1,0,0,1,1,0,1,0,0,1,0,1
4,4,52885,4,1,1,0,0,1,0,1,0,1,0,1,0,1,0,1


In [144]:
df=df.drop('Unnamed: 0',axis=1)

In [145]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,Fees,DegreeYears,Gender_Female,Gender_Male,Education_Good,Education_Poor,Infrastructure_Good,Infrastructure_Poor,PlacementSupport_Good,PlacementSupport_Poor,FacultySupport_Good,FacultySupport_Poor,Events_Good,Events_Poor,Sport Facility_Good,Sport Facility_Poor
0,52885,4,1,0,0,1,1,0,0,1,0,1,0,1,0,1
1,13000,4,0,1,1,0,0,1,1,0,0,1,0,1,0,1
2,52885,4,0,1,1,0,1,0,0,1,0,1,0,1,0,1
3,13000,4,0,1,1,0,0,1,1,0,1,0,0,1,0,1
4,52885,4,1,0,0,1,0,1,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,13000,4,0,1,1,0,0,1,1,0,1,0,1,0,1,0
7039,13000,4,1,0,0,1,1,0,1,0,0,1,1,0,1,0
7040,52885,4,1,0,1,0,0,1,0,1,0,1,0,1,0,1
7041,52885,4,0,1,0,1,0,1,0,1,0,1,1,0,0,1


In [146]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

##### Train Test Split

In [147]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

#### Decision Tree Classifier

In [148]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [149]:
model_dt.fit(x_train,y_train)

In [150]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [151]:
model_dt.score(x_test,y_test)

0.7473385379701917

In [152]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.80      0.88      0.83      1019
           1       0.56      0.41      0.47       390

    accuracy                           0.75      1409
   macro avg       0.68      0.64      0.65      1409
weighted avg       0.73      0.75      0.73      1409



In [153]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [154]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [155]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [156]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9861687413554634
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       429
           1       1.00      0.97      0.98       294

    accuracy                           0.99       723
   macro avg       0.99      0.98      0.99       723
weighted avg       0.99      0.99      0.99       723



In [157]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[429   0]
 [ 10 284]]


###### Now we can see quite better results, i.e. Accuracy: 99 %, and a very good recall, precision & f1 score for minority class.

###### Let's try with some other classifier.

#### Random Forest Classifier

In [158]:
from sklearn.ensemble import RandomForestClassifier

In [159]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [160]:
model_rf.fit(x_train,y_train)

In [161]:
y_pred=model_rf.predict(x_test)

In [162]:
model_rf.score(x_test,y_test)

0.7501774308019872

In [163]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.79      0.88      0.84      1019
           1       0.57      0.40      0.47       390

    accuracy                           0.75      1409
   macro avg       0.68      0.64      0.65      1409
weighted avg       0.73      0.75      0.74      1409



In [164]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [165]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [166]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [167]:
model_rf_smote.fit(xr_train1,yr_train1)

In [168]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [169]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [170]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9939831528279182
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       422
           1       1.00      0.99      0.99       409

    accuracy                           0.99       831
   macro avg       0.99      0.99      0.99       831
weighted avg       0.99      0.99      0.99       831



In [171]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[422   0]
 [  5 404]]


###### With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.

###### We can now further go ahead and create multiple classifiers to see how the model performance is, but that's not covered here, so you can do it by yourself :)

#### Performing PCA

In [172]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [173]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [174]:
model.fit(xr_train_pca,yr_train1)

In [175]:
yr_predict_pca = model.predict(xr_test_pca)

In [176]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [177]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.7809867629362214
              precision    recall  f1-score   support

           0       0.77      0.81      0.79       422
           1       0.79      0.75      0.77       409

    accuracy                           0.78       831
   macro avg       0.78      0.78      0.78       831
weighted avg       0.78      0.78      0.78       831



##### With PCA, we couldn't see any better results, hence let's finalise the model which was created by RF Classifier, and save the model so that we can use it in a later stage :)

#### Pickling the model

In [178]:
import pickle

In [179]:
filename = 'model.sav'

In [180]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [181]:
load_model = pickle.load(open(filename, 'rb'))

In [182]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [183]:
model_score_r1

0.9939831528279182

In [184]:
load_model.score(xr_test, yr_test)

0.9529737206085753

##### Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.