### Importing Required Libraries

In [2]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

#### Reading Data Which is in csv format

In [4]:
df=pd.read_csv("/content/sample_data/tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [5]:
df=df.drop('Unnamed: 0',axis=1)      #Dropping the unwanted column --> 'Unnamed'

In [6]:
x=df.drop('Churn',axis=1)     #All input features  
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [7]:
y=df['Churn']      #The output label or the target column 
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

##### Splitting the Data set using Train Test Split

In [8]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

#### Algorithm -->> Decision Tree Classifier

In [9]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)
# Creating the object for the classifier 

In [10]:
model_dt.fit(x_train,y_train)           #Training the model using training data 

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [11]:
y_pred=model_dt.predict(x_test)     #Doing predictions for the test data 
y_pred

array([1, 0, 0, ..., 0, 1, 0])

In [12]:
model_dt.score(x_test,y_test)        #Checking the model accuracy for the test data 

0.7860696517412935

In [None]:
# As we can see the accuacy of this DecisionTreeClassifier is not that impressive, so we will use some other algorithms also like 

In [13]:
print(classification_report(y_test, y_pred, labels=[0,1]))
# Using 'classification_report' function for getting the report containing things like --> precision, recall, f1-score, accuracy etc 

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1026
           1       0.64      0.47      0.54       381

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.77      0.79      0.77      1407



###### As we can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

###### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

###### Hence, we will be using SMOTEENN (UpSampling + ENN) for balancing the data 

In [14]:
sm = SMOTEENN()                                   #creating the object for the SMOTEENN
X_resampled, y_resampled = sm.fit_resample(x,y)    #Resampling the input features and the output labels\target data 

In [15]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)
#Dividing the data again into train and test data 

In [16]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)
# Creating the object for the Classifier 

In [17]:
model_dt_smote.fit(xr_train,yr_train)                                  #Training again
yr_predict = model_dt_smote.predict(xr_test)                            #Predicting the values for the xr_test data 
model_score_r = model_dt_smote.score(xr_test, yr_test)                   #Getting the accuracy 
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))                #For precision, recall, f1-score etc.

0.9334470989761092
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       527
           1       0.95      0.92      0.94       645

    accuracy                           0.93      1172
   macro avg       0.93      0.93      0.93      1172
weighted avg       0.93      0.93      0.93      1172



In [19]:
print(metrics.confusion_matrix(yr_test, yr_predict))   #Confusion matrix 

[[498  29]
 [ 49 596]]


###### Now we can see quite better results compair to previous results, i.e. Accuracy: 93 %, and a very good recall, precision & f1 score for minority class.

###### Let's try with some other classifier.

#### Another algorithm--> Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier    #importing the RandomForestClassifier

In [22]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)
# Creating the objet for the classifier 

In [23]:
model_rf.fit(x_train,y_train)          #Training the model 

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [24]:
y_pred=model_rf.predict(x_test)       #Doing predictions  

In [25]:
model_rf.score(x_test,y_test)         #Getting the accuracy 

0.7974413646055437

In [26]:
print(classification_report(y_test, y_pred, labels=[0,1]))
# Getting the values of various parameters for this 'RandomForestClassifier' algorithm

              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1026
           1       0.68      0.48      0.56       381

    accuracy                           0.80      1407
   macro avg       0.75      0.70      0.71      1407
weighted avg       0.79      0.80      0.78      1407



In [None]:
# The performance of this classifier (RandomForestClassifier) on the original unbalanced data is little better as compair to the performance of previous classifier (DecisionTreeClassifier)

In [29]:
# Again balancig the data using  SMOTEENN for this classifier for getting better score 

sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [30]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)
# Dividing the data into train and test 

In [31]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [32]:
model_rf_smote.fit(xr_train1,yr_train1)      #Training 

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [33]:
yr_predict1 = model_rf_smote.predict(xr_test1)    #Doing prediction for xr_test1 data

In [34]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)           #Getting the accuracy after balancing the data 

In [35]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))
# Getting the imp. parameters values for (RandomForestClassifier) after balancing the data 

0.9418702611625948
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       533
           1       0.94      0.96      0.95       654

    accuracy                           0.94      1187
   macro avg       0.94      0.94      0.94      1187
weighted avg       0.94      0.94      0.94      1187



In [None]:
# We can see that the accuracy is about 94% which is better than previous model tested on same data after balancing using SMOTEENN

In [36]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))        #Confusion matrix 

[[491  42]
 [ 27 627]]


###### With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.


#### Pickling the model
We will use pickle to save our final model so that we don't need to train our model again and again and model can be used any time and by anybody for doing prediction after loading the model 

In [44]:
import pickle   #importing the module

In [45]:
filename = 'model.sav'       # creating file for dumping the model 

In [46]:
pickle.dump(model_rf_smote, open(filename, 'wb'))   #dumping 

In [47]:
load_model = pickle.load(open(filename, 'rb')) #loading model for doing prediction 

In [48]:
model_score_r1 = load_model.score(xr_test1, yr_test1)
# getting the accuracy after loding the model 

In [49]:
model_score_r1

0.9418702611625948

##### Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.