In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [None]:
#Load the Telco customer churn CSV (path is relative to the notebook) into a DataFrame.
data = pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [None]:
#Peek at 5 randomly sampled rows (no random_state is passed, so the rows differ between runs).
data.sample(5)

In [None]:
#customerID is not required for our prediction, so let's drop it.
#Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe to re-run:
#a second run no longer raises KeyError because the column is already gone.
data = data.drop(columns=['customerID'],errors='ignore')

In [None]:
#Inspect the dtype pandas inferred for each column.
data.dtypes

In [None]:
#As we can see, TotalCharges is a string while MonthlyCharges is a number, so let's
#get our dataset into the correct dtype.
# pd.to_numeric(data.TotalCharges)
# This would currently raise an error, because some values contain only a space,
#so let's sort that out first.

In [None]:
#Boolean mask: True where TotalCharges cannot be parsed as a number (errors='coerce' turns it into NaN).
pd.to_numeric(data.TotalCharges,errors='coerce').isnull()

In [None]:
#Show the rows whose TotalCharges could not be parsed as a number.
data[pd.to_numeric(data.TotalCharges,errors='coerce').isnull()]

In [None]:
#Drop the rows whose TotalCharges is a blank string.
#.copy() gives new_df its own data, so the column edits in later cells do not
#operate on a slice of `data` (which triggers SettingWithCopyWarning).
new_df = data[data.TotalCharges!=' '].copy()
new_df.shape #total 11 rows are deleted.

In [None]:
#Now the conversion works: every remaining TotalCharges value is numeric text.
#assign() returns a new frame, which avoids attribute-style column assignment
#on a filtered slice (and the SettingWithCopyWarning that comes with it).
new_df = new_df.assign(TotalCharges=pd.to_numeric(new_df.TotalCharges))

In [None]:
 #Check the column dtypes again after the TotalCharges conversion.
 new_df.dtypes

In [None]:
#Quick visualization prep: tenure of the customers who stayed vs. the ones who churned.
tenure_churn_no = new_df.loc[new_df['Churn']=='No','tenure']
tenure_churn_yes = new_df.loc[new_df['Churn']=='Yes','tenure']

In [None]:
#Histogram of tenure split by churn outcome.
#The x axis holds the tenure values and the y axis the customer counts;
#the descriptive text belongs in the title, matching the monthly-charges plot.
plt.xlabel('tenure')
plt.ylabel('Number of Customers')
plt.title('Customer Churn Prediction Visualization')

plt.hist([tenure_churn_yes,tenure_churn_no],color=['green','red'],label=['Churn=Yes','Churn=No'])
plt.legend()

In [None]:
#Let's look at the monthly charges, again split by churn outcome.
mc_churn_no = new_df[new_df.Churn=='No'].MonthlyCharges
mc_churn_yes = new_df[new_df.Churn=='Yes'].MonthlyCharges

plt.xlabel('Monthly Charges')
plt.ylabel('Number of Customers')
plt.title('Customer Churn Prediction Visualization')

#rwidth leaves a small gap between neighbouring bars.
plt.hist([mc_churn_yes,mc_churn_no],rwidth=0.95,color=['green','red'],label=['Churn=Yes','Churn=No'])
plt.legend()

In [None]:
def print_unique_col_value(data):
    """Print the distinct values of every text-like column of ``data``.

    A column is reported when its dtype is ``object`` or a pandas string
    dtype, so text columns are listed regardless of which of the two
    representations pandas used to store them. Other columns (numeric,
    categorical, ...) are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    None
        One line per reported column is printed as ``<column>:<unique values>``.
    """
    for column in data:
        dtype = data[column].dtype
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            print(f'{column}:{data[column].unique()}')
    

In [None]:
print_unique_col_value(new_df) #so these are our categorical columns

In [None]:
#Let's clean our dataset: both "no service" variants mean the same as a plain "No".
#replace() returns a new frame; reassigning avoids in-place mutation of a frame
#that may still be a slice of `data`.
new_df = new_df.replace({'No internet service':'No','No phone service':'No'})

In [None]:
#Re-check the unique values of the text columns after the replacements above.
print_unique_col_value(new_df)

In [None]:
#Now let's replace Yes and No with 1 and 0.
yes_no_columns = ['Partner','Dependents','PhoneService','MultipleLines',
                 'OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport',
                 'StreamingTV','StreamingMovies','PaperlessBilling','Churn'] #all columns with yes and no

#`new_df[col].replace(..., inplace=True)` is chained assignment: it modifies a
#temporary Series, is deprecated in pandas 2.x and has no effect once
#copy-on-write is active. Build the encoded columns and assign them back instead.
#astype('int64') makes the result numeric on every pandas version and fails
#loudly if a column still holds a value other than Yes/No.
new_df = new_df.assign(**{
    col: new_df[col].replace({'Yes':1,'No':0}).astype('int64')
    for col in yes_no_columns
})

In [None]:
#Show the distinct values of every column to verify the encoding.
for col in new_df.columns:
    print(col,new_df[col].unique(),sep=':')

In [None]:
#Encode gender as Female=1 / Male=0.
#Assign the result back instead of calling replace(inplace=True) on the selected
#column: the chained in-place form is deprecated and does nothing under copy-on-write.
new_df = new_df.assign(gender=new_df['gender'].replace({'Female':1,'Male':0}).astype('int64'))

In [None]:
#Check the distinct values of gender after the encoding step.
new_df['gender'].unique()

In [None]:
 #One-hot encode the remaining multi-valued categorical columns.
 #dtype=int keeps the dummy columns numeric (0/1): on pandas >= 2.0 the default is
 #bool, and a frame mixing bool with int/float columns converts to an object array.
 new_df1 = pd.get_dummies(data=new_df,columns=['InternetService','Contract','PaymentMethod'],dtype=int)

In [None]:
#List the columns of the one-hot encoded frame.
new_df1.columns

In [None]:
new_df1.sample(4) #Now our data looks quite good.

In [None]:
#Now let's look at the datatypes... as we can see, all columns are numbers, which is great.
new_df1.dtypes

In [None]:
#Now let's scale our data.
#tenure, MonthlyCharges and TotalCharges are the only columns that are not
#already in the range 0-1, so they are min-max normalized.
#NOTE(review): the scaler is fitted on the whole frame, i.e. before the
#train/test split done in a later cell - confirm this is acceptable.
from sklearn.preprocessing import MinMaxScaler

cols_to_scale = ['tenure','MonthlyCharges','TotalCharges']
scaler = MinMaxScaler()
scaler.fit(new_df1[cols_to_scale])
new_df1[cols_to_scale] = scaler.transform(new_df1[cols_to_scale])

In [None]:
#The dataframe is now scaled and ready to be used for prediction.
#Preprocessing is done; list the distinct values of each column as a final check.
for col in new_df1.columns:
    print(col,new_df1[col].unique(),sep=':')

In [None]:
def neural_net(x_train,y_train,x_test,y_test,epochs=50,batch_size=8,threshold=0.5):
    """Train a small dense binary classifier and report its test performance.

    The network is three ReLU layers (20, 10 and 5 units), each followed by
    30% dropout, and a single sigmoid output unit. It is compiled with the
    Adam optimizer and binary cross-entropy loss.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features (2-D, one row per sample) and binary labels.
    x_test, y_test : array-like
        Held-out features and labels used for evaluation.
    epochs : int, default 50
        Number of training epochs.
    batch_size : int, default 8
        Mini-batch size used during training.
    threshold : float, default 0.5
        Predicted probabilities strictly above this value are labelled 1.

    Returns
    -------
    list of int
        Predicted class (0 or 1) for every row of ``x_test``.

    Side effects: prints the Keras training/evaluation logs and the
    scikit-learn classification report.
    """
    from tensorflow import keras
    from sklearn.metrics import classification_report

    # Size the input layer from the data instead of hard-coding 26 features,
    # so the function keeps working if the feature set changes.
    n_features = np.shape(x_train)[1]

    model = keras.Sequential(
        [
            keras.layers.Dense(20,input_shape=(n_features,),activation='relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(10,activation='relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(5,activation='relu'),
            keras.layers.Dropout(0.3),
            keras.layers.Dense(1,activation='sigmoid'),
        ]
    )

    model.compile(optimizer ='adam',
                 loss='binary_crossentropy',
                 metrics=['accuracy'])

    model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size)

    model.evaluate(x_test,y_test)

    # predict() returns one probability per row with shape (n, 1); flatten it
    # and threshold in a single vectorized step.
    y_pred = model.predict(x_test)
    y_pred_actual = (np.asarray(y_pred).ravel() > threshold).astype(int).tolist()

    print("Classification Reports is:\n",classification_report(y_test,y_pred_actual))

    return y_pred_actual

In [None]:
from sklearn.model_selection import train_test_split

#Separate the target column from the features.
y = new_df1['Churn']
X = new_df1.drop(columns=['Churn'])

#Hold out 20% of the rows for testing, keeping the class ratio the same in both parts.
x_train,x_test,y_train,y_test = train_test_split(
    X,y,
    test_size=0.2,
    random_state=15,
    stratify=y,
)

In [None]:
#Class counts in the training split.
y_train.value_counts()

In [None]:
#Class counts in the test split.
y_test.value_counts()

## As we can see, our dataset is imbalanced. Now let's see the difference between training on the unbalanced dataset and training on a balanced dataset

## Training on the Unbalanced Dataset

In [None]:
#Train and evaluate on the original (unbalanced) split; the predicted labels are kept for the confusion matrix.
y_preds = neural_net(x_train,y_train,x_test,y_test)

### Let's look at the confusion matrix

In [None]:

import seaborn as sns
import tensorflow as tf

#Rows are the true labels, columns the predicted labels.
cm = tf.math.confusion_matrix(labels=y_test,predictions=y_preds)

plt.figure(figsize=(9,7))
sns.heatmap(cm,annot=True,fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
#A title lets this figure be told apart from the second confusion matrix below.
plt.title('Confusion matrix - model trained on the unbalanced dataset')

## As we can see, the f1-score for class 1 is .62. Let's balance our dataset and see the difference

## We will use SMOTE (oversampling by producing synthetic samples)
**One can refer other methods of balancing :- https://towardsdatascience.com/having-an-imbalanced-dataset-here-is-how-you-can-solve-it-1640568947eb**

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy ='minority')
x_sm, y_sm = smote.fit_resample(X,y)

In [None]:
y_sm.value_counts() #Now our dataset is balanced.

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x_sm,y_sm,test_size=0.2,random_state=15,stratify=y_sm)

In [None]:
y_train.value_counts() #balanced train dataset

In [None]:
y_test.value_counts() #balanced test dataset

In [None]:
#Train and evaluate again, this time on the split prepared after SMOTE oversampling.
y_preds = neural_net(x_train,y_train,x_test,y_test)

### Let's see the confusion matrix

In [None]:
import seaborn as sns
import tensorflow as tf

#Rows are the true labels, columns the predicted labels.
cm = tf.math.confusion_matrix(labels=y_test,predictions=y_preds)

plt.figure(figsize=(9,7))
sns.heatmap(cm,annot=True,fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
#A title lets this figure be told apart from the first confusion matrix above.
plt.title('Confusion matrix - model trained after SMOTE oversampling')

## Conclusion :-
**Balancing the dataset not only helps us improve the f1-score of the classes in a classification problem but also helps improve accuracy. So tackling an unbalanced dataset is a must.<br>**
<h6>*Note: the neural network is trained for 50 epochs; more epochs may give a better result.*</h6>