# Customer Churn Prediction
                                            - Kenny.D

1. Data Cleaning
2. EDA
3. Model Building

#### Data Cleaning
1. Handle Null Values
2. Remove Duplicates
3. Remove invalid (non-0/1) values from the SeniorCitizen column
4. Drop CustomerID column

#### Exploratory Data Analysis
1. Plot countplots for categorical variables vs Churn
2. Plot barplots for numerical variables vs Churn
3. Plot heatmap of correlation

#### Model Building
1. Pre-processing  
    a. Train Test Split  
    b. One hot encoding for Categorical Variables  
    c. Scaling of Numerical Variables 
      
2. Machine Learning  
    a. Logistic Regression  
    b. Decision Tree  
    c. K Nearest Neighbor  
    d. Random Forest Classifier  
      
3. Testing  
    a. Accuracy Score  
    b. Confusion Matrix  
    c. Classification Report  
      

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Location of the Telco customer-churn CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(DATA_PATH)

Checking the data

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Same preview, transposed so that each column name becomes a row (easier to read with many columns).
df.head().T

In [None]:
# Column dtypes and non-null counts for the raw data.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Rows that repeat an earlier row exactly (the first occurrence is not flagged).
dup_mask = df.duplicated()
duplicate = df[dup_mask]
duplicate

In [None]:
# Tally of duplicated (True) versus non-duplicated (False) rows.
df.duplicated().value_counts()

In [None]:
# Keep the first occurrence of every row and discard exact repeats; original row labels are retained.
df2 = df.drop_duplicates(keep="first", ignore_index=False)

In [None]:
# Look at the last rows of the de-duplicated data.
df2.tail()

In [None]:
# (rows, columns) after removing duplicates.
df2.shape

In [None]:
# Distinct values per column after removing duplicates.
df2.nunique()

We see that the SeniorCitizen column has 755 distinct values. This cannot be right, because it should be a binary 0/1 flag.  
Hence, we keep only the rows where the column value is 0 or 1.

In [None]:
# Keep only the rows whose SeniorCitizen value is one of the two valid flags, 0 or 1.
valid_senior = df2["SeniorCitizen"].isin([0, 1])
df3 = df2.loc[valid_senior]

In [None]:
# Distinct values per column after filtering SeniorCitizen to 0/1.
df3.nunique()

In [None]:
# Remove the customerID column from the working frame.
df3 = df3.drop(columns=["customerID"])

In [None]:
# Blank TotalCharges entries are treated as 0 before the column is converted to float.
# The pattern matches any empty or all-whitespace string, not only a single space, so
# values such as "" or "  " no longer make the float conversion fail. Any other
# non-numeric text still raises, so genuinely bad data is not hidden.
df3["TotalCharges"] = (
    df3["TotalCharges"].replace(r"^\s*$", 0, regex=True).astype("float64")
)

## Exploratory Data Analysis

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
df3.info()

We will plot each variable against churn.  

Count plot of churn, split by gender

In [None]:
# Row counts per Churn value, with one bar per gender.
sns.countplot(data=df3, x="Churn", hue="gender")

In [None]:
# Bar plot of TotalCharges for each Churn value, with every bar annotated by its height.
ax = sns.barplot(data=df3, x="Churn", y="TotalCharges")
ax.set_title('Total Amount based on Churn')

for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2  # horizontal middle of the bar
    ax.text(
        x=bar_centre,
        y=bar_height + 100,  # place the label 100 units above the top of the bar
        s=f"{bar_height:.0f}",  # whole-number label, decimals dropped
        ha="center",
    )

In [None]:
# Store SeniorCitizen as strings from here on.
df3["SeniorCitizen"] = df3["SeniorCitizen"].astype(str)

In [None]:
# Row counts per Churn value, with one bar per SeniorCitizen value.
sns.countplot(data=df3, x="Churn", hue="SeniorCitizen")

In [None]:
# Row counts per Churn value, with one bar per Partner value.
sns.countplot(data=df3, x="Churn", hue="Partner")

In [None]:
# Names of all object-dtype columns in the cleaned frame.
objectlst = df3.select_dtypes(include=["object"]).columns.tolist()
objectlst

In [None]:
# One count plot per object-dtype column: rows per Churn value, split by that column.
for col in objectlst:
    # Churn is itself in objectlst; plotting it against itself carries no information.
    if col == "Churn":
        continue
    plt.figure()
    sns.countplot(data=df3, x="Churn", hue=col)

In [None]:
# Re-check the dtypes after the SeniorCitizen conversion.
df3.info()

In [None]:
# Names of the float64 columns; reused below for the bar plots and for scaling.
# NOTE(review): a numeric column stored as an integer dtype (e.g. tenure, if it is int64 -
# TODO confirm) is not selected here and would therefore bypass the scaling step.
floatlst = df3.select_dtypes(include=["float64"]).columns.tolist()
floatlst

In [None]:
# One bar plot per float column against Churn, each bar labelled with its height.
for num_col in floatlst:
    plt.figure()
    ax = sns.barplot(data=df3, x="Churn", y=num_col)
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.text(
            x=bar.get_x() + bar.get_width() / 2,  # horizontal middle of the bar
            y=bar_height / 2,  # label sits halfway up the bar
            s=f"{bar_height:.0f}",
            ha="center",
        )
        

In [None]:
# Annotated correlation heatmap of the numeric columns.
# The frame still holds string columns, and DataFrame.corr() raises on those in
# pandas >= 2.0, so restrict to numeric dtypes first (older pandas dropped them silently).
sns.heatmap(df3.select_dtypes(include="number").corr(), annot=True)

## Machine Learning Modelling

In [None]:
# Scaler instance; it is fitted on the training split's float columns further down.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
# Short alias for scikit-learn's train_test_split.
from sklearn.model_selection import train_test_split
tts = train_test_split

In [None]:
# Target is the Churn column; every other column is a feature.
y = df3["Churn"]
X = df3.drop(columns=["Churn"])

In [None]:
# A fixed seed makes the split, and every metric computed from it, reproducible across runs.
# Stratifying on y keeps the churn / no-churn ratio the same in both splits.
# The test fraction is left at scikit-learn's default.
X_train, X_test, y_train, y_test = tts(X, y, random_state=42, stratify=y)

In [None]:
# Sanity check: shapes of the feature and target splits.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Learn the scaling parameters from the training split only, then apply them to both splits.
sc.fit(X_train[floatlst])
X_train_std = sc.transform(X_train[floatlst])
X_test_std = sc.transform(X_test[floatlst])

In [None]:
# Sanity check: shapes of the scaled arrays.
print(X_train_std.shape, X_test_std.shape)

In [None]:
# Wrap the scaled arrays in DataFrames carrying the float column names.
# No index is passed, so both frames get a fresh 0..n-1 row index.
X_train_df = pd.DataFrame(data=X_train_std, columns=floatlst)
X_test_df = pd.DataFrame(data=X_test_std, columns=floatlst)

In [None]:
# Sanity check: shapes of the scaled frames next to the full cleaned frame.
print(X_train_df.shape, X_test_df.shape, df3.shape)

One Hot Encoding for Categorical Variables

In [None]:
# Everything that was not scaled above: all feature columns except those in floatlst.
X_train_obj = X_train[[c for c in X_train.columns if c not in floatlst]]
X_test_obj = X_test[[c for c in X_test.columns if c not in floatlst]]

In [None]:
# Sanity check: shapes of the non-float feature frames.
print(X_train_obj.shape, X_test_obj.shape)

In [None]:
# One-hot encode the training features, dropping the first level of each category.
# reset_index(drop=True) discards the old row labels; without drop=True they would be
# inserted as an extra "index" column and end up as a model feature.
X_train_obj_dm = pd.get_dummies(X_train_obj, drop_first=True).reset_index(drop=True)

# Encode the test features with every level kept, then align to the training columns.
# This guarantees both frames have identical columns even when a category level is
# missing from (or only present in) the test split; absent dummies are filled with 0.
X_test_obj_dm = (
    pd.get_dummies(X_test_obj)
    .reset_index(drop=True)
    .reindex(columns=X_train_obj_dm.columns, fill_value=0)
)

In [None]:
# Sanity check: shapes of the one-hot encoded frames.
print(X_train_obj_dm.shape, X_test_obj_dm.shape)

In [None]:
# Put the encoded columns and the scaled float columns side by side.
# These assignments replace the earlier X_train / X_test frames.
X_train = pd.concat(objs=[X_train_obj_dm, X_train_df], axis="columns")
X_test = pd.concat(objs=[X_test_obj_dm, X_test_df], axis="columns")

In [None]:
# Sanity check: shapes of the final model inputs.
print(X_train.shape, X_test.shape)

### Logistic Regression

In [None]:
# Logistic regression classifier with the iteration cap set to 5000.
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(max_iter = 5000)

In [None]:
# Train on the training split, then predict labels for the test split (fit returns the model itself).
log_preds = log_model.fit(X_train, y_train).predict(X_test)

### Checking the Accuracy

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Fraction of test rows predicted correctly, to three decimals.
# Arguments follow scikit-learn's (y_true, y_pred) order, and built-in round() is used
# because it works whether the score is a numpy scalar or a plain Python float
# (a plain float has no .round method).
round(accuracy_score(y_test, log_preds), 3)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the actual classes, columns the predicted ones.
# Passing the predictions first would transpose the matrix.
confusion_matrix(y_test, log_preds)

In [None]:
# scikit-learn expects (y_true, y_pred); with the arguments swapped, precision and recall
# would be reported under each other's names.
print(classification_report(y_test, log_preds))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree using the entropy split criterion: train, then predict labels for the test split.
dtree = DecisionTreeClassifier(criterion="entropy")
dtree_preds = dtree.fit(X_train, y_train).predict(X_test)

In [None]:
# Fraction of test rows predicted correctly, to three decimals.
# Arguments follow scikit-learn's (y_true, y_pred) order, and built-in round() is used
# because it works whether the score is a numpy scalar or a plain Python float
# (a plain float has no .round method).
round(accuracy_score(y_test, dtree_preds), 3)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the actual classes, columns the predicted ones.
# Passing the predictions first would transpose the matrix.
confusion_matrix(y_test, dtree_preds)

In [None]:
# scikit-learn expects (y_true, y_pred); with the arguments swapped, precision and recall
# would be reported under each other's names.
print(classification_report(y_test, dtree_preds))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings: train, then predict labels for the test split.
rfc = RandomForestClassifier()
rfc_pred = rfc.fit(X_train, y_train).predict(X_test)

In [None]:
# Fraction of test rows predicted correctly, to three decimals.
# Arguments follow scikit-learn's (y_true, y_pred) order, and built-in round() is used
# because it works whether the score is a numpy scalar or a plain Python float
# (a plain float has no .round method).
round(accuracy_score(y_test, rfc_pred), 3)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the actual classes, columns the predicted ones.
# Passing the predictions first would transpose the matrix.
confusion_matrix(y_test, rfc_pred)

In [None]:
# scikit-learn expects (y_true, y_pred); with the arguments swapped, precision and recall
# would be reported under each other's names.
print(classification_report(y_test, rfc_pred))

### K Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours with default settings: train, then predict labels for the test split.
knn = KNeighborsClassifier()
knn_pred = knn.fit(X_train, y_train).predict(X_test)

In [None]:
# Fraction of test rows predicted correctly, to three decimals.
# Arguments follow scikit-learn's (y_true, y_pred) order, and built-in round() is used
# because it works whether the score is a numpy scalar or a plain Python float
# (a plain float has no .round method).
round(accuracy_score(y_test, knn_pred), 3)

In [None]:
# scikit-learn expects (y_true, y_pred): rows are the actual classes, columns the predicted ones.
# Passing the predictions first would transpose the matrix.
confusion_matrix(y_test, knn_pred)

In [None]:
# scikit-learn expects (y_true, y_pred); with the arguments swapped, precision and recall
# would be reported under each other's names.
print(classification_report(y_test, knn_pred))