# Telco Customer Churn

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw Telco churn export and preview its first rows.
csv_path = '../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

In [None]:
# Count of distinct values per column.
df.nunique()

In [None]:
# Remove the customerID column from df in place.
df.drop(columns='customerID', inplace=True)

In [None]:
# Mean tenure for every (Churn, gender) combination.
df.groupby(['Churn','gender'])['tenure'].mean()

In [None]:
# Row counts per TechSupport category, split by Churn.
# The trailing semicolon suppresses the notebook's text echo of the Axes object.
sns.countplot(x='TechSupport', hue='Churn', data=df);

In [None]:
# Frequency of each TechSupport category.
df.TechSupport.value_counts()

In [None]:
# Count the TotalCharges entries that cannot be parsed as numbers.
# A direct astype('float64') raises ValueError on the blank ' ' entries and
# halts a top-to-bottom run; errors='coerce' maps unparseable entries to NaN
# so they can be counted without raising.
pd.to_numeric(df.TotalCharges, errors='coerce').isna().sum()

In [None]:
# Frequency of each distinct TotalCharges value.
df.TotalCharges.value_counts()

In [None]:
# Rows whose TotalCharges is a single blank character.
df[df.TotalCharges==' ']

In [None]:
# Replace blank TotalCharges entries with the same row's MonthlyCharges.
# A single .loc assignment writes into df itself; the chained form
# df[col][mask] = ... may write to a temporary copy and emits
# SettingWithCopyWarning.
blank_total = df.TotalCharges == ' '
# Write the replacement as text so the column stays homogeneous until it is
# cast to float; str() of a float round-trips exactly through float().
df.loc[blank_total, 'TotalCharges'] = df.loc[blank_total, 'MonthlyCharges'].astype(str)

In [None]:
# Convert TotalCharges to float64 and store it back on the frame.
df['TotalCharges'] = df['TotalCharges'].astype('float64')

In [None]:
# Re-inspect dtypes after the TotalCharges conversion.
df.info()

In [None]:
# Pairwise correlation of the numeric columns.
# The string-valued columns are excluded explicitly: DataFrame.corr() raises
# on them in pandas >= 2.0 instead of silently skipping them.
df.select_dtypes('number').corr().T

In [None]:
# TotalCharges against tenure, coloured by Churn.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=df, x='tenure', y='TotalCharges', hue='Churn', ax=ax);

In [None]:
# One histogram (with KDE) per numeric column, stacked vertically and split by Churn.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
plt.figure(figsize=[18, 25])
for position, column in enumerate(numeric_cols, start=1):
    plt.subplot(3, 1, position)
    sns.histplot(data=df, x=column, hue='Churn', bins=70, kde=True)
    plt.title(column)
    plt.xlabel(' ')  # the title already names the column

In [None]:
# Mean / min / max of every numeric column within each Churn group.
# Only numeric columns are aggregated, because 'mean' raises TypeError on the
# string columns in pandas >= 2.0. The statistics are passed as a list so the
# output column order is fixed (a set has no defined order).
df.select_dtypes('number').groupby(df['Churn']).agg(['mean', 'min', 'max'])

In [None]:
# Count plot for every non-continuous column, split by Churn, on a 4x4 grid.
df_n = df.drop(columns=['tenure', 'MonthlyCharges', 'TotalCharges'])
plt.figure(figsize=[25, 25])
# Every column except the last one is plotted.
for slot, column in enumerate(df_n.columns[:-1], start=1):
    plt.subplot(4, 4, slot)
    sns.countplot(data=df_n, x=column, hue='Churn', palette='BuPu')
    plt.title(column)
    plt.xlabel(' ')

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Integer-encode the listed columns with a LabelEncoder that is refit per column.
label_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
le = LabelEncoder()
for column in label_cols:
    df[column] = le.fit_transform(df[column])

# Expand the remaining categorical columns into one indicator column per category.
dummy_cols = [
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod',
]
df = pd.get_dummies(df, columns=dummy_cols, prefix_sep='_')

In [None]:
# Display the frame after encoding.
df

In [None]:
# Correlation matrix with rows ordered by their correlation with Churn (ascending).
df.corr().sort_values('Churn')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Separate the target from the features and hold out 25% of the rows for testing.
y = df['Churn']
X = df.drop(columns='Churn')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [None]:
# Classifier classes (not instances) to compare; each is instantiated with
# its default arguments when it is fitted.
models = [LogisticRegression,
          SVC,
          KNeighborsClassifier,
          DecisionTreeClassifier,
          MLPClassifier,
          GradientBoostingClassifier,
          RandomForestClassifier,
          XGBClassifier,
          LGBMClassifier,
          CatBoostClassifier]

In [None]:
# Fit each classifier with default settings and record its test-set accuracy.
Model, score = [], []

for estimator_cls in models:
    model = estimator_cls()
    model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    Model.append(estimator_cls.__name__)
    score.append(test_accuracy)

df_m = pd.DataFrame({'Model': Model, 'score': score})

In [None]:
# Models ordered from lowest to highest accuracy.
df_m.sort_values('score')

## Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every feature column.
# The scaler's mean and variance are learned from the training rows only, so
# statistics of the held-out rows do not leak into the features the models
# are evaluated on.
X_c = X.copy()  # unscaled copy; also supplies the column labels below
# This uses the same sample count, test_size and random_state as the split of
# X and y that follows, so it selects the positions of the rows that end up
# in X_train.
train_rows, _ = train_test_split(np.arange(len(X_c)), test_size=0.25, random_state=1)
ss = StandardScaler().fit(X_c.iloc[train_rows])
X = pd.DataFrame(ss.transform(X_c), columns=X_c.columns)

In [None]:
# Re-split the standardized features with the same test_size and random_state as before.
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.25, random_state = 1)

In [None]:
# Repeat the model comparison on the current train/test split.
Model = []
score = []

for classifier in models:
    model = classifier().fit(X_train, y_train)
    predictions = model.predict(X_test)
    Model.append(classifier.__name__)
    score.append(accuracy_score(y_test, predictions))

df_m = pd.DataFrame(list(zip(Model, score)), columns=['Model', 'score'])

In [None]:
# Models ordered from lowest to highest accuracy.
df_m.sort_values('score')

In [None]:
# CatBoostClassifier has best accuracy score
# Refit it on the current split and print its confusion matrix and per-class report.
model = CatBoostClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

## Feature Importances

In [None]:
# Feature importances of the fitted model, labelled with the feature names.
f_imp = pd.DataFrame({'Importances': model.feature_importances_}, index=X.columns)

In [None]:
# Display the importance table.
f_imp

In [None]:
# Bar chart of the importances, smallest to largest.
f_imp.sort_values('Importances').plot(kind='bar', figsize=[15,8]);

In [None]:
# Names of the six features with the highest importance.
a = f_imp.sort_values(by='Importances', ascending=False).head(6).index

In [None]:
# Rebuild the feature matrix from df using only the selected columns, then re-split.
y = df['Churn']
X = df.loc[:, a]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [None]:
# Display the reduced feature matrix.
X

In [None]:
# Refit CatBoost on the reduced feature set and print the same two reports.
model = CatBoostClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
reports = (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred))
for report in reports:
    print(report)

In [None]:
# CatBoostClassifier has an accuracy score of 81%