In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))




In [None]:
# Load the Telco customer-churn CSV from the Kaggle input directory.
churn_csv_path = "/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
churn_df = pd.read_csv(churn_csv_path)

In [None]:
# Preview the first rows of the loaded frame.
churn_df.head()

In [None]:
# Column dtypes and non-null counts for the loaded frame.
churn_df.info()

`TotalCharges` was read with dtype `object`, so we have to convert it to a numeric (float) column.

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN.
churn_df["TotalCharges"] = churn_df["TotalCharges"].pipe(pd.to_numeric, errors="coerce")
churn_df["TotalCharges"].dtype

In [None]:
# Number of missing values per column.
churn_df.isna().sum()

There are 11 missing values in TotalCharges. Because they make up a very small fraction of the rows, we can simply remove them.

In [None]:
# Drop every row that contains a missing value and report the shape change.
shape_before = churn_df.shape
churn_df.dropna(inplace=True)
print(f"Before dropna {shape_before}")
print(f"After dropna {churn_df.shape}")

We won't use customerID  for prediction so we can drop this column

In [None]:
# customerID is not used for prediction, so remove it in place.
churn_df.drop(columns=["customerID"], inplace=True)

Split the columns into 3 main groups. The features within each group are related to each other, so I'll examine them together during EDA.

In [None]:
# Column groups used for EDA. Each list ends with the "Churn" target so the
# matching sub-frame can be plotted with Churn as the hue.
services_col = [
    "PhoneService", "MultipleLines", "InternetService",
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies",
    "Churn",
]

account_col = [
    "tenure", "Contract", "PaperlessBilling", "PaymentMethod",
    "MonthlyCharges", "TotalCharges",
    "Churn",
]

demographic_col = [
    "gender", "SeniorCitizen", "Partner", "Dependents",
    "Churn",
]

In [None]:
# One sub-frame per feature group, selected from the cleaned data.
services_df, account_df, demographic_df = (
    churn_df[group_cols]
    for group_cols in (services_col, account_col, demographic_col)
)

I'll use the countplot helper function below for each categorical feature.

In [None]:
def countplot_ratio(x, data, hue=None, ax=None):
    """Draw a count plot of column ``x`` and label each bar with its share of all rows.

    Parameters
    ----------
    x : str
        Name of the categorical column to count; also used in the plot title.
    data : sized container accepted by ``sns.countplot``
        Data to plot. ``len(data)`` is the denominator for every percentage,
        so with ``hue`` set the labels are shares of ALL rows, not shares
        within each category.
    hue : str, optional
        Column used to split each bar into groups.
    ax : matplotlib Axes, optional
        Axes to draw on; forwarded to seaborn as-is.

    Returns
    -------
    None
    """
    # Pass x by keyword: seaborn releases that take `data` as the first
    # positional parameter reject a positional x combined with data=.
    ax = sns.countplot(x=x, data=data, hue=hue, ax=ax)
    ax.tick_params(axis="x", labelrotation=10)
    ax.set_title(x + " Distributions")
    if hue is not None:
        # Without a hue there are no labelled artists, so a legend is pointless.
        ax.legend(bbox_to_anchor=(1.01, 1))
    total = float(len(data))
    for p in ax.patches:
        height = p.get_height()
        # Skip bars with NaN or zero height; they would otherwise be
        # annotated with "nan%" or "0.00%".
        if not height > 0:
            continue
        ax.text(p.get_x() + p.get_width() / 2., height + 3,
                '{:.2f}%'.format((height / total) * 100),
                fontsize=12, weight='bold', ha="center")


# Exploratory Data Analysis

In [None]:
# Overall distribution of the target.
countplot_ratio(x="Churn", data=churn_df)

The No-Churn rate is 73.42%, while the Churn rate is 26.58%. There seems to be a slight imbalance in the data.

# Services Features

In [None]:
# 3x3 grid filled column by column: feature i lands in row i % 3, column i // 3.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(32, 22))
fig.suptitle('Services Features Distributions by % Churn', fontsize=30)
for i, col in enumerate(services_col[:-1]):  # the last entry is the Churn target
    grid_col, grid_row = divmod(i, 3)
    countplot_ratio(col, services_df, hue="Churn", ax=axes[grid_row, grid_col])

*Noteworthy Things*


* The churn rate for fiber optic is significantly higher than for the other internet services.
* The churn rate is high when support services such as OnlineSecurity and TechSupport aren't used.
* For some services, such as MultipleLines, StreamingTV, and StreamingMovies, using or not using the service appears to have almost the same effect on Churn.

# Feature Engineering for Services Features

In [None]:
# Number of internet add-on services each customer has marked "Yes".
internet_addon_cols = ["OnlineSecurity", "OnlineBackup", "DeviceProtection",
                       "TechSupport", "StreamingTV", "StreamingMovies"]
churn_df["SumofIntServices"] = (churn_df[internet_addon_cols] == 'Yes').sum(axis=1)

# 1 when the customer's internet service is fiber optic, else 0.
churn_df.loc[:, 'fiber'] = np.where(churn_df['InternetService'] == 'Fiber optic', 1, 0)

# 1 when at least one support-type add-on is marked "Yes", else 0.
# The test is == 'Yes' rather than != 'No' so that any third label in these
# columns (presumably a "no internet service" value; verify against the data)
# is not counted as having a support service.
support_cols = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport"]
churn_df.loc[:, 'AnySup'] = np.where((churn_df[support_cols] == 'Yes').any(axis=1), 1, 0)


# Demographic Features

In [None]:
# 2x2 grid filled column by column: feature i lands in row i % 2, column i // 2.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.suptitle('Demographic Features Distributions by % Churn', fontsize=16)
for i, col in enumerate(demographic_col[:-1]):  # the last entry is the Churn target
    grid_col, grid_row = divmod(i, 2)
    countplot_ratio(col, demographic_df, hue="Churn", ax=axes[grid_row, grid_col])

*Noteworthy Things*

* The churn rate is about equal for males and females.
* Non-senior customers make up the majority of the customers.
* The churn rate is higher among customers with no dependents and no partner.

# Feature Engineering for Demographic Features

In [None]:
# 1 when the customer has a partner or dependents (either column is not 'No'), else 0.
has_partner = churn_df['Partner'] != 'No'
has_dependents = churn_df['Dependents'] != 'No'
churn_df.loc[:, 'AnyDeporPart'] = np.where(has_partner | has_dependents, 1, 0)

# Customer Account Information Features

In [None]:
# Names of the object-dtype (categorical) columns in the account sub-frame.
cat_col = account_df.select_dtypes(include="object").columns.tolist()
cat_col

In [None]:
# One panel per categorical account feature; the last entry of cat_col is skipped.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
fig.suptitle('Account Categorical Features  Distributions by % Churn', fontsize=20)
for idx, feature in enumerate(cat_col[:-1]):
    countplot_ratio(feature, account_df, hue="Churn", ax=axes[idx])

*Noteworthy Things*

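* Customers with a Month-to-month contract who churned make up 23.54% of all customers (the plotted percentages are shares of all rows, not churn rates within each category).
* PaperlessBilling appears to have an effect on Churn when it is Yes.
* Customers who pay by Electronic check and churned make up 15.23% of all customers.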

# Feature Engineering for Customer Account Features -1

In [None]:
# Binary flags for the contract type and payment method singled out in the EDA.
is_month_to_month = churn_df['Contract'] == 'Month-to-month'
pays_by_electronic_check = churn_df['PaymentMethod'] == 'Electronic check'
churn_df.loc[:, 'MonthtoMonth'] = np.where(is_month_to_month, 1, 0)
churn_df.loc[:, 'ElectronicCheck'] = np.where(pays_by_electronic_check, 1, 0)

In [None]:
# Summary statistics for the numeric columns, including the new flag features.
churn_df.describe()

I'll use the kdeplot helper function below for each numerical feature.

In [None]:
def kdeplot_churn(col, ax=None):
    """Overlay the density of ``col`` for churned and non-churned customers.

    Reads the notebook-level ``churn_df`` and draws two KDE curves: red for
    rows where Churn == 'Yes' and blue for rows where Churn == 'No'. The axes
    returned by the first seaborn call is reused for the second curve, so
    both end up on the same plot even when ``ax`` is None.

    Parameters
    ----------
    col : str
        Name of the numeric column to plot; also used for the x label and title.
    ax : matplotlib Axes, optional
        Axes to draw on; forwarded to seaborn as-is.
    """
    churn_flag = churn_df["Churn"]
    # Draw order matters: the legend labels below are given in the same order.
    for flag_value, curve_color in (('Yes', "Red"), ('No', "Blue")):
        ax = sns.kdeplot(churn_df.loc[churn_flag == flag_value, col], color=curve_color, ax=ax)
    ax.legend(["Churn", "Not Churn"], loc='upper right')
    ax.set_title("Distribution of " + col + " by churn")
    ax.set_xlabel(col)
    ax.set_ylabel('Density')

In [None]:
# One KDE panel per numeric account feature.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
for panel, numeric_col in zip(axes, ("tenure", "MonthlyCharges", "TotalCharges")):
    kdeplot_churn(numeric_col, ax=panel)

In [None]:
# One box plot per numeric account feature, split by Churn.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
for panel, numeric_col in zip(axes, ("tenure", "MonthlyCharges", "TotalCharges")):
    ax = sns.boxplot(x="Churn", y=numeric_col, data=churn_df, ax=panel)

*Noteworthy Things*

* Customers with a tenure of less than about 10 months are more likely to churn.
* Customers whose monthly charges are greater than about 60 units tend to churn a lot more.

# Feature Engineering for Customer Account Features -2

In [None]:
# Bin tenure and TotalCharges into quartiles; each result goes in a "<column>_cat" column.
for numeric_col in ("tenure", "TotalCharges"):
    churn_df[numeric_col + "_cat"] = pd.qcut(churn_df[numeric_col], q=4)

In [None]:
# Churn split within each quartile bin.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
for panel, binned_col in zip(axes, ("tenure_cat", "TotalCharges_cat")):
    countplot_ratio(binned_col, churn_df, hue="Churn", ax=panel)

# Preparations for Modeling

In [None]:
# Re-check dtypes and non-null counts after feature engineering.
churn_df.info()

In [None]:
# Encode the target as integers: Yes -> 1, No -> 0.
# The mapped Series is assigned back to the frame instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place
# call works on an intermediate object and does not reliably update churn_df
# (it is a no-op under pandas Copy-on-Write).
# The identity entries keep the cell re-runnable once the column holds 0/1;
# any other label maps to NaN and makes astype(int) fail loudly.
churn_df['Churn'] = churn_df['Churn'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0}).astype(int)

In [None]:
# Work on an independent copy so churn_df keeps every column.
churn_df2 = churn_df.copy(deep=True)
churn_df2.columns

In [None]:
# Columns removed from the modelling copy before one-hot encoding.
redundant_cols = [
    'Partner', 'Dependents', 'tenure', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'Contract', 'PaymentMethod',
    'TotalCharges',
]
churn_df2.drop(columns=redundant_cols, inplace=True)

In [None]:
# One-hot encode the remaining categorical columns.
churn_dummies = pd.get_dummies(data=churn_df2)
churn_dummies.head()

# Correlation with Heatmap

In [None]:
# Single-column heatmap of each feature's correlation with Churn, sorted by value.
fig, ax = plt.subplots(figsize=(10, 10))
fig.suptitle('Correlation between Churn and features', fontsize=20)
target_corr = churn_dummies.corr()[["Churn"]].sort_values("Churn")
ax = sns.heatmap(target_corr, vmax=1, vmin=-1, cmap="YlGnBu", annot=True, ax=ax)
ax.invert_yaxis()

Drop the features whose correlation with Churn lies between -0.05 and 0.05 (exclusive).

In [None]:
# Remove every feature whose correlation with Churn is strictly inside (-0.05, 0.05).
churn_corr = churn_dummies.corr()[["Churn"]].sort_values("Churn")
weak_mask = churn_corr["Churn"].abs() < 0.05
corr_drop = churn_corr[weak_mask].index.tolist()
churn_dummies.drop(columns=corr_drop, inplace=True)

In [None]:
# Columns that remain after the correlation filter.
churn_dummies.info()

# Models for Prediction

In [None]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
import warnings
warnings.filterwarnings("ignore")
from xgboost import  XGBClassifier
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix (everything except the target) and target vector.
X = churn_dummies.drop(columns=["Churn"])
y = churn_dummies.loc[:, "Churn"]

In [None]:
# Hold out 33% of the rows for testing, stratified on the target with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Report the class balance of each split as a percentage of that split.
# A single loop replaces four hand-copied prints, one of which had its
# message garbled ("% of Test the dataset").
for position, (split_name, labels) in enumerate((("Train", y_train), ("Test", y_test))):
    if position:
        print("-------------------------------------------------------------------------------------------")
    class_share = labels.value_counts(normalize=True) * 100
    print("No Churn", round(class_share.loc[0], 2), "% of the", split_name, "dataset")
    print("Churn", round(class_share.loc[1], 2), "% of the", split_name, "dataset")

In [None]:
# Candidate models. A seed is fixed on every estimator that is given one here
# so the comparison is repeatable between runs.
dct = DecisionTreeClassifier(max_depth=10, random_state=42)

sgd = SGDClassifier(random_state=42)

log = LogisticRegression(random_state=42)

svm_rbf = SVC(kernel="rbf", random_state=42)

svm_lin = LinearSVC(loss="hinge", random_state=42)

voting = VotingClassifier(estimators=[
    ("dct", dct), ("sgd", sgd), ("svm_rbf", svm_rbf), ("smv_lin", svm_lin), ("log", log)
], voting="hard", n_jobs=-1)

# The wrapped estimator is passed positionally: scikit-learn renamed the
# keyword from `base_estimator` to `estimator`, and the positional form is
# accepted under either name.
bag1 = BaggingClassifier(dct, n_estimators=50, max_samples=0.6, n_jobs=-1, random_state=42)

bag2 = BaggingClassifier(log, n_estimators=50, max_samples=0.6, n_jobs=-1, random_state=42)

rnd_frst = RandomForestClassifier(n_estimators=50, max_depth=10, max_features=0.7, random_state=42, n_jobs=-1)

extra_tree = ExtraTreesClassifier(n_estimators=50, max_depth=10, max_features=0.7, random_state=42, n_jobs=-1)

# NOTE(review): recent scikit-learn releases deprecate algorithm="SAMME.R";
# adjust or drop this variant if the installed version rejects it.
ada_boost1 = AdaBoostClassifier(dct, n_estimators=50, algorithm="SAMME.R", random_state=42)

ada_boost2 = AdaBoostClassifier(dct, n_estimators=50, algorithm="SAMME", random_state=42)

ada_boost3 = AdaBoostClassifier(sgd, n_estimators=50, algorithm="SAMME", random_state=42)

grad_boost = GradientBoostingClassifier(max_depth=10, random_state=42, n_iter_no_change=5)

xgboost = XGBClassifier(max_depth=10, random_state=42, n_jobs=-1)

classifiers = [dct, sgd, log, svm_rbf, svm_lin, voting, bag1, bag2,
               rnd_frst, extra_tree, ada_boost1, ada_boost2, ada_boost3,
               grad_boost, xgboost]

# Evaluate every model inside a scaling pipeline so the scaler is fitted only
# on the data each fit sees.
for c in classifiers:
    pip = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", c)
    ])

    # Mean 5-fold cross-validated score on the training split. This is not a
    # score on data the fitted model has already seen, so it is labelled
    # "CV Score" rather than "Train Score".
    score_train = cross_val_score(pip, X_train, y_train, cv=5).mean()
    pip.fit(X_train, y_train)
    y_pred = pip.predict(X_test)
    score_test = accuracy_score(y_test, y_pred)
    print("Model:", c.__class__.__name__, "----->", "CV Score: ", score_train, "----->", "Test Score: ", score_test)