In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns',None)

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder




In [None]:
# Load the Telco customer-churn CSV and display the resulting frame.
df =pd.read_csv('../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Summary statistics of the raw frame.
df.describe()

In [None]:
# Class balance of the target column.
# The column is named by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so passing the Series positionally no longer plots `x`.
sns.countplot(x='Churn', data=df);

In [None]:
# Share of customers who stayed versus those who churned.

churn_flags = df['Churn']
retained = int((churn_flags == 'No').sum())
left = int((churn_flags == 'Yes').sum())

total_customers = retained + left
retained_percentage = 100 * retained / total_customers

print('Loyal Customers percentage = ', retained_percentage , '%')
print('Churn Customers percentage = ', 100- retained_percentage , '%')


### The data is imbalanced: churned customers make up a much smaller share than loyal customers

In [None]:
# Peek at the first two rows.
df.head(2)

In [None]:
# List the available column names.
df.columns

In [None]:
# Churn counts broken down by each categorical feature, one panel per feature.

plt.figure(figsize=(16,25))

# Panels are filled in this order on a 5x3 grid (the last grid slot stays empty).
panel_columns = [
    'gender', 'InternetService', 'PhoneService',
    'PaymentMethod', 'Contract', 'OnlineBackup',
    'Partner', 'StreamingMovies', 'StreamingTV',
    'OnlineSecurity', 'PaperlessBilling', 'TechSupport',
    'Dependents', 'DeviceProtection',
]

for panel_no, column in enumerate(panel_columns, start=1):
    plt.subplot(5, 3, panel_no)
    plt.grid(True)
    sns.countplot(x=column, hue='Churn', data=df)

#plt.tight_layout()

In [None]:
num_features=['tenure', 'MonthlyCharges']

fig, ax = plt.subplots(1, 2, figsize=(28,8))

# Overlay both churn groups on the same pair of axes: retained first, churned on top.
for churn_label, shade in (('No', 'blue'), ('Yes', 'black')):
    group = df.loc[df['Churn'] == churn_label, num_features]
    group.hist(bins=20, color=shade, alpha=0.5, ax=ax);


# Feature Engineering

In [None]:
# Build the modelling frame: the raw data without the customerID column.
df1=df.drop('customerID', axis=1)

In [None]:
# Convert all non-numerical columns to numeric

# TotalCharges is meant to be a number. If it was read as text, parse it rather
# than label-encoding it: label codes follow string order, not numeric order.
# NOTE(review): unparseable entries are filled with 0 — confirm this suits the data.
if 'TotalCharges' in df1.columns and not pd.api.types.is_numeric_dtype(df1['TotalCharges']):
    df1['TotalCharges'] = pd.to_numeric(df1['TotalCharges'], errors='coerce').fillna(0)

for i in df1.columns:
    # is_numeric_dtype recognises every int/float dtype, whereas comparing a
    # dtype against np.number does not match integer columns and is deprecated.
    if pd.api.types.is_numeric_dtype(df1[i]):
        continue
    df1[i]= LabelEncoder().fit_transform(df1[i])

In [None]:
# Inspect the first rows after encoding.
df1.head(4)

In [None]:
# Check the dtypes after encoding.
df1.info()

In [None]:
# Histogram of every column in the encoded frame.
df1.hist();

In [None]:
# Scale the data

target_name = 'Churn'
y = df1[target_name]

# Standardise every remaining column; X ends up holding the scaled features.
feature_scaler = StandardScaler()
X = feature_scaler.fit_transform(df1.drop(target_name, axis=1))

In [None]:
# Split the unscaled features first, then fit the scaler on the training rows
# only, so that statistics of the test rows do not leak into the preprocessing.
# Same test_size and random_state as before, so the rows land in the same split.
raw_features = df1.drop('Churn', axis=1)
x_train, x_test, y_train, y_test = train_test_split(raw_features, y, test_size=0.2, random_state=7)

train_scaler = StandardScaler().fit(x_train)
x_train = train_scaler.transform(x_train)
x_test = train_scaler.transform(x_test)

# Logistic Regression

In [None]:
# Create a logistic-regression model with the library's default settings
lr=LogisticRegression()
# Train the model on the training split
lr.fit(x_train,y_train)

In [None]:
# Predict the target for the held-out test rows

predict= lr.predict(x_test)

In [None]:
# Print the classification report comparing the true test labels with the predictions

print(classification_report(y_test, predict))

# XGBoost vs LightGBM vs CatBoost

In [None]:
from time import time

In [None]:
# Target and feature frame for the boosting comparison (taken straight from df1, no scaling).
y=df1['Churn']
x= df1.drop('Churn', axis=1)

In [None]:
# Result stores filled in by the model cells below, keyed by a short model name.
accuracy={}  # mean cross-validated accuracy
speed= {}    # elapsed run time in seconds

# Gradient Boosting (scikit-learn) and XGBoost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Baseline: scikit-learn gradient boosting, scored with repeated stratified 5-fold CV.
gbc = GradientBoostingClassifier()

start = time()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
score = cross_val_score(gbc, x, y, scoring='accuracy', cv=cv, n_jobs=-1)
elapsed = time() - start

# Record the rounded results under the "GBC" key.
speed["GBC"] = np.round(elapsed, 3)
accuracy["GBC"] = np.mean(score).round(3)

print(
    f"Mean Accuracy: {accuracy['GBC']}\nStd: {np.std(score):.3f}\nRun time: {speed['GBC']}s"
    )

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost with default settings, scored with repeated stratified 5-fold CV.
xgb = XGBClassifier()

start = time()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
# Use `x`, the feature frame defined for this comparison and used by the
# GradientBoostingClassifier cell, so every model is scored on the same input.
score = cross_val_score(xgb, x, y, scoring="accuracy", cv=cv, n_jobs=-1)

# Record the rounded results under the "XGB" key.
speed["XGB"] = np.round(time() -start, 3)
accuracy["XGB"] = np.mean(score).round(3)

print(
    f"Mean Accuracy: {accuracy['XGB']}\nStd: {np.std(score):.3f}\nRun time: {speed['XGB']}s"
    )


# LGBM

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# LightGBM with default settings, scored with repeated stratified 5-fold CV.
lgbm = LGBMClassifier()

start = time()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
# Use `x`, the feature frame defined for this comparison and used by the
# GradientBoostingClassifier cell, so every model is scored on the same input.
score = cross_val_score(lgbm, x, y, scoring="accuracy", cv=cv, n_jobs=-1)

# Record the rounded results under the "LGBM" key.
speed["LGBM"] = np.round(time() -start, 3)
accuracy["LGBM"] = np.mean(score).round(3)

print(
    f"Mean Accuracy: {accuracy['LGBM']}\nStd: {np.std(score):.3f}\nRun time: {speed['LGBM']}s"
    )


# CatBoosting

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost with default settings, scored with repeated stratified 5-fold CV.
cat = CatBoostClassifier()

start = time()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=0)
# Use `x`, the feature frame defined for this comparison and used by the
# GradientBoostingClassifier cell, so every model is scored on the same input.
score = cross_val_score(cat, x, y, scoring="accuracy", cv=cv, n_jobs=-1)

# Record the rounded results under the "CAT" key.
speed["CAT"] = np.round(time() -start, 3)
accuracy["CAT"] = np.mean(score).round(3)

print(
    f"Mean Accuracy: {accuracy['CAT']}\nStd: {np.std(score):.3f}\nRun time: {speed['CAT']}s"
    )


In [None]:
print('Accuracy:')
# Models ordered from highest to lowest mean accuracy.
dict(sorted(accuracy.items(), key=lambda entry: entry[1], reverse=True))

In [None]:
print('Speed :')
# Models ordered from longest to shortest run time.
dict(sorted(speed.items(), key=lambda entry: entry[1], reverse=True))

# Coming up soon..................

# Boosting After Optimisation 

# Pipeline

Pipelines are a simple way to keep your data preprocessing and modeling code organized. Specifically, a pipeline bundles preprocessing and modeling steps so you can use the whole bundle as if it were a single step.

Many data scientists hack together models without pipelines, but pipelines have some important benefits. Those include:

    1. Cleaner Code: Accounting for data at each step of preprocessing can get messy. With a pipeline, you won't need to manually keep track of your training and validation data at each step.
    2. Fewer Bugs: There are fewer opportunities to misapply a step or forget a preprocessing step.
    3. Easier to Productionize: It can be surprisingly hard to transition a model from a prototype to something deployable at scale. We won't go into the many related concerns here, but pipelines can help.
    4. More Options for Model Validation: You will see an example in the next tutorial, which covers cross-validation.

In [None]:
# Disabled draft: the statement is wrapped in a string literal, so it is not executed.
'''df.drop('customerID', axis=1, inplace=True)''';

In [None]:
# Disabled draft (string literal, not executed): feature list and 0/1 target for the pipeline.
'''col= ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges']


X=df[col]
y= df['Churn'].replace({'Yes':1 , 'No':0})''';

In [None]:
# Disabled draft (string literal, not executed): train/test split for the pipeline.
'''X_train,X_test, y_train,y_test= train_test_split(X,y) ''';

In [None]:
# Disabled draft (string literal, not executed): transformer and model instances for the pipeline.
'''scale=StandardScaler()
ohe=OneHotEncoder()
le=LabelEncoder()
# define ordinal encoding
oe = OrdinalEncoder()
logreg=LogisticRegression()''';

'gender', 'SeniorCitizen', 'Partner', 'Dependents',
        'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod'

In [None]:
# Disabled draft (string literal, not executed): one-hot encode the categorical
# columns and scale the numeric ones.
# NOTE(review): 'Contract' is in the draft feature list but not in the one-hot
# list, so with remainder='passthrough' it would reach the model unencoded.
'''ct= make_column_transformer((ohe,['gender', 'SeniorCitizen', 'Partner', 'Dependents','PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport','StreamingTV', 'StreamingMovies',  'PaperlessBilling',
                                  'PaymentMethod']), (scale,['tenure', 'MonthlyCharges', 'TotalCharges']), remainder='passthrough')
                        
''';

In [None]:
# Disabled draft (string literal, not executed): chain the column transformer and the model, then fit.
'''pipe = make_pipeline(ct, logreg)
pipe.fit(X_train, y_train)''';