# **Telco Customer Churn Prediction**
### Import Libraries
### Load Data
### Missing Values
### Data Analysis (EDA and Visualization)
### Feature Engineering
### Outlier Analysis
### Label Encoder-One Hot Encoder
### Base Models
### CatBoost Model
### LightGBM Model
### GBM Model

# Import Libraries

In [None]:
# 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# İgnore the Warnings
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', None)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Load Data

In [None]:
# Load data 
df=pd.read_csv("../input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df1=df.copy() # copies data as df1
df2=df.copy()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Convert TotalCharges from string to float.
# Blank or otherwise unparseable entries become NaN. Unlike a length check,
# this keeps valid one-character values and is safe to re-run on a column
# that has already been converted.
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

## Missing Values

In [None]:
# Missin Values
df.isna().sum()

In [None]:
# NaN TotalCharges: if tenure = 0 then TotalCharges = 0
df.loc[df["TotalCharges"].isna(),["tenure","MonthlyCharges", "TotalCharges"]]

In [None]:
# Fill missing TotalCharges with 0.
# Assign the result back instead of using inplace=True on a column selection:
# the chained in-place form is deprecated and does not update df under
# pandas copy-on-write.
df["TotalCharges"] = df["TotalCharges"].fillna(0)

In [None]:
# Missin Value Control
df.isna().sum().sum()

# Data Analysis (EDA + Visualization)

In [None]:
df.describe().T

In [None]:
df.hist(figsize=(6,6));

In [None]:
# Churn count
df.Churn.value_counts()

In [None]:
# Pass the column as a keyword: current seaborn releases no longer accept x positionally
sns.countplot(x="Churn", data=df);

In [None]:
# Grab categerocal and numerical columns
cat_cols=[col for col in df.columns if df[col].nunique()<10 ]
num_cols=[col for col in df.columns if col not in cat_cols and df[col].dtype!="O"]
print("# of cat cols:", len(cat_cols) ," and # of numcols: ", len(num_cols))

### Categorical Variables

In [None]:
# Plot catagorical columns by Churn columns
plot_col=[col for col in cat_cols if col!="Churn"]
for col in plot_col:
    fig, ax=plt.subplots(1,3,figsize=(15,4))
    data1=(df.groupby(col)["Churn"].value_counts(normalize=True)*100).rename("Percentage").reset_index()
   
    sns.countplot(ax=ax[0],x=col, data=df)
    ax[0].set_title(col+"_Count")
   
    sns.countplot(ax=ax[1],x=col, data=df, hue="Churn")
    ax[1].set_title(col+"_Count by Churn")
    
    sns.barplot(ax=ax[2],x=col, y="Percentage", data=data1, hue="Churn",order=df[col].unique())
    ax[2].set_ylim(0,100)
    ax[2].set_title(col+"_Percentage by Churn")

    if col =="PaymentMethod":
          for i in range(3):
            ax[i].tick_params(axis="x",labelrotation=10)

### Numerical Variables

In [None]:
# Distribution of numerical columns by Churn:
# top row = boxplot per column, bottom row = histogram (with KDE) per column.
# The grid width follows num_cols instead of being hard-coded to 3, and
# squeeze=False keeps `ax` two-dimensional even for a single column.
fig, ax = plt.subplots(2, len(num_cols), figsize=(18, 11), squeeze=False)
for i, col in enumerate(num_cols):
    sns.boxplot(ax=ax[0, i], x="Churn", y=col, data=df)
    ax[0, i].set_title(col)
    sns.histplot(ax=ax[1, i], x=df[col], hue=df["Churn"], multiple="dodge", kde=True, bins=30)


In [None]:
# Numerical Columns visualization
fig , ax=plt.subplots(2,2, figsize=(15,10))
for i,col in enumerate(["MonthlyCharges","TotalCharges"]):
    sns.scatterplot(ax=ax[0,i],x="tenure", y=col,data=df,hue="Churn")
    #sns.scatterplot(ax=ax[1,i],x="tenure", y=col,data=df,hue="Churn")
    sns.lineplot(ax=ax[1,i],x="tenure", y=col,data=df,hue="Churn")

In [None]:
# Correlation heatmap of the numeric columns.
# Object columns are excluded explicitly: pandas >= 2.0 raises when corr()
# is called on a frame that still contains them.
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)

## Feature Engineering

In [None]:
df.head()

### New Features from customerID

In [None]:

# customerID layout, e.g. 7795-CFOCW: four digits, a dash, then a block of capital letters

# Characters 0-3 (before the "-"): one integer feature per digit
for pos in range(4):
    df["ID_Fig" + str(pos + 1)] = df.customerID.apply(lambda cid, pos=pos: int(cid[pos]))

# Characters 5-8 (after the "-"): each letter is encoded as its index in the alphabet list
letter = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
for pos in range(4):
    df["ID_Let" + str(pos + 1)] = df.customerID.apply(lambda cid, pos=pos: letter.index(cid[5 + pos]))

In [None]:
# Is there any relation between Churn and these new features? Let's look at the plots.
# Row 0 holds the four digit features, row 1 the four letter features.
fig, ax = plt.subplots(2, 4, figsize=(25, 10))
for row, prefix in enumerate(["ID_Fig", "ID_Let"]):
    for pos in range(4):
        sns.countplot(ax=ax[row, pos], x=df[prefix + str(pos + 1)], hue=df.Churn)



### New Feature(MonthlyCharges/Tenure and TotalCharges/Tenure)


In [None]:
# Ratio features: MonthlyCharges/tenure and TotalCharges/tenure

# New_MCharg_Tenure
# NOTE: rows with tenure == 0 divide by zero, so this column holds inf
# (or NaN for a zero charge) on those rows.
df["New_MCharg_Tenure"] = df.MonthlyCharges / df.tenure

# New_TCharg_Tenure
# 0/0 gives NaN on tenure == 0 rows whose TotalCharges is 0; store 0 there.
# The result is assigned back rather than filled with inplace=True on a column
# selection, which is deprecated and ineffective under pandas copy-on-write.
df["New_TCharg_Tenure"] = (df.TotalCharges / df.tenure).fillna(0)


In [None]:
# EXCELLENT: the boxplot shows that the New_MCharg_Tenure feature is excellent; I think it is very important
fig, ax=plt.subplots(1,2,figsize=(12,5))
sns.boxplot(ax=ax[0],y=df["New_MCharg_Tenure"], x=df.Churn)
sns.boxplot(ax=ax[1],y=df["New_TCharg_Tenure"], x=df.Churn)

In [None]:
fig , ax=plt.subplots(2,2, figsize=(15,10))
for i,col in enumerate(["New_MCharg_Tenure","New_TCharg_Tenure"]):
    sns.scatterplot(ax=ax[0,i],x="tenure", y=col,data=df,hue="Churn")
    #sns.scatterplot(ax=ax[1,i],x="tenure", y=col,data=df,hue="Churn")
    sns.lineplot(ax=ax[1,i],x="tenure", y=col,data=df,hue="Churn")

## Outlier Analysis

In [None]:
# Summarises the IQR-based outliers of the numeric columns in a table,
# optionally draws a boxplot per column, and also returns the outlying observations.
def outliers_view(df, num_treshould=20, q1=0.25, q3=0.75, visul=False):
    """Summarise IQR-based outliers for the numeric columns of ``df``.

    Only columns that are not of object dtype and have more than
    ``num_treshould`` distinct values are analysed. For each of them the limits
    are ``Q(q1) - 1.5 * IQR`` and ``Q(q3) + 1.5 * IQR`` with
    ``IQR = Q(q3) - Q(q1)``; values strictly outside the limits are outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse; it is not modified.
    num_treshould : int
        Minimum number of distinct values (exclusive) for a column to be analysed.
    q1, q3 : float
        Lower and upper quantiles used for the IQR.
    visul : bool
        When true, show one boxplot per analysed column.

    Returns
    -------
    table : pandas.DataFrame
        One row per analysed column with the columns ``low_limit``, ``q1``,
        ``mean``, ``median``, ``q3``, ``up_limit``, ``count`` and ``percent``
        (the two quantile columns are labelled with the numeric values of
        ``q1`` and ``q3``).
    outliers_values : pandas.DataFrame
        The outlying observations, indexed like ``df``; columns without any
        outlier are dropped.
    """
    num_cols = [
        col for col in df.columns
        if df[col].dtype != "O" and df[col].nunique() > num_treshould
    ]

    if visul:
        for col in num_cols:
            sns.boxplot(y=df[col])
            plt.xlabel(col)
            plt.show()

    table = pd.DataFrame()
    outliers_dict = {}
    for col in num_cols:
        values = df[col]
        q_1 = values.quantile(q1)
        q2 = values.quantile(0.5)
        q_3 = values.quantile(q3)
        iqr = q_3 - q_1
        up = q_3 + 1.5 * iqr
        low = q_1 - 1.5 * iqr

        # Compute the outlier selection once and reuse it for count and values.
        outliers = values[(values < low) | (values > up)]
        outliers_count = len(outliers)

        table.loc[col, "low_limit"] = low
        table.loc[col, q1] = q_1
        table.loc[col, "mean"] = values.mean()
        table.loc[col, "median"] = q2
        table.loc[col, q3] = q_3
        table.loc[col, "up_limit"] = up
        table.loc[col, "count"] = outliers_count
        table.loc[col, "percent"] = outliers_count / df.shape[0] * 100
        outliers_dict[col] = outliers

    outliers_values = pd.DataFrame.from_dict(outliers_dict)
    # Keep only the columns that actually contain at least one outlier.
    with_outliers = [
        col for col in outliers_values.columns
        if outliers_values[col].notnull().sum() != 0
    ]
    outliers_values = outliers_values[with_outliers]

    return table, outliers_values

# Replace outliers with the IQR thresholds
def outlier_press(dataframe, column=all, num_treshould=20, q1=0.25, q3=0.75):
    """Cap the outliers of ``dataframe`` at their IQR limits, in place.

    The limits come from ``outliers_view`` called with the same
    ``num_treshould``, ``q1`` and ``q3``. Values below the lower limit are set
    to the lower limit and values above the upper limit to the upper limit.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to modify; the same object is returned.
    column : iterable of column names, a single column name, or the builtin ``all``
        Columns to cap. The default (the builtin ``all``, used as a sentinel)
        means every column in which ``outliers_view`` finds outliers.
    num_treshould, q1, q3
        Forwarded to ``outliers_view``.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` itself, after capping.

    Raises
    ------
    KeyError
        If a requested column is not analysed by ``outliers_view`` (missing,
        object dtype, or too few distinct values).
    """
    # One analysis pass is enough: capping one column does not change the
    # limits of any other column.
    table, outliers_values = outliers_view(
        dataframe, num_treshould=num_treshould, q1=q1, q3=q3
    )

    if column is all:
        columns = list(outliers_values.columns)
    elif isinstance(column, str):
        # A bare string would otherwise be iterated character by character.
        columns = [column]
    else:
        columns = list(column)

    for col in columns:
        if col not in table.index:
            raise KeyError(
                f"{col!r} is not a numeric column with more than "
                f"{num_treshould} distinct values"
            )
        if col not in outliers_values.columns:
            # Requested explicitly but has no outliers: nothing to cap.
            continue
        low_limit = table.loc[col, "low_limit"]
        up_limit = table.loc[col, "up_limit"]
        dataframe[col] = dataframe[col].clip(lower=low_limit, upper=up_limit)
    return dataframe

In [None]:
outliers_view(df,q3=0.95)

In [None]:
df=outlier_press(df,["New_MCharg_Tenure"],q3=0.90)

### Numerical New Columns visualization

In [None]:
# New_MCharg_Tenure
sns.histplot(x=df["New_MCharg_Tenure"],hue=df["Churn"],multiple="dodge",kde=True,bins=30)


In [None]:
# New_TCharg_Tenure
sns.histplot(x=df["New_TCharg_Tenure"],hue=df["Churn"],multiple="dodge",kde=True,bins=30)


## Label Encoder-One Hot Encoder

In [None]:
# LABEL ENCODER AND ONE HOT ENCODDER Functions

from sklearn import preprocessing
# Label Encoder
def label_encoder(dataframe, binary_cols):
    """Label-encode the given columns of ``dataframe`` in place and return it.

    A separate ``LabelEncoder`` is fitted for every column named in
    ``binary_cols``, and the column is overwritten with the encoder's output.
    """
    for column_name in binary_cols:
        encoded = preprocessing.LabelEncoder().fit_transform(dataframe[column_name])
        dataframe[column_name] = encoded
    return dataframe


# One Hot Encoder
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a new frame in which ``categorical_cols`` are replaced by dummy columns.

    Thin wrapper around ``pandas.get_dummies``; ``drop_first`` is forwarded
    unchanged. The input frame is not modified.
    """
    return pd.get_dummies(data=dataframe, columns=categorical_cols, drop_first=drop_first)

In [None]:
# Grab binary columns and Categorical but not binary columns 
binary_cols=[col for col in cat_cols if df[col].nunique()==2]
notbinary_cat_cols=[col for col in cat_cols if col not in binary_cols]

In [None]:
# Label Encoder
label_encoder(df,binary_cols)

# Onde Hot Encoder
df=one_hot_encoder(df,notbinary_cat_cols)

In [None]:
df.head()

## Standard Scaler

In [None]:
#scaler_cols=[col for col  in df.columns if (df[col].dtype!="O") & (df[col].nunique()>5)]

In [None]:
"""from sklearn.preprocessing import StandardScaler
for col in scaler_cols:
    scaler = StandardScaler().fit(df[[col]])
    df[col]=scaler.transform(df[[col]])"""

# Model

In [None]:
# İmport Libraries
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score

# İmport All Base Models
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Model Evaluation metrics
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix, recall_score



In [None]:
# Train-Test Split
y = df["Churn"]
x = df.drop(["customerID", "Churn"], axis=1)  # customerID is an identifier, not a feature
# A fixed seed makes the split, and every score computed from it, reproducible.
# Stratifying keeps the Churn class ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=17, stratify=y
)

## Base Models

In [None]:
# Base Models Things
models = [('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier()),
          ('RF', RandomForestClassifier()),
          ('GBM', GradientBoostingClassifier()),
          ("XGBoost", XGBClassifier()),
          ("LightGBM", LGBMClassifier()),
          ("CatBoost", CatBoostClassifier(verbose=False)),
          ("LOG",LogisticRegression()),
          ("SVC",SVC())]


In [None]:
# Base Model Scores
test_scor=[]
train_scor=[]
name1=[]
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_score=accuracy_score(y_pred, y_test) 
    y_pred_train=model.predict(X_train)
    train_score=accuracy_score(y_pred_train, y_train)
    name1.append(name)
    test_scor.append(test_score)
    train_scor.append(train_score)
    
base_model_score=pd.DataFrame({"indeks":name1,"Test_Scor":test_scor,"Train_Scor":train_scor})
#base_model_score.index=name
base_model_score.set_index("indeks")

In [None]:
# Test vs. train accuracy of every base model
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(base_model_score.indeks, base_model_score.Test_Scor, label="Test Scor", marker="d")
ax.plot(base_model_score.indeks, base_model_score.Train_Scor, label="Train Scor", marker="d")
plt.legend()
# 0.80 accuracy reference line; its end follows the number of plotted models
# (categorical x positions are 0..n-1) instead of a hard-coded 9.
plt.hlines(0.80, 0, len(base_model_score) - 1, colors="red")
plt.xticks(rotation=90);

In [None]:
#######################################
# Feature Importance
#######################################

def plot_importance(model, X, num=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model
        Fitted estimator exposing ``feature_importances_``.
    X : pandas.DataFrame
        Frame whose columns name the features, in the order the model saw them.
    num : int or None
        Number of most important features to show; ``None`` (default) shows
        all of them. The default no longer reads the module-level ``x`` when
        the function is defined.

    The figure is shown and also saved as ``importances-01.png``; the file is
    overwritten on every call.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': X.columns})
    ranked = feature_imp.sort_values(by="Value", ascending=False)
    if num is not None:
        ranked = ranked[0:num]

    plt.figure(figsize=(10, 15))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=ranked)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('importances-01.png')
    plt.show()

# **5 Important Metrics: Accuracy, Precision, Recall, F1-Score and ROC Curve**

# CatBoost, LightGBM and GBM Models

## CatBoost Model

In [None]:
# CatBoostClassifier() with default parameters
model = CatBoostClassifier(verbose=False).fit(X_train, y_train)
y_pred = model.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
y_pred_train = model.predict(X_train)
train_score = accuracy_score(y_train, y_pred_train)

print("Test Scor: ",test_score)
print("Train Score", train_score)
print("###########################")
# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed and precision/recall are exchanged in the report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Model Tuning

catb_params = {"iterations": [200, 500, 800,1000],
               "learning_rate": [0.001, 0.01, 0.1],
               "depth": [1 ,2, 3, 5, 6]}

catb_model =CatBoostClassifier(verbose=False)
catb_cv_model = GridSearchCV(catb_model,
                             catb_params,
                             cv=5,
                             n_jobs=-1,
                             verbose=2).fit(X_train, y_train)

catb_cv_model.best_params_

In [None]:
# Final Model: CatBoost refitted with the best grid-search parameters

catb_tuned = CatBoostClassifier(**catb_cv_model.best_params_,verbose=False).fit(X_train, y_train)
y_pred = catb_tuned.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
y_pred_train = catb_tuned.predict(X_train)
train_score = accuracy_score(y_train, y_pred_train)

print("Test Scor: ",test_score)
print("Train Score", train_score)
# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed and precision/recall are exchanged in the report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Roc Curve
# metrics.plot_roc_curve was removed in scikit-learn 1.2; use its replacement
# when it is available and fall back to the old helper on older versions.
if hasattr(metrics, "RocCurveDisplay") and hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(catb_tuned, X_test, y_test)
else:
    metrics.plot_roc_curve(catb_tuned, X_test, y_test)

In [None]:
# Plot İmportance
plot_importance(catb_tuned , X_test)

## LightGBM Model

In [None]:
# LGBMClassifier() with default parameters
model = LGBMClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
y_pred_train = model.predict(X_train)
train_score = accuracy_score(y_train, y_pred_train)

print("Test Scor: ",test_score)
print("Train Score", train_score)
print("###########################")
# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed and precision/recall are exchanged in the report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Model Tuning

lgb_model = LGBMClassifier()

lgbm_params = {"learning_rate": [0.005,0.01, 0.1],
               "n_estimators": [300, 500, 1000],
               "max_depth": [2, 3, 5, 8],
               "colsample_bytree": [1,0.9, 0.8, 0.6,0.5]}

lgbm_cv_model = GridSearchCV(lgb_model,
                             lgbm_params,
                             cv=10,
                             n_jobs=-1,
                             verbose=2).fit(X_train, y_train)

lgbm_cv_model.best_params_

In [None]:
# Final Model: LightGBM refitted with the best grid-search parameters

lgbm_tuned = LGBMClassifier(**lgbm_cv_model.best_params_).fit(X_train, y_train)
y_pred = lgbm_tuned.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
y_pred_train = lgbm_tuned.predict(X_train)
train_score = accuracy_score(y_train, y_pred_train)

print("Test Scor: ",test_score)
print("Train Score", train_score)
print("###########################")
# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed and precision/recall are exchanged in the report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Roc Curve
# metrics.plot_roc_curve was removed in scikit-learn 1.2; use its replacement
# when it is available and fall back to the old helper on older versions.
if hasattr(metrics, "RocCurveDisplay") and hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(lgbm_tuned, X_test, y_test)
else:
    metrics.plot_roc_curve(lgbm_tuned, X_test, y_test)

In [None]:
# Plot İmportance
plot_importance(lgbm_tuned , X_test)

## GBM Model

In [None]:
# GradientBoostingClassifier() with default parameters
model = GradientBoostingClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
y_pred_train = model.predict(X_train)
train_score = accuracy_score(y_train, y_pred_train)

print("Test Scor: ",test_score)
print("Train Score", train_score)
print("###########################")
# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed and precision/recall are exchanged in the report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Model Tuning

gbm_params = {"learning_rate": [0.005,0.01, 0.1],
              "max_depth": [3, 8],
              "n_estimators": [300,500,800, 1000],
              "subsample": [0.3, 0.5, 0.7, 1]}

gbm_model = GradientBoostingClassifier(random_state=17)
gbm_cv_model = GridSearchCV(gbm_model, gbm_params, cv=5, n_jobs=-1, verbose=2).fit(X_train, y_train)
gbm_cv_model.best_params_

In [None]:
# Final Model: GBM refitted with the best grid-search parameters

# Reuse the seed the grid search was run with (random_state=17) so the final
# model matches the configuration that was actually selected.
gbm_tuned = GradientBoostingClassifier(**gbm_cv_model.best_params_, random_state=17).fit(X_train, y_train)
y_pred = gbm_tuned.predict(X_test)
test_score = accuracy_score(y_test, y_pred)
y_pred_train = gbm_tuned.predict(X_train)
train_score = accuracy_score(y_train, y_pred_train)

print("Test Scor: ",test_score)
print("Train Score", train_score)
print("###########################")
# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed and precision/recall are exchanged in the report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
# Roc Curve
# metrics.plot_roc_curve was removed in scikit-learn 1.2; use its replacement
# when it is available and fall back to the old helper on older versions.
if hasattr(metrics, "RocCurveDisplay") and hasattr(metrics.RocCurveDisplay, "from_estimator"):
    metrics.RocCurveDisplay.from_estimator(gbm_tuned, X_test, y_test)
else:
    metrics.plot_roc_curve(gbm_tuned, X_test, y_test)

In [None]:
# Plot İmportance
plot_importance(gbm_tuned , X_test)

> # **Thank You**