In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **What is Churn ?**


**Customer Churn means that a customer (player, subscriber, user, etc.) has terminated its relationship with a company. Online businesses often treat a customer as if they have churned after a certain amount of time since the customer's last interaction with the site or service. The full cost of loss includes both the loss of revenue and the marketing costs associated with replacing these customers. Reducing user churn is a core business goal of every online business. In this project, a classification model is created with the data we have now to predict the customers who may churn in the future. By using this model, it may be possible to identify the customers who are likely to churn in the future, reach them and offer opportunities or advantageous offers to prevent them from churning.** 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Exploratory Data Analysis

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.head(20)

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
# Fix column dtypes in one pass:
#   * TotalCharges: single-space placeholders become 0, then the column is cast to float32.
#   * SeniorCitizen: stored as a categorical instead of a plain numeric flag.
df = df.assign(
    TotalCharges=lambda frame: frame['TotalCharges'].replace(" ", 0).astype('float32'),
    SeniorCitizen=lambda frame: pd.Categorical(frame['SeniorCitizen']),
)

In [None]:
df.describe().T

In [None]:
# Pairwise correlation is only defined for numeric columns. Select them explicitly:
# pandas >= 2.0 no longer drops non-numeric columns silently and would raise on the
# string-valued columns of this frame.
df.select_dtypes(include="number").corr()

# Data Visualisation 

In [None]:
def kdeplot(feature):
    """Plot the density of a numeric column separately for churned and retained customers.

    Reads the notebook-level ``df`` and draws two KDE curves on a new 9x4 figure:
    one for rows where ``Churn == 'No'`` (navy) and one for ``Churn == 'Yes'`` (orange).

    Parameters
    ----------
    feature : str
        Name of the column in ``df`` whose distribution is plotted.
    """
    plt.figure(figsize=(9, 4))
    plt.title("KDE for {}".format(feature))
    for churn_value, colour in (("No", "navy"), ("Yes", "orange")):
        values = df.loc[df["Churn"] == churn_value, feature].dropna()
        ax = sns.kdeplot(values, color=colour, label="Churn: {}".format(churn_value))
    # The curve labels are only visible if a legend is requested explicitly.
    ax.legend()
# One churn-split density plot per column.
for numeric_feature in ('tenure', 'MonthlyCharges', 'TotalCharges'):
    kdeplot(numeric_feature)

In [None]:
def barplot_percentages(feature, orient='v', axis_name="percentage of customers"):
    """Bar chart of the share of all customers in each (feature value, Churn) pair.

    Reads the notebook-level ``df``. Each bar's length is the number of rows with
    that feature value and churn label divided by ``len(df)``, so all bars together
    sum to 100%. Draws on the current matplotlib figure.

    Parameters
    ----------
    feature : str
        Column of ``df`` to group by.
    orient : {'v', 'h'}
        'v' puts the feature on the x axis; anything else draws horizontal bars.
    axis_name : str
        Label of the value axis (also the name of the computed column).
    """
    from matplotlib.ticker import PercentFormatter

    # Name the result column explicitly instead of renaming whatever value_counts()
    # produced: that name differs between pandas versions ("Churn" vs. "count").
    g = (
        df.groupby([feature, "Churn"])
        .size()
        .div(len(df))
        .reset_index(name=axis_name)
    )

    # A formatter keeps labels correct if the ticks are recomputed; hard-coding the
    # tick labels from the current tick positions does not.
    as_percent = PercentFormatter(xmax=1.0, decimals=0)
    if orient == 'v':
        ax = sns.barplot(x=feature, y=axis_name, hue='Churn', data=g, orient=orient)
        ax.yaxis.set_major_formatter(as_percent)
    else:
        ax = sns.barplot(x=axis_name, y=feature, hue='Churn', data=g, orient=orient)
        ax.xaxis.set_major_formatter(as_percent)
plt.figure(figsize=(9, 4.5))
barplot_percentages("MultipleLines", orient='h')

In [None]:
ax = sns.catplot(x="MultipleLines", y="MonthlyCharges", hue="Churn", kind="violin",
                 split=True, palette="pastel", data=df, height=4.2, aspect=1.4)

In [None]:
plt.figure(figsize=(9, 4.5))
barplot_percentages("InternetService", orient="h")

In [None]:
# Count, per add-on service, how many internet customers have each service status.
cols = ["OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]

# Only rows whose InternetService is not "No" are counted.
has_internet = df["InternetService"] != "No"
df1 = df.loc[has_internet, cols].melt().rename(columns={'value': 'Has service'})

plt.figure(figsize=(10, 4.5))
ax = sns.countplot(data=df1, x='variable', hue='Has service')
ax.set(xlabel='Additional service', ylabel='Num of customers')
plt.show()

In [None]:
def display_bars_splitted(a,b="Churn"):
    """Show a plotly bar chart of row counts per value of ``a``, coloured by ``b``.

    Reads the notebook-level ``df``. Each bar segment is annotated with the
    percentage that segment represents within its ``a`` group.

    Parameters
    ----------
    a : str
        Column of ``df`` placed on the x axis.
    b : str
        Column of ``df`` used to colour the bars (default "Churn").
    """
    # Aggregate once; the count column is named directly rather than patched in later.
    df_g = df.groupby([a, b]).size().reset_index(name='Counts')
    # transform("sum") broadcasts each group's total back onto its rows, so the
    # percentages stay aligned by index rather than by position.
    group_totals = df_g.groupby(a)['Counts'].transform('sum')
    df_g['Percentage'] = 100 * df_g['Counts'] / group_totals

    labels = df_g['Percentage'].apply(lambda x: '{0:1.2f}%'.format(x))
    fig = px.bar(df_g, x=a, y=['Counts'], color=b, title = a, text=labels)
    fig.show()
display_bars_splitted('PaperlessBilling')

In [None]:
plt.figure(figsize=(9, 4.5))
barplot_percentages("PaymentMethod", orient='h')

In [None]:
ax = sns.catplot(y="Churn", kind="count", data=df, height=2.6, aspect=2.5, orient='h')

# Feature Selection

In [None]:
# Positional selection of the modelling columns. Later cells rely on this subset
# containing gender, PhoneService, InternetService, DeviceProtection, StreamingTV,
# Contract, PaperlessBilling, PaymentMethod and Churn.
# NOTE(review): the positions depend on the CSV column order; confirm against df.columns.
# .copy() makes yeni_df an independent frame, so the column assignments in the
# encoding step do not write into a slice of df.
yeni_df = df.iloc[:,[1,5,6,8,11,13,15,16,17,18,20]].copy()

In [None]:
yeni_df.head()

# Categorical Variable Encoding

In [None]:
# Encode the two-valued columns as 0/1.
# The values are derived from the raw `df` columns (same row index as yeni_df) rather
# than from yeni_df itself, so re-running this cell yields the same result instead
# of turning the already-encoded 0/1 values into all zeros.
yeni_df["gender"] = (df["gender"] == "Female").astype(int)
for yes_no_column in ["PhoneService", "Churn", "PaperlessBilling"]:
    yeni_df[yes_no_column] = (df[yes_no_column] == "Yes").astype(int)

In [None]:
# One-hot encode the multi-valued categorical columns, keeping every level
# (drop_first=False).
dummy_columns = [
    'InternetService',
    'DeviceProtection',
    'StreamingTV',
    'Contract',
    'PaymentMethod',
]
yeni_df = pd.get_dummies(yeni_df, columns=dummy_columns, drop_first=False)

In [None]:
yeni_df.head(10)

**Train Test Split**

In [None]:

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


In [None]:
# Target is the encoded Churn column; every other column is a feature.
y = yeni_df["Churn"]
X = yeni_df.drop(columns=["Churn"])

# Hold out 30% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=2021
)

# LogisticRegression Model

In [None]:
from sklearn.linear_model import LogisticRegression
loj = LogisticRegression(solver = "liblinear")
loj_model = loj.fit(X_train,y_train)
loj_model

In [None]:
loj_model.intercept_

In [None]:
loj_model.coef_

In [None]:
y_pred = loj_model.predict(X_test)

In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))

**Accuracy is the ratio of correct guesses to the total number of guesses.**

 **Precision measures how many of the customers we predict will churn actually do churn.** 

 **Recall measures how many of the customers who actually churn we correctly identify. We look at it because its denominator includes the false negatives: customers who are predicted not to churn but do churn.** 

 **F1 - Score is the harmonic mean of precision and recall. If we are looking for balance between precision and recall, we look for cases where F1-Score is maximum.**

<img src="https://i.ytimg.com/vi/RYFViaaJxE8/hqdefault.jpg" />
<img src="https://miro.medium.com/max/564/1*OYQpRezFugSZa4HSnBixVw@2x.jpeg" />


# SVM Model

In [None]:
# Rebuilds X / y and the train/test partition with the same inputs, test_size and
# random_state as the split cell used for the logistic regression, so the result is
# expected to be the same partition. NOTE(review): redundant when the notebook is
# run top to bottom; consider removing.
y = yeni_df["Churn"]
X = yeni_df.drop(['Churn'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.30, 
                                                    random_state=2021)

In [None]:
from sklearn.svm import SVC
svm_model = SVC(kernel = "linear").fit(X_train, y_train)

In [None]:
svm_model

In [None]:
y_pred = svm_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

# XGBoost Model

In [None]:
# Rebuilds X / y and the train/test partition with the same inputs, test_size and
# random_state as the earlier split cells, so the result is expected to be the same
# partition. NOTE(review): redundant when the notebook is run top to bottom;
# consider removing.
y = yeni_df["Churn"]
X = yeni_df.drop(['Churn'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.30, 
                                                    random_state=2021)

In [None]:
from xgboost import XGBClassifier
xgb_model = XGBClassifier().fit(X_train, y_train)

In [None]:
xgb_model

In [None]:
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)

# RandomForestClassifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Seed the forest with the same value used for the train/test split, so the reported
# accuracy (and the model ranking in the comparison) is reproducible across runs.
rf_model = RandomForestClassifier(random_state=2021).fit(X_train, y_train)

In [None]:
rf_model

In [None]:
y_pred = rf_model.predict(X_test)
accuracy_score(y_test, y_pred)

# Comparison of Classification Models

In [None]:
# Fitted classifiers to compare, in the order they were trained.
modeller = [loj_model, svm_model, xgb_model, rf_model]

# Print each model's class name and its accuracy on the held-out test set.
for model in modeller:
    isimler = type(model).__name__
    y_pred = model.predict(X_test)
    dogruluk = accuracy_score(y_test, y_pred)
    report_lines = ["-"*28, isimler + ":", "Accuracy: {:.4%}".format(dogruluk)]
    print("\n".join(report_lines))

In [None]:
# Collect one record per model and build the results frame in a single step.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is
# quadratic in any version.
sonuc = []

for model in modeller:
    isimler = model.__class__.__name__
    y_pred = model.predict(X_test)
    dogruluk = accuracy_score(y_test, y_pred)
    # Accuracy is stored as a percentage to match the x-axis label below.
    sonuc.append({"Modeller": isimler, "Accuracy": dogruluk*100})

sonuclar = pd.DataFrame(sonuc, columns= ["Modeller","Accuracy"])

sns.barplot(x= 'Accuracy', y = 'Modeller', data=sonuclar, color="purple")
plt.xlabel('Accuracy %')
plt.title('Accuracy of Models ');

# Conclusion

**The Support Vector Machine classifier has the best accuracy among the classifier models shown above. With this model we may be able to reach customers before they churn and might be able to convince them to stay.**