In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# then print them one per line in os.walk order.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Telco Customer Churn
### Focused customer retention programs

The churn rate, also known as the rate of attrition or customer churn, is the rate at which customers stop doing business with an entity. It is most commonly expressed as the percentage of service subscribers who discontinue their subscriptions within a given time period.

----

<img src="https://cdn-images-1.medium.com/fit/t/1600/480/0*d58iZ6esNNcfntQ7">

----

### Context

"Predict behavior to retain customers. Our goal is to analyze all relevant customer data and develop focused customer retention programs." 

### Content

The data set includes information about:

* Customers who left within the last month – the column is called Churn
* Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
* Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges.
* Demographic info about customers – gender, age range, and if they have partners and dependents

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
import plotly.offline as po

### Loading Data

In [None]:
# Location of the Telco customer churn CSV inside the Kaggle input directory.
telco_csv_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Read the raw data into the frame used by the rest of the notebook.
customer_churn = pd.read_csv(telco_csv_path)

### Data Overview

In [None]:
# Preview the first five rows of the raw data.
customer_churn.iloc[:5]

In [None]:
# Collect the column names once and report them together with the frame's dimensions.
column_names = list(customer_churn.columns)
print('The names of columns in our dataset:\n ', column_names)
print("\nThe total number of columns in our data set: ", len(column_names))
print("\nThe shape of our data set is: ", customer_churn.shape)

Now, let's check if we have any missing values in our dataset

In [None]:
# Number of NaN entries per column.
# Note: isna() only counts real NaN values; blank strings (such as the " " entries
# that a later cell replaces in 'TotalCharges') are not counted here.
missing_per_column = customer_churn.isna().sum()

# Emit the framed report with one print call; sep='\n' puts each part on its own line.
print(
    '-------------------------------',
    "Rows\t\tMissing values",
    "-------------------------------",
    missing_per_column,
    '------------------------------',
    sep='\n',
)

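`isna()` reports no NaN values in any column. Note, however, that `TotalCharges` is stored as text and contains some blank strings, which `isna()` does not count; they are handled further below.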

Time to check what type of data we have.

In [None]:
# Data type of every column in the frame.
column_dtypes = customer_churn.dtypes

# Emit the framed report with one print call; sep='\n' puts each part on its own line.
print(
    '----------------------------',
    "Rows\t\tData types",
    '----------------------------',
    column_dtypes,
    '----------------------------',
    sep='\n',
)

Here we can see that many of the columns have the data type `object`.

## Exploratory Data Analysis

Now, let's go through the churn column in our dataset.

In [None]:
# First five values of the Churn column.
customer_churn['Churn'].head(5)

Here we can see that `Churn` takes the values Yes or No. Yes means the customer churned and No means the customer did not churn. So our first task is to convert the Yes/No values to 1/0 so that it will be easier to do the further operations.

In [None]:
# Encode the churn label as an integer flag (1 = "Yes", 0 = "No") so that it can be
# averaged per group and used as the model target.
churn_codes = {"Yes": 1, "No": 0}

# Only convert while the column is still non-numeric, so re-running this cell is harmless.
if not pd.api.types.is_numeric_dtype(customer_churn['Churn']):
    # map + astype makes the integer dtype explicit instead of relying on replace()
    # silently downcasting an object column (deprecated behaviour in recent pandas).
    # Labels outside churn_codes map to NaN, which makes astype raise rather than
    # letting an unexpected label slip through.
    customer_churn['Churn'] = customer_churn['Churn'].map(churn_codes).astype('int64')

In [None]:
# First five values of the Churn column after the conversion above.
customer_churn['Churn'].head(5)

The values are now 0 and 1. Let's move on and check whether other columns can also be converted to a more appropriate format.

Let's replace the "No internet service" string with "No" in the columns below.

In [None]:
# Internet add-on columns in which 'No internet service' is folded into plain 'No'.
cols = ['OnlineBackup', 'StreamingMovies','DeviceProtection','TechSupport','OnlineSecurity','StreamingTV']

# Apply the replacement to all listed columns in one DataFrame-level call.
customer_churn[cols] = customer_churn[cols].replace({'No internet service':'No'})

Now the values in the above columns are either Yes or No.

The `TotalCharges` column contains some blank values (strings consisting only of a space). First we turn those blanks into missing values and drop the affected rows. The column is stored as type object, so after that we convert it to float.

In [None]:
# 'TotalCharges' is read as text because some entries are blank.
# pd.to_numeric(errors='coerce') converts every parseable entry to float and turns
# anything unparseable (blank strings of any length included) into NaN, so the
# conversion cannot fail on an unexpected value.
total_charges_numeric = pd.to_numeric(customer_churn['TotalCharges'], errors='coerce')

# Drop the rows without a usable 'TotalCharges' value and renumber the index from 0.
customer_churn = (
    customer_churn
    .assign(TotalCharges=total_charges_numeric)
    .dropna(subset=['TotalCharges'])
    .reset_index(drop=True)
)

In [None]:
# Confirm the data type of the converted 'TotalCharges' column.
customer_churn.dtypes['TotalCharges']

Here we successfully converted our object type data into float.

### Data Visualization

Let's visualize our data and understand it more clearly.


In [None]:
# Number of customers in each churn class, then the distinct count values as an array.
churn_class_counts = customer_churn['Churn'].value_counts()
churn_class_counts.unique()

### What percentage of customers have churned?

In [None]:
# Count the customers in each churn class once and reuse the result.
churn_counts = customer_churn['Churn'].value_counts()
churn_x = churn_counts.index.tolist()
churn_y = churn_counts.values.tolist()

# Readable sector names for the 0/1 churn flag; any other value falls back to its
# string form so the chart still renders.
churn_names = [{0: 'No churn', 1: 'Churn'}.get(code, str(code)) for code in churn_x]

# Donut chart of the churn split.
# `names` sets the sector labels; px's `labels` argument is a dict for renaming
# columns in titles/hovers and does not label the sectors. The data frame itself
# is not needed because names and values are passed directly.
fig = px.pie(
    names=churn_names,
    values=churn_y,
    color_discrete_sequence=['grey','teal'],
    hole=0.6,
)

fig.update_layout(
    title='Customer Churn',
    template='plotly_dark',
)
fig.show()

### Do male or female customers churn the most?

In [None]:
# Mean of the Churn flag per gender, i.e. the churn rate of each group.
gender_chunk = customer_churn.groupby('gender', as_index=False)['Churn'].mean()

# One bar per gender.
gender_bars = go.Bar(
    x=gender_chunk['gender'],
    y=gender_chunk['Churn'],
    textposition='auto',
    width=[0.2, 0.2],
    marker={'color': ['brown', 'purple']},
)
fig = go.Figure(data=[gender_bars])
fig.update_layout(
    title='Churn rate by Gender',
    xaxis_title="Gender",
    yaxis_title="Churn rate",
    template='plotly_dark',
)
fig.show()

### Does tech support play an important role?

In [None]:
# Mean of the Churn flag per TechSupport value, i.e. the churn rate of each group.
tech_chunk = customer_churn.groupby('TechSupport', as_index=False)['Churn'].mean()

# One bar per TechSupport value.
tech_bars = go.Bar(
    x=tech_chunk['TechSupport'],
    y=tech_chunk['Churn'],
    textposition='auto',
    width=[0.2, 0.2],
    marker={'color': ['midnightblue', 'darkgreen']},
)
fig = go.Figure(data=[tech_bars])
fig.update_layout(
    title='Churn rate by Tech Support',
    xaxis_title="Tech Support",
    yaxis_title="Churn rate",
    template='plotly_dark',
)
fig.show()

### Is the telecom company not satisfying its customers with its internet service?

In [None]:
# Mean of the Churn flag per InternetService value, i.e. the churn rate of each group.
internet_chunk = customer_churn.groupby('InternetService', as_index=False)['Churn'].mean()

# One bar per InternetService value.
internet_bars = go.Bar(
    x=internet_chunk['InternetService'],
    y=internet_chunk['Churn'],
    textposition='auto',
    width=[0.2, 0.2, 0.2],
    marker={'color': ['tomato', 'tan', 'cyan']},
)
fig = go.Figure(data=[internet_bars])
fig.update_layout(
    title='Churn rate by Internet Services',
    xaxis_title="Internet Services",
    yaxis_title="Churn rate",
    template='plotly_dark',
)
fig.show()

### Payment Method, Any role in Churning??

In [None]:
# Mean of the Churn flag per PaymentMethod value, i.e. the churn rate of each group.
payment_chunk = customer_churn.groupby('PaymentMethod', as_index=False)['Churn'].mean()

# One bar per payment method.
payment_bars = go.Bar(
    x=payment_chunk['PaymentMethod'],
    y=payment_chunk['Churn'],
    textposition='auto',
    width=[0.2, 0.2, 0.2, 0.2],
    marker={'color': ['teal', 'thistle', 'lime', 'navy']},
)
fig = go.Figure(data=[payment_bars])
fig.update_layout(
    title='Churn rate by Payment Method',
    xaxis_title="Payment Method Churns",
    yaxis_title="Churn rate",
    template='plotly_dark',
)
fig.show()

Here we can clearly see that customers who pay by electronic check churn more than the others.

### Do customers on long-term or short-term contracts churn the most?

In [None]:
# Mean of the Churn flag per Contract value, i.e. the churn rate of each group.
contract_chunk = customer_churn.groupby('Contract').Churn.mean().reset_index()

# One bar per contract type. The width list is derived from the number of groups
# so that it always has exactly one entry per bar (the hard-coded list had four
# entries for three colours).
fig = go.Figure(data=[go.Bar(
    x=contract_chunk['Contract'],
    y=contract_chunk['Churn'],
    textposition='auto',
    width=[0.2] * len(contract_chunk),
    marker=dict(color=['teal','thistle','purple']),
)])
fig.update_layout(
    title='Churn rate by Contract',
    xaxis_title="Contract Churns",
    yaxis_title="Churn rate",
    template='plotly_dark',
)
fig.show()

From the above bar chart we can clearly see that customers who have a monthly contract are more likely to churn than customers who signed a one-year or two-year contract.


In [None]:
# Mean of the Churn flag per tenure value, i.e. the churn rate at each tenure.
ten_chunk = customer_churn.groupby('tenure', as_index=False)['Churn'].mean()

# One marker per tenure value.
tenure_points = go.Scatter(
    x=ten_chunk['tenure'],
    y=ten_chunk['Churn'],
    mode='markers',
    name='Low',
    marker={'size': 5, 'line': {'width': 0.8}, 'color': 'blue'},
)
fig = go.Figure(data=[tenure_points])
fig.update_layout(
    title='Churn rate by Tenure',
    xaxis_title="Tenure",
    yaxis_title="Churn rate",
    template='plotly_dark',
)
fig.show()

Here we can see that a higher tenure is associated with a lower churn rate.

In [None]:
# Categorical columns to expand into indicator columns (order kept as-is, since it
# determines the order of the generated columns).
categorical_columns = [
    'Contract', 'Dependents', 'DeviceProtection', 'gender',
    'InternetService', 'MultipleLines', 'OnlineBackup',
    'OnlineSecurity', 'PaperlessBilling', 'Partner',
    'PaymentMethod', 'PhoneService', 'SeniorCitizen',
    'StreamingMovies', 'StreamingTV', 'TechSupport',
]

# drop_first=True omits the first level of every column.
churn_data = pd.get_dummies(customer_churn, columns=categorical_columns, drop_first=True)

In [None]:
# Preview the first five rows of the encoded frame.
churn_data.head(n=5)

## Modelling

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric columns to standardise so that they share a common scale.
columns_for_ft_scaling = ['tenure', 'MonthlyCharges', 'TotalCharges']

# NOTE(review): the scaler is fitted on the whole of churn_data, while the train/test
# split happens in a later cell, so test rows contribute to the fitted statistics.
# Consider fitting on the training rows only.
standard = StandardScaler()
scaled_values = standard.fit_transform(churn_data[columns_for_ft_scaling])

# Write the standardised values back over the original columns.
churn_data[columns_for_ft_scaling] = scaled_values


In [None]:
# Preview the first five rows after scaling.
churn_data.iloc[:5]

Here we can see that tenure, MonthlyCharges and TotalCharges are now on the same scale, and that the categorical columns have been expanded into indicator columns named after their values, e.g. Contract is expanded into Contract_One year and Contract_Two year. Let's see how many columns we have in our dataset.

In [None]:
# All column names of the encoded frame as a plain list.
churn_data.columns.tolist()

We can clearly see that we have added multiple columns for the individual values.

Now let's import the appropriate libraries and train our model.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score

# plot_confusion_matrix was removed from scikit-learn in version 1.2. Import it when
# it exists; otherwise provide a drop-in replacement so the cells that call it keep
# working on current releases.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of a fitted classifier.

        Compatibility stand-in for the removed ``sklearn.metrics.plot_confusion_matrix``.

        Parameters
        ----------
        estimator : fitted classifier
        X : feature matrix to predict on
        y_true : true labels for ``X``
        **kwargs : forwarded unchanged (e.g. ``cmap``) to
            ``ConfusionMatrixDisplay.from_estimator``

        Returns
        -------
        The ``ConfusionMatrixDisplay`` created by ``from_estimator``.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

Time to split our data into x and y.

In [None]:
# Target: the churn flag.
y = churn_data['Churn']

# Features: every remaining column except the target and the customer identifier.
X = churn_data.drop(columns=['Churn', 'customerID'])

Using the train_test_split() function to split our data into training and test sets. I will use 70% of our data for training our model and 30% for testing our model.

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
test_fraction = 0.3
split_seed = 0
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=split_seed,
)

In [None]:
# Shapes of the four pieces of the split, in the order X_train, y_train, X_test, y_test.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

We have successfully split our data into 70% and 30%. Now it's time to predict using different algorithms.

### Logistic Regression

----

<img src="https://static.javatpoint.com/tutorial/machine-learning/images/logistic-regression-in-machine-learning.png">

----

In [None]:
# Create the logistic-regression classifier with default settings and train it on
# the training split (fit returns the estimator itself).
log = LogisticRegression().fit(X_train, y_train)

# Predicted churn labels for the test split.
y_pred = log.predict(X_test)

In [None]:
# Display the labels predicted by the logistic-regression model for the test split.
y_pred

In [None]:
# Accuracy on the test split as a percentage. accuracy_score is symmetric in its two
# arguments, so passing (y_true, y_pred) in the documented order gives the same value.
log_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("The accuracy score of Logistic Regression is: {:.2f}% ".format(log_accuracy_pct))

### Random Forest

-----

<img src="https://static.javatpoint.com/tutorial/machine-learning/images/random-forest-algorithm2.png">

-----

In [None]:
# Random forest classifier. Tree construction is randomised, so the seed is fixed
# (same value as the train/test split) to make the reported accuracy and confusion
# matrix reproducible across runs.
rand = RandomForestClassifier(random_state=0)

# Train on the training split.
rand.fit(X_train,y_train)

# Predicted churn labels for the test split.
y_rand_pred = rand.predict(X_test)

In [None]:
# Display the labels predicted by the random forest model for the test split.
y_rand_pred

In [None]:
# Accuracy on the test split as a percentage. accuracy_score is symmetric in its two
# arguments, so passing (y_true, y_pred) in the documented order gives the same value.
rand_accuracy_pct = accuracy_score(y_test, y_rand_pred) * 100
print("The accuracy score of Random Forest is: {:.2f}% ".format(rand_accuracy_pct))

### Support Vector Machine

----

<img src="https://static.javatpoint.com/tutorial/machine-learning/images/support-vector-machine-algorithm.png">

----

In [None]:
# Linear-kernel support vector classifier, trained on the training split
# (fit returns the estimator itself).
# NOTE(review): probability=True enables predict_proba at extra training cost; no
# cell visible here calls svm_model.predict_proba -- confirm whether it is needed.
svm_model = SVC(kernel='linear', probability=True).fit(X_train, y_train)

# Predicted churn labels for the test split.
y_svm_pred = svm_model.predict(X_test)

In [None]:
# Display the labels predicted by the support vector machine for the test split.
y_svm_pred

In [None]:
# Accuracy on the test split as a percentage. accuracy_score is symmetric in its two
# arguments, so passing (y_true, y_pred) in the documented order gives the same value.
svm_accuracy_pct = accuracy_score(y_test, y_svm_pred) * 100
print("The accuracy score of Support Vector Machine is: {:.2f}% ".format(svm_accuracy_pct))

### Decision Tree

-----

<img src="https://static.javatpoint.com/tutorial/machine-learning/images/decision-tree-classification-algorithm.png">

-----

In [None]:
# Decision tree classifier. Split selection is randomised when candidate splits tie,
# so the seed is fixed (same value as the train/test split) to make the reported
# accuracy and confusion matrix reproducible across runs.
dec = DecisionTreeClassifier(random_state=0)

# Train on the training split.
dec.fit(X_train,y_train)

# Predicted churn labels for the test split.
y_dec_pred = dec.predict(X_test)

In [None]:
# Accuracy on the test split as a percentage. accuracy_score is symmetric in its two
# arguments, so passing (y_true, y_pred) in the documented order gives the same value.
dec_accuracy_pct = accuracy_score(y_test, y_dec_pred) * 100
print("The accuracy score of Decision Tree is: {:.2f}% ".format(dec_accuracy_pct))

### K-Nearest Neighbor

----

<img src = "https://static.javatpoint.com/tutorial/machine-learning/images/k-nearest-neighbor-algorithm-for-machine-learning2.png">

-----

In [None]:
# k-nearest-neighbours classifier with default settings, trained on the training
# split (fit returns the estimator itself).
knn = KNeighborsClassifier().fit(X_train, y_train)

# Predicted churn labels for the test split.
y_knn_pred = knn.predict(X_test)

In [None]:
# Display the labels predicted by the k-nearest-neighbours model for the test split.
y_knn_pred

In [None]:
# Accuracy on the test split as a percentage. accuracy_score is symmetric in its two
# arguments, so passing (y_true, y_pred) in the documented order gives the same value.
knn_accuracy_pct = accuracy_score(y_test, y_knn_pred) * 100
print("The accuracy score of K-Nearest Neighbor is: {:.2f}% ".format(knn_accuracy_pct))

Here we can see that logistic regression and the support vector machine perform better than the other models.

### Confusion matrix

----

<img src="https://www.websystemer.no/wp-content/uploads/2020/05/understanding-confusion-matrix-and-applying-it-on-knn-classifier-on-iris-dataset-800x418.png">

----

So let's plot the confusion matrix for each of our models to see how well they perform.

In [None]:
# Confusion matrix of the logistic-regression model on the test split.
# Binding the returned display object keeps its repr out of the cell output.
print('Logistic Regression Model')
log_confusion = plot_confusion_matrix(log, X_test, y_test)

In [None]:
# Confusion matrix of the support vector machine on the test split.
# Binding the returned display object keeps its repr out of the cell output.
print("Support Vector Machine Model")
svm_confusion = plot_confusion_matrix(svm_model, X_test, y_test, cmap='summer')

In [None]:
# Confusion matrix of the random forest model on the test split.
# Binding the returned display object keeps its repr out of the cell output.
print("Random Forest Model")
rand_confusion = plot_confusion_matrix(rand, X_test, y_test, cmap='cividis')

In [None]:
# Confusion matrix of the k-nearest-neighbours model on the test split.
# Binding the returned display object keeps its repr out of the cell output.
print("K-Nearest Neighbor Model")
knn_confusion = plot_confusion_matrix(knn, X_test, y_test, cmap='magma')

In [None]:
# Confusion matrix of the decision tree model on the test split.
# Binding the returned display object keeps its repr out of the cell output.
print("Decision Tree Model")
dec_confusion = plot_confusion_matrix(dec, X_test, y_test, cmap='RdPu')

Let's predict the probability of churn for each customer using logistic regression model.

In [None]:
# Score every customer with the logistic-regression model, using the same feature
# columns (in the same order) that the model was evaluated on.
feature_columns = X_test.columns
churn_probability = log.predict_proba(churn_data[feature_columns])[:, 1]

# Column 1 of predict_proba is the probability of the positive class (Churn == 1).
# NOTE: this adds a column to churn_data; re-running the earlier X/y cell without
# restarting the kernel would then include this column among the features.
churn_data['Customer Churning Probability'] = churn_probability

Here I have created a new column in our dataset named Customer Churning Probability.

In [None]:
# Show the customer ID next to its predicted churn probability for the first ten rows.
probability_view_columns = ['customerID', 'Customer Churning Probability']
churn_data[probability_view_columns].head(10)

Here we can clearly see, for each customer, the predicted probability that they will churn.