In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

**Importing Dependencies and Reading Data**

In [None]:
import pandas as pd
import numpy as np

# Read the Telco customer churn CSV into a DataFrame and preview the first rows.
TELCO_CSV_PATH = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(TELCO_CSV_PATH)
data.head()

The data has 7043 rows, one per surveyed customer, and 21 columns. One of these columns is the target variable (`Churn`), which leaves 20 other columns (including the customer ID) as potential inputs.

In [None]:
# Display the frame's dimensions as a (rows, columns) tuple.
data.shape

The customer ID and the target variable (`Churn`) columns must not be part of the feature data used for modelling, so they are dropped here.

In [None]:
# Remove the identifier and the target column so that only feature columns remain.
# `errors='ignore'` keeps this cell safe to re-run: on a second execution the
# columns are already gone and the drop is a no-op instead of raising KeyError.
data = data.drop(columns=['customerID', 'Churn'], errors='ignore')
data.head()

Check whether the data contains any NULL values, so that no errors are thrown later when fitting the model to the data.

In [None]:
# Count the missing (null) entries in each column and print the per-column totals.
null_counts = data.isnull().sum()
print(null_counts)

Check which unique values exist in the `gender` column.

In [None]:
# List the distinct values present in the gender column.
data['gender'].unique()

In [None]:
# List the distinct values present in the PaymentMethod column.
data['PaymentMethod'].unique()

Here, I have chosen to encode the categorical variables directly in numeric form.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the gender labels with integer codes and preview the result.
# `le` stays a notebook-level name because a later cell reuses the same encoder.
le = LabelEncoder()
data = data.assign(gender=le.fit_transform(data['gender']))
data.head()

In [None]:
# Keep a handle on the current column index and display it.
cols=data.columns
cols

In [None]:
# Names of the columns whose dtype is neither int nor float,
# i.e. the columns that have not been turned into numbers yet.
cat_cols=data.select_dtypes(exclude=['int','float']).columns
cat_cols

In [None]:
# Show the dtype of every column.
data.dtypes

In [None]:
# Columns to label-encode: every non-numeric column except TotalCharges, which
# is handled separately (a numeric conversion is attempted on it in a later cell).
# Excluding it by name, rather than slicing off the last element, keeps the
# selection correct even if the column order changes.
enc_data = [col for col in cat_cols if col != 'TotalCharges']
enc_data

In [None]:
# Label-encode each selected column independently (the encoder is re-fitted per
# column), write the integer codes back into the frame and preview them.
encoded_columns = data[enc_data].apply(le.fit_transform)
data[enc_data] = encoded_columns
data[enc_data].head()

In [None]:
# Preview the first rows of the full frame.
data.head()

In [None]:
# Display the frame's dimensions as a (rows, columns) tuple.
data.shape

In [None]:
# Show the dtype of every column.
data.dtypes

In [None]:
# Attempt to turn TotalCharges into a numeric column; if the conversion fails,
# show the error message instead of stopping the notebook.
try:
    data['TotalCharges'] = data['TotalCharges'].pipe(pd.to_numeric)
except Exception as conversion_error:
    print(conversion_error)

The `TotalCharges` column raised an error during numeric conversion that I was not able to resolve. Hence I decided to drop the column for the time being.

In [None]:
# `Series.empty` is True only when the Series has zero elements, so it cannot
# reveal blank entries inside the column. Count the values that are empty or
# whitespace-only strings instead.
data['TotalCharges'].astype(str).str.strip().eq('').sum()

In [None]:
# Drop TotalCharges from the feature frame. `errors='ignore'` makes the cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
data = data.drop(columns=['TotalCharges'], errors='ignore')

We fit the KMeans clustering algorithm to the data in order to determine how many different types of customers there are in the dataset.

In [None]:
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
# Elbow method, distortion variant: for each candidate cluster count, fit KMeans
# and record the mean Euclidean distance from every sample to its nearest centroid.
distortions = []
mapping1 = {}
k = range(1, 15)
for i in k:
    # Fixed seed so the curve is the same on every run.
    kmeanModel = KMeans(n_clusters=i, random_state=42)
    kmeanModel.fit(data)
    # Compute the distance matrix once and reuse the result for both containers.
    nearest_centroid_dist = cdist(data, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortion = nearest_centroid_dist.mean()
    distortions.append(distortion)
    mapping1[i] = distortion
for key, val in mapping1.items():
    print(str(key)+' : '+str(val))

In [None]:
import matplotlib.pyplot as plt
# Elbow plot: distortion against the number of clusters.
fig, ax = plt.subplots()
ax.plot(k, distortions, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method using Distortion')
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Elbow method, inertia variant: record KMeans' within-cluster sum of squared
# distances (`inertia_`) for each candidate cluster count.
Sum_of_squared_distances = []
mapp = {}
K = range(1, 15)
# The loop variable is deliberately not called `k`: the distortion plot cell
# reads a range named `k`, and rebinding it to an int here would break that
# cell if it were re-run afterwards.
for n_clusters in K:
    # Fixed seed so the curve is the same on every run.
    km = KMeans(n_clusters=n_clusters, random_state=42).fit(data)
    Sum_of_squared_distances.append(km.inertia_)
    mapp[n_clusters] = km.inertia_
for key, val in mapp.items():
    print(str(key)+' : '+str(val))

In [None]:
# Elbow plot: within-cluster sum of squared distances against the number of clusters.
fig, ax = plt.subplots()
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Sum_of_squared_distances')
ax.set_title('Elbow Method For Optimal k')
plt.show()

The elbow method is a technique for finding the optimal number of clusters to fit to the data.
Here I will mention this amazing [blog post](https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/) by GeeksforGeeks, which clearly explains the elbow method in detail along with code.

In [None]:
from sklearn.metrics import silhouette_score
# Print the silhouette score for every candidate cluster count in K.
# silhouette_score needs at least two distinct labels, so cluster counts below 2
# are skipped explicitly. Previously the first iteration (one cluster) raised,
# and the surrounding try/except ended the whole loop before any score was printed.
for n_clusters in K:
    if n_clusters < 2:
        continue
    # Fixed seed so the scores are the same on every run.
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    preds = clusterer.fit_predict(data)

    score = silhouette_score(data, preds, metric='euclidean')
    print("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))

In [None]:
# Collect the silhouette score for 2..14 clusters; `score_list` feeds the bar
# chart in a later cell.
score_list = []
for n_clusters in range(2, 15):
    # Fixed seed so the scores (and the k that maximises them) are reproducible.
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    # fit_predict returns the training labels directly, without a second pass
    # over the same data.
    preds = clusterer.fit_predict(data)

    score = silhouette_score(data, preds, metric='euclidean')
    score_list.append(score)
    print("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))

This indicates that 4 is the best number of clusters,
since k=4 gives the maximum silhouette score.

In [None]:
# Bar chart of the silhouette score per cluster count. The scores were collected
# starting at 2 clusters, so the x positions are derived from the list length
# rather than repeating the hard-coded range.
cluster_counts = range(2, 2 + len(score_list))
plt.bar(cluster_counts, score_list)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette score')
plt.title('Silhouette score for each k')
plt.show()

In [None]:
# Fit the final 4-cluster model and print the cluster label assigned to each row.
# The seed is fixed so the cluster numbering is the same on every run.
model = KMeans(n_clusters=4, random_state=42)
model.fit(data)
print(model.labels_)

These are the encoded labels corresponding to the types of customers found in the data. Using this information we can design different business strategies or ads targeting each type.