In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _subfolders, files in os.walk(input_root):
    for file_name in files:
        print(os.path.join(folder, file_name))

**Importing and Reading**
As usual, we start by importing the basic libraries and reading the data.

In [None]:
import pandas as pd
import numpy as np

# Read the Telco customer churn CSV into a DataFrame and preview its first rows.
csv_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(csv_path)
data.head()

**Data Preprocessing**

First of all, we will drop the unnecessary columns and the target column from our data.

In [None]:
# Remove the identifier, the target and the TotalCharges column from the frame.
# A non-inplace drop with errors='ignore' keeps the cell idempotent: running it a
# second time no longer raises KeyError for columns that are already gone.
data = data.drop(columns=['customerID', 'Churn', 'TotalCharges'], errors='ignore')
data.head()

Exploratory data analysis is already covered in [this notebook](http://www.kaggle.com/vedanth777/telecom-customer-churn-data-preprocessing), so we will move straight on to preprocessing, starting with label encoding.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Label-encode every column that is not of an int/float dtype.
le = LabelEncoder()
cat_cols = data.select_dtypes(exclude=['int', 'float']).columns
enc_data = cat_cols.tolist()
# The single encoder is re-fitted on each column in turn, so every column
# receives its own independent set of integer codes.
data[enc_data] = data[enc_data].apply(le.fit_transform)
data[enc_data].head()

Now that the categorical columns are encoded, we need to scale the numeric columns in order to keep the features comparable. Otherwise, a feature with a wider numeric range may have more influence on the prediction simply because of that range, and not because it actually contributes more to predicting the target.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise all columns of `data`; `datas` is the scaled array and
# `data_df` wraps it in a DataFrame (with default integer column labels).
scale = StandardScaler()
datas = scale.fit(data).transform(data)
data_df = pd.DataFrame(data=datas)
data_df.head()

**PCA for Visualization**

Since we have so many features, visualizing the data directly would require just as many dimensions. Since we are all humans (considering Kaggle already did a captcha) and can only visualize things in 2D or 3D at most, we need to bring the features down to 2D. This dimensionality reduction is possible by applying Principal Component Analysis (PCA) to the data.

In [None]:
from sklearn.decomposition import PCA

# Project the scaled feature array onto its first two principal components.
pc_columns = ['principal component 1', 'principal component 2']
pca = PCA(n_components=2)
pc = pca.fit_transform(datas)

pdf = pd.DataFrame(pc, columns=pc_columns)
pdf.head()

**Plots**

The first graph we are going to plot shows the churned and existing customers. The next sets of graphs show the clustering of different types of customers, including both existing and churned ones.

In [None]:
# Load only the 'Churn' column of the raw CSV as a one-column DataFrame.
sim_data = pd.read_csv(
    '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv',
    usecols=['Churn'],
)
sim_data.head()
# sim_data=pd.DataFrame(le.fit_transform(sim_data),columns=['Churn'])
# sim_data.head()

In [None]:
# Put the two principal components and the Churn label side by side.
# Note: this rebinds `data`, which previously held the feature frame.
data = pd.concat([pdf, sim_data], axis='columns')
data.head()

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the two principal components, one colour per Churn value.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = ['Yes', 'No']
colors = ['g', 'r']
for label, shade in zip(targets, colors):
    # Keep only the rows whose Churn value matches the current label.
    rows = data[data['Churn'] == label]
    ax.scatter(
        rows['principal component 1'],
        rows['principal component 2'],
        c=shade,
        s=50,
    )
ax.legend(targets)
ax.grid()

In [None]:
# Reload the raw CSV and drop the identifier, target and TotalCharges columns,
# restoring `data` to the feature frame after it was reused for plotting.
data = (
    pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
    .drop(['customerID', 'Churn', 'TotalCharges'], axis=1)
)
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Label-encode the non-numeric columns of the freshly reloaded frame.
le = LabelEncoder()
cat_cols = data.select_dtypes(exclude=['int', 'float']).columns
enc_data = list(cat_cols)
data[enc_data] = data[enc_data].apply(lambda col: le.fit_transform(col))

# Standardise every feature so that wide-ranging columns do not dominate the
# Euclidean distances used by KMeans and the silhouette score.
scale = StandardScaler()
datas = scale.fit_transform(data)
data_df = pd.DataFrame(datas)

# Silhouette sweep over candidate cluster counts. Clustering and scoring are
# done on the scaled array `datas` (the original used the unscaled `data`,
# leaving the scaling step above without effect on the clusters).
cluster_range = range(2, 15)
score_list = []
for n_clusters in cluster_range:
    # A fixed seed makes the scores reproducible across runs.
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    preds = clusterer.fit_predict(datas)

    score = silhouette_score(datas, preds, metric='euclidean')
    score_list.append(score)
    print("For n_clusters = {}, silhouette score is {}".format(n_clusters, score))

plt.bar(cluster_range, score_list)
plt.xlabel('n_clusters')
plt.ylabel('silhouette score')
plt.show()

In [None]:
# Fit the final 4-cluster KMeans model on the standardised features `datas`
# (the original fitted on the unscaled `data`, so large-range columns dominated
# the distances). The fixed seed keeps the cluster labels reproducible.
model = KMeans(n_clusters=4, random_state=42)
model.fit(datas)
print(model.labels_)

In [None]:
# Attach the KMeans cluster label of every row to its 2D projection in `pdf`.
target = pd.DataFrame({'target': model.labels_})
data = pd.concat([pdf, target], axis='columns')
data.head()

In [None]:
import matplotlib.pyplot as plt

# Scatter plot of the 2D projection, one colour per KMeans cluster label.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('Visualizing', fontsize=20)

targets = [0, 1, 2, 3]
colors = ['r', 'g', 'b', 'k']
for cluster_id, shade in zip(targets, colors):
    # Keep only the rows assigned to the current cluster.
    rows = data[data['target'] == cluster_id]
    ax.scatter(
        rows['principal component 1'],
        rows['principal component 2'],
        c=shade,
        s=50,
    )
ax.legend(targets)
ax.grid()
# targets=[0,1,2,3]
# colors=['red','green','blue','black']
# for t,c in zip(targets,colors):
#     keep=data['target']==t
#     plt.scatter(data.loc[keep,'principal component 1'],
#                 data.loc[keep,'principal component 2'],c=colors,s=50)
# plt.legend(targets)
# plt.show()

In [None]:
from sklearn.decomposition import KernelPCA

# Project the scaled features to 2D with an RBF-kernel PCA.
# This rebinds `pca`, `pc` and `pdf`, replacing the linear PCA results.
kpca_columns = ['principal component 1', 'principal component 2']
pca = KernelPCA(kernel='rbf', n_components=2)
pc = pca.fit_transform(datas)

pdf = pd.DataFrame(pc, columns=kpca_columns)
pdf.head()

In [None]:
# Combine the KernelPCA projection in `pdf` with the Churn labels and plot it,
# one colour per Churn value.
data = pd.concat([pdf, sim_data], axis=1)

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
# `pdf` holds the RBF KernelPCA projection at this point, so the title says so
# (the original reused the linear-PCA title, making the two figures look alike).
ax.set_title('2 component Kernel PCA (RBF)', fontsize=20)
targets = ['Yes', 'No']
colors = ['g', 'r']
for target, color in zip(targets, colors):
    indicesToKeep = data['Churn'] == target
    ax.scatter(data.loc[indicesToKeep, 'principal component 1'],
               data.loc[indicesToKeep, 'principal component 2'],
               c=color,
               s=50)
ax.legend(targets)
ax.grid()
# targets=['Yes','No']
# colors=['red','green']
# for t,c in zip(targets,colors):
#     keep=data['Churn']==t
#     plt.scatter(data.loc[keep,'principal component 1'],
#                 data.loc[keep,'principal component 2'],c=colors,s=50)
# plt.legend(targets)
# plt.show()

In [None]:
# Attach the KMeans cluster label of every row to the current projection in `pdf`.
target = pd.DataFrame({'target': model.labels_})
data = pd.concat([pdf, target], axis='columns')
data.head()

In [None]:
# Scatter plot of the KernelPCA projection, one colour per KMeans cluster label.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
# The plotted components come from the RBF KernelPCA projection, not from
# linear PCA, so the title is corrected to match.
ax.set_title('2 component Kernel PCA (RBF)', fontsize=20)
targets = [0, 1, 2, 3]
colors = ['r', 'g', 'b', 'k']
for target, color in zip(targets, colors):
    indicesToKeep = data['target'] == target
    ax.scatter(data.loc[indicesToKeep, 'principal component 1'],
               data.loc[indicesToKeep, 'principal component 2'],
               c=color,
               s=50)
ax.legend(targets)
ax.grid()


# targets=[0,1,2,3]
# colors=['red','green','blue','black']
# for t,c in zip(targets,colors):
#     keep=data['target']==t
#     plt.scatter(data.loc[keep,'principal component 1'],
#                 data.loc[keep,'principal component 2'],c=colors,s=50)
# plt.legend(targets)
# plt.show()