In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the Cell2Cell training CSV and fill every missing value, in every
# column, with 0.
# NOTE(review): fillna(0) also applies to text columns, where an integer 0
# becomes one more category value -- confirm this is intended.
data = pd.read_csv('../input/datasets-for-churn-telecom/cell2celltrain.csv').fillna(0)

In [None]:
# Partition the columns of `data` into three groups:
#   binary_cols      - text/category columns with exactly two distinct values
#   categorical_cols - the remaining text/category columns
#   numerical_cols   - every column that is not text/category
categorical_data = data.select_dtypes(include=['object', 'category'])
text_col_names = list(categorical_data.columns)

binary_cols = [name for name in text_col_names if data[name].nunique() == 2]
categorical_cols = [name for name in text_col_names if name not in binary_cols]
numerical_cols = [name for name in data.columns if name not in categorical_data.columns]

# 'Churn' is taken out of the binary group so it is not encoded with the features.
binary_cols.remove('Churn')
print('binary columns: ', binary_cols)

# 'ServiceArea' is excluded from the categorical group.
categorical_cols.remove('ServiceArea')
print('categorical columns: ', categorical_cols)

# 'CustomerID' is excluded from the numerical group.
numerical_cols.remove('CustomerID')
print('numerical columns: ', numerical_cols)

In [None]:
# Split target from features: `ytrain` is the 'Churn' column, `xtrain` is
# everything else minus the 'CustomerID' and 'ServiceArea' columns.
ytrain = data['Churn']
xtrain = data.drop(columns=['Churn', 'CustomerID', 'ServiceArea'])

# One sub-frame per column group so each group can be transformed separately.
xtr_numerical = xtrain[numerical_cols]
xtr_cat = xtrain[categorical_cols]
xtr_binary = xtrain[binary_cols]

# Uncomment to lift pandas' display limits when inspecting wide/long frames.
# pd.options.display.max_columns = None
# pd.options.display.max_rows = None

In [None]:
# Integer-encode every two-valued text column (LabelEncoder maps the two
# distinct values to 0 and 1 in sorted order).
labelencoder = LabelEncoder()

# xtr_binary was sliced out of xtrain; take an explicit copy so the column
# assignments below write to an independent frame instead of a view/slice
# (which raises SettingWithCopyWarning and may not persist).
xtr_binary = xtr_binary.copy()

for col in binary_cols:
    # fit_transform returns a plain array, which pandas assigns by position.
    # Wrapping it in a DataFrame (as was done before) makes the assignment
    # align on index labels and silently yields NaN whenever the frame's
    # index is not exactly 0..n-1.
    xtr_binary[col] = labelencoder.fit_transform(xtr_binary[col])

xtr_binary.head(10)

In [None]:
# One-hot encode the multi-valued categorical columns: each distinct value
# of each column becomes its own indicator column.
one_hot_df = pd.get_dummies(data=xtr_cat)

# Preview the first five encoded rows.
one_hot_df.head(5)

In [None]:
# Standardise the numerical columns (StandardScaler centres each column and
# scales it to unit variance), then wrap the resulting array back into a
# DataFrame carrying the original column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(xtr_numerical)
numerical_df = pd.DataFrame(scaled_values, columns=numerical_cols)

# Per-column count of missing values after scaling.
numerical_df.isna().sum()

In [None]:
# Assemble the final feature matrix by placing the three transformed groups
# side by side. Column-wise concat aligns rows on the index labels.
feature_parts = [numerical_df, xtr_binary, one_hot_df]
xtrain_final = pd.concat(objs=feature_parts, axis='columns')

xtrain_final.head(5)

In [None]:
# PCA projection. A fractional n_components asks PCA to keep as many leading
# components as are needed to explain that share (here 80%) of the variance.
pca = PCA(n_components=0.80)
principalComponents = pca.fit_transform(xtrain_final.to_numpy())

# (n_rows, n_components_kept)
print(principalComponents.shape)

# Wrap the projected coordinates in a DataFrame (integer column labels).
PCDF = pd.DataFrame(principalComponents)
PCDF.head(5)

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

# Draw a Ward-linkage dendrogram for a subset of the PCA-projected rows.
# Only the first 1000 rows are used, presumably to keep the linkage
# computation and the plot manageable.
# The slice starts at row 0: the earlier `[1:1000]` skipped the first row and
# produced 999 samples instead of 1000.
dendrogram_sample = PCDF.values[:1000]

plt.figure(figsize=(10, 7))
dendogram = dendrogram(linkage(dendrogram_sample, "ward"))
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (hierarchical) clustering of the PCA-projected rows into two
# clusters using Ward linkage.
# Ward linkage only works with Euclidean distance, which is also the default,
# so the distance argument is omitted. The old `affinity=` keyword was renamed
# to `metric=` and then removed in newer scikit-learn releases; leaving it out
# keeps the cell working on both old and new versions.
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')

# fit_predict returns one cluster id per row. Its result was previously
# discarded and the labels were read from `kmeans`, a name this notebook
# never defines.
cluster_labels = cluster.fit_predict(PCDF)

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

# Compare the two-cluster assignment against the churn labels.

# Integer-encode the target the same way the binary features are encoded
# (LabelEncoder maps the sorted distinct values to 0, 1, ...).
# `ytrain_binary` was previously used without ever being defined, so this
# cell failed with NameError on a fresh kernel.
ytrain_binary = LabelEncoder().fit_transform(ytrain)

# Attach the cluster id of every row to the projected data.
PCDF_K2 = PCDF.assign(Cluster=cluster_labels)
ypredicted = PCDF_K2['Cluster']

# NOTE(review): cluster ids come from an unsupervised model, so cluster 1 is
# not guaranteed to mean "churn" -- check the confusion matrix both ways
# before reading the scores below as classification quality.
print(confusion_matrix(ytrain_binary, ypredicted))
print(precision_recall_fscore_support(ytrain_binary, ypredicted, average='macro'))

from sklearn.metrics import classification_report
print(classification_report(ytrain_binary, ypredicted))