In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the credit-card dataset from the input directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../input/CC GENERAL.csv")

In [None]:
# Preview the first five rows of the raw data.
data.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes for the loaded frame.
data.info()

In [None]:
# Summary statistics for the columns of the loaded frame.
data.describe()

# Missing Values

In [None]:
# Number of missing entries in each column.
data.isnull().sum()

In [None]:
# Replace missing values with the mean of their column.
# CUST_ID has not been dropped yet at this point, so restrict the mean to
# numeric columns: on pandas >= 2.0 a plain data.mean() raises TypeError
# as soon as the frame contains a non-numeric column.
data = data.fillna(data.mean(numeric_only=True))

# Re-count missing values to confirm the fill.
data.isna().sum()

In [None]:
# Remove CUST_ID (presumably a customer identifier, not a feature) before
# the numeric analysis. Reassigning with errors='ignore' instead of using
# inplace=True keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
data = data.drop(columns='CUST_ID', errors='ignore')

In [None]:
# Quick look at the first two rows after the cleaning steps.
data.head(n=2)

# Data Exploration

In [None]:
# Data type of each remaining column.
data.dtypes

In [None]:
# Count of distinct values per column.
data.nunique()

In [None]:
# Distinct-value counts for the three columns plotted as histograms below.
data.loc[:, ['CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'TENURE']].nunique()

# Correlation Plot

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
corr_matrix = data.corr()

plt.figure(figsize=(12, 12))
sns.heatmap(
    corr_matrix,
    annot=True,
    xticklabels=data.columns,
    yticklabels=data.columns,
)

In [None]:
# Scatter-plot matrix of every column against every other column.
sns.pairplot(data=data)

In [None]:
# One histogram per column, stacked vertically in a single figure.
fig, axes = plt.subplots(ncols=1, nrows=3)

# (column, number of bins) for each panel, top to bottom.
hist_specs = [
    ('CASH_ADVANCE_TRX', 65),
    ('PURCHASES_TRX', 173),
    ('TENURE', 7),
]

for ax, (column, n_bins) in zip(axes.flatten(), hist_specs):
    ax.hist(data[column], n_bins, histtype='bar', stacked=True)
    ax.set_title(column)

fig.tight_layout()

# Feature Generation

In [None]:
# Work on an independent copy so `data` keeps its untransformed values.
features = data.copy(deep=True)

# Column names available for feature engineering.
features.columns.tolist()

In [None]:
# Columns that get a log(1 + x) transform.
cols = ['BALANCE',
        'PURCHASES',
        'ONEOFF_PURCHASES',
        'INSTALLMENTS_PURCHASES',
        'CASH_ADVANCE',
        'CASH_ADVANCE_TRX',
        'PURCHASES_TRX',
        'CREDIT_LIMIT',
        'PAYMENTS',
        'MINIMUM_PAYMENTS']

# Read the inputs from `data` (which is never log-transformed) rather than
# from `features` itself, so re-running this cell does not apply the log a
# second time. np.log1p is the numerically accurate form of log(1 + x).
features[cols] = np.log1p(data[cols])
features.head()

In [None]:
# Show `data` again for comparison: the log transform was written to the
# `features` copy, so these values are on the original scale.
data.head()

In [None]:
# Summary statistics of the feature frame after the log transform.
features.describe()

In [None]:
# Box plot of every feature column, used to spot outliers visually.
features.boxplot(figsize=(30, 10), rot=90)

# Clustering using Kmeans

### Elbow method to find the appropriate number of clusters

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Build the clustering matrix from the feature columns only. CLUSTER_INDEX is
# added to `features` by a later cell; dropping it here (if present) keeps
# this cell safe to re-run without leaking old labels into X.
X = np.array(features.drop(columns='CLUSTER_INDEX', errors='ignore'))

# [k, inertia] pairs, one per candidate number of clusters.
sumOfSqrdDist = []
K = range(1,15)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans = kmeans.fit(X)
    sumOfSqrdDist.append([k, kmeans.inertia_])

# Plot only the inertia values. Passing the [k, inertia] pairs as y makes
# matplotlib treat them as two series and draws a spurious k-vs-k line.
plt.plot(K, [inertia for _, inertia in sumOfSqrdDist], 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of Squared Distances')
plt.title('Elbow method for optimal k')
plt.show();

In [None]:
# Display the [k, inertia] pairs collected by the elbow loop.
sumOfSqrdDist

In [None]:
# Fit the final K-Means model with the chosen number of clusters.
n_cluster = 11
clustering = KMeans(random_state=42, n_clusters=n_cluster).fit(X)
cluster_labels = clustering.labels_

# Cluster sizes: one unit-width bin per cluster id.
plt.hist(cluster_labels, bins=range(n_cluster + 1))
plt.title("# Customers per cluster")
plt.xlabel("Cluster")
plt.ylabel(" # Customers")
plt.show()

# Attach the assigned cluster to both the transformed and the original frame.
for frame in (features, data):
    frame['CLUSTER_INDEX'] = cluster_labels

In [None]:
# Centroids of the chosen model. `kmeans` at this point is the last model
# left over from the elbow loop (k=14), not the n_cluster=11 fit that
# produced CLUSTER_INDEX, so read the centers from `clustering`.
clustering.cluster_centers_

### Dendrograms

In [None]:
from scipy.cluster.hierarchy import ward,dendrogram,linkage
# Print arrays with 4 decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=4)

In [None]:
# Hierarchical clustering linkage matrix using Ward's method.
distance = linkage(X, method='ward')

In [None]:
# Draw the full dendrogram with a horizontal reference line at height 98.
plt.figure(figsize=(20, 10))
plt.title("Hierarchical Clustering Dendogram")
plt.xlabel("Index")
plt.ylabel("Ward's Distance")
dendrogram(distance, leaf_font_size=9, leaf_rotation=90)
plt.axhline(y=98, color='k')

### Clusters by distance

In [None]:
from scipy.cluster.hierarchy import fcluster

# Cut the tree at a fixed linkage distance and show the flat cluster ids.
max_d = 97
clusters = fcluster(distance, t=max_d, criterion='distance')
clusters

In [None]:
# Cut the same tree into at most k flat clusters.
k = 11  # same count as n_cluster used for the K-Means model
clusters = fcluster(distance, t=k, criterion='maxclust')

# Scatter of the first two columns of X, coloured by hierarchical cluster.
plt.figure(figsize=(10, 8))
plt.scatter(x=X[:, 0], y=X[:, 1], c=clusters);

# Silhouette score

In [None]:
from sklearn.metrics import silhouette_score

# NOTE: despite its name, this list holds [k, silhouette score] pairs, not
# squared errors. The name is kept because the plotting cell reads it.
sumOfSquaredErrors = []
for k in range(2,30):
    # Fixed seed, matching the other KMeans fits in this notebook, so the
    # silhouette curve is reproducible from run to run.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    sumOfSquaredErrors.append([k,silhouette_score(X, kmeans.labels_)])

In [None]:
# Silhouette score for each candidate k. Build the frame once (it was built
# twice before) and label the figure so it can be read on its own.
silhouette_df = pd.DataFrame(sumOfSquaredErrors, columns=['k', 'silhouette'])

plt.plot(silhouette_df['k'], silhouette_df['silhouette'])
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.title('Silhouette score by number of clusters')
plt.show()

# Evaluation

In [None]:
# Preview the original-scale data, which now carries the CLUSTER_INDEX
# column added after the K-Means fit.
data.head()

In [None]:
# Number of distinct cluster ids actually assigned.
data['CLUSTER_INDEX'].nunique()

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
# There is no ground truth here and cluster ids are arbitrary, so a
# classification report (true vs. predicted classes) is not meaningful.
# The previous call also compared the n_cluster=11 labels with `kmeans`,
# which at this point is the leftover k=29 model from the silhouette loop.
# Score the chosen model with an internal metric instead.
from sklearn.metrics import silhouette_score

print("Silhouette score (k=%d): %.4f"
      % (clustering.n_clusters, silhouette_score(X, clustering.labels_)))