In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input tree and print the full path of every file,
# in the same order os.walk visits them.
kaggle_input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the credit-card dataset from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/ccdata/CC GENERAL.csv")

# Print a quick structural overview: shape, column names, total number of
# missing cells, and the number of distinct values per column.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

overview = [
    ("Rows     : ", n_rows),
    ("Columns  : ", n_cols),
    ("\nFeatures : \n", list(df.columns)),
    ("\nMissing values :  ", df.isna().sum().sum()),
    ("\nUnique values :  \n", df.nunique()),
]
for label, value in overview:
    print(label, value)

In [None]:
# Show each column's dtype and non-null count, to spot missing values and
# non-numeric columns before modelling.
df.info()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Create our correlation matrix.
# Only numeric columns are used: since pandas 2.0, DataFrame.corr() raises on
# non-numeric columns instead of silently dropping them, and the raw frame still
# carries the CUST_ID identifier at this point (presumably a string - confirm).
corr = df.select_dtypes(include='number').corr()

# Generate our correlation plot (heatmap).
plt.figure(figsize=(10, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# vmax=.3 caps the colour scale, so every correlation above 0.3 is drawn in the
# same end colour; center=0 puts the neutral colour at zero correlation.
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .82},
)
plt.title('Heatmap of Correlation Matrix')
plt.show()

In [None]:
# Plot a histogram of every numeric column to get a feel for each
# feature's distribution (one subplot per column).

df.hist(figsize=(20,12))

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Let's try arbitrarily 6 clusters
from sklearn import cluster, tree, decomposition

# Drop the customer identifier and incomplete rows.
# errors='ignore' keeps the cell re-runnable after CUST_ID is already gone.
df = df.drop(columns=['CUST_ID'], errors='ignore').dropna()

# Model only on the original features. The columns this notebook derives itself
# (cluster label, PCA coordinates) are excluded so a re-run never feeds its own
# output back into the models.
features = df.drop(columns=['cluster', 'x', 'y'], errors='ignore')

# random_state is fixed so the cluster assignment is reproducible; n_init is
# pinned because scikit-learn's default changed from 10 to 'auto' in 1.4.
km = cluster.KMeans(n_clusters=6, max_iter=300, n_init=10, random_state=42)
df['cluster'] = km.fit_predict(features)

# Principal Component Analysis for visualization.
# Fit ONCE and take both coordinates from the same projection. Fitting a second
# time on a frame that already contains 'x' would give a 'y' from a different
# PCA than the one that produced 'x'.
pca = decomposition.PCA(n_components=2, whiten=True)
components = pca.fit_transform(features)
df['x'] = components[:, 0]
df['y'] = components[:, 1]

plt.scatter(df['x'], df['y'], c=df['cluster'])
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Evaluate k on the original features only. The 'cluster', 'x' and 'y' columns
# added by the earlier clustering cell are derived from a previous model and
# would bias the inertia curve; errors='ignore' covers the case where they
# have not been created yet.
features = df.drop(columns=['cluster', 'x', 'y'], errors='ignore')

Sum_of_squared_distances = []

# Use k from 1 to 15 (range's upper bound is exclusive, hence 16)
K = range(1, 16)
for k in K:
    # Fixed random_state / n_init so the curve is reproducible between runs.
    km = KMeans(n_clusters=k, max_iter=300, n_init=10, random_state=42)
    km.fit(features)
    # inertia_ is the within-cluster sum of squared distances to the centroids
    Sum_of_squared_distances.append(km.inertia_)

# Plot Results
plt.plot(K, Sum_of_squared_distances, marker='o')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Score candidate cluster counts on the original features only: the 'cluster',
# 'x' and 'y' columns come from an earlier model and must not be used as inputs
# to either the fit or the silhouette computation.
features = df.drop(columns=['cluster', 'x', 'y'], errors='ignore')

for n_cluster in range(2, 11):
    # Fixed random_state / n_init so the reported coefficients are reproducible.
    kmeans = KMeans(n_clusters=n_cluster, n_init=10, random_state=42).fit(features)
    label = kmeans.labels_
    sil_coeff = silhouette_score(features, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))

In [None]:
# Let's try 3 clusters
from sklearn import cluster, tree, decomposition

# Cluster on the original features only. By this point df already carries the
# 'cluster', 'x' and 'y' columns from the 6-cluster run; using them as inputs
# would let the previous model's output steer this one.
features = df.drop(columns=['cluster', 'x', 'y'], errors='ignore')

# random_state is fixed so the cluster assignment is reproducible; n_init is
# pinned because scikit-learn's default changed from 10 to 'auto' in 1.4.
km = cluster.KMeans(n_clusters=3, max_iter=300, n_init=10, random_state=42)
df['cluster'] = km.fit_predict(features)

# Principal Component Analysis for visualization.
# Fit once and read both coordinates from the same projection, so 'x' and 'y'
# belong to the same 2-D embedding.
pca = decomposition.PCA(n_components=2, whiten=True)
components = pca.fit_transform(features)
df['x'] = components[:, 0]
df['y'] = components[:, 1]

plt.scatter(df['x'], df['y'], c=df['cluster'])
plt.show()

In [None]:
# Inspect the distribution of each feature per cluster.
# The derived columns are skipped: a histogram of 'cluster' faceted by cluster is
# degenerate, and 'x'/'y' are PCA plotting coordinates rather than features.
feature_cols = [col for col in df.columns if col not in ('cluster', 'x', 'y')]
for c in feature_cols:
    # One row of histograms per feature, one panel per cluster.
    grid = sns.FacetGrid(df, col='cluster')
    grid.map(plt.hist, c)

In [None]:
# Inspect the mean of each column per cluster to characterise the segments.
# NOTE(review): any PCA coordinate columns ('x', 'y') still present in df are
# averaged here as well; they are plotting aids, not customer features.
df.groupby('cluster').mean() # for every column

In [None]:
# Bar chart of cluster sizes, measured as the number of non-null PURCHASES
# values in each cluster.
cluster_counts = df.groupby('cluster')["PURCHASES"].count()
cluster_counts.plot(kind='bar')