In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for root_dir, _, file_names in os.walk('/kaggle/input'):
    # Print the full path of every file found beneath the input directory.
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

In [None]:
# Load the credit-card dataset from the Kaggle input directory and preview the first rows.
cc = pd.read_csv(filepath_or_buffer='/kaggle/input/ccdata/CC GENERAL.csv')
cc.head(n=5)

#### Understanding the Dataset

In [None]:
# Dimensions of the loaded frame as (rows, columns).
print(cc.shape)

In [None]:
# Column labels of the dataset.
cc.columns

In [None]:
# Look at the data using info(): prints each column's dtype and non-null count.
cc.info()

    The info() function is critical for understanding the data.
    As you can see, there are 17 numerical columns, and the first column is an object column.

In [None]:
# Look at summary statistics using describe(); include='all' also summarises non-numeric columns.
cc.describe(include='all')

In [None]:
# Map every column name to the array of distinct values found in that column.
unique_vals = {name: column.unique() for name, column in cc.items()}

unique_vals

**Initial Observations**

*     CUST_ID has a unique value for each observation. Keeping it would only add complexity to the algorithm, so we will exclude this column from our analysis.
*     A few of the columns have high variance.
*     There are frequency columns whose values lie between 0 and 1. We need to find a way to convert them into categories, e.g. 0 = not frequently purchased, 1 = frequently purchased.
    

In [None]:
# CUST_ID is a dataset artifact (a per-row identifier), not something useful for analysis.
# errors="ignore" keeps this cell idempotent: re-running it after the column has
# already been removed is a no-op instead of raising KeyError.
cc = cc.drop(columns="CUST_ID", errors="ignore")

In [None]:
# Count of missing values in each column.
cc.isnull().sum()

In [None]:
# CREDIT_LIMIT and MINIMUM_PAYMENTS have some missing values;
# replace every gap with the median of its own column.
column_medians = cc.median()
cc = cc.fillna(value=column_medians)

# Confirm that no NULLs remain anywhere in the data
not cc.isna().any().any()

In [None]:
# Summary statistics of the frame as it stands at this point in the notebook.
cc.describe(include='all')

### Exploratory Data Analysis

In [None]:
# Since all the attributes are numerical, first look at the distribution of each attribute.

cc.hist(figsize=(20,15))
# cc.hist() draws a grid of subplots; plt.title() would only label whichever
# subplot is current, so use suptitle() to title the figure as a whole.
plt.suptitle('Data', fontsize=12)
plt.show()

In [None]:
# One horizontal boxplot per column, stacked vertically, to inspect outliers.
n = len(cc.columns)

plt.figure(figsize=(10,60))
for i, column in enumerate(cc.columns):
    # Size the grid from n rather than a hard-coded 17 so the cell keeps working
    # if the number of columns changes (e.g. after a label column is added).
    plt.subplot(n, 1, i + 1)
    # Pass the Series as x= explicitly: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(x=cc[column])
    plt.title(column)
plt.tight_layout()

There are outliers present in almost all the features.

In [None]:
# Pairwise correlations between all columns
corr = cc.corr()
# Boolean mask covering the upper triangle (diagonal included) so each pair is shown once
mask = np.triu(np.ones(corr.shape, dtype=bool))
plt.figure(figsize=(14, 10))
# Draw the annotated heatmap with the masked cells hidden
sns.heatmap(
    corr,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap='YlGnBu',
    center=0,
    linewidths=1,
)
plt.show()

In [None]:
# Variance of each column, sorted in ascending order.
cc.var().sort_values()

Most of the features have high variance and are on different scales. We need to standardize them so that each feature is centered around 0 with comparable variance.

**Transforming Features for better clustering**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation, then standardise every column with them.
sc = StandardScaler()
sc.fit(cc)
cc_scaled = sc.transform(cc)

In [None]:
# Check the optimal value of k using the elbow method:
# fit K-Means for a range of k and plot the resulting inertia against k.

from sklearn.cluster import KMeans

ks = range(1, 15)
inertias = []
for k in ks:
    # random_state makes the curve reproducible across runs; n_init is set
    # explicitly because its default value differs between scikit-learn versions.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    # Fit the model to the scaled samples
    model.fit(cc_scaled)
    # Record the inertia obtained for this k
    inertias.append(model.inertia_)

# Plot ks vs inertias
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

In [None]:
# Tabulate the elbow-method results: one row per candidate k with its inertia.
elbow_columns = {
    'num_clusters': ks,
    'cluster_errors': inertias,
}
clusters_df = pd.DataFrame(data=elbow_columns)
clusters_df

In [None]:
# Choose k = 4 for the number of clusters, based on the plot above:
# after k = 4 the slope of the inertia curve is almost constant.

from sklearn.cluster import KMeans

# random_state fixes the initialisation so the cluster numbering (0-3) is stable
# between runs; n_init is set explicitly because its default differs between
# scikit-learn versions.
# NOTE(review): the written cluster descriptions further down refer to specific
# label numbers - re-check them against the per-cluster means after running.
KM = KMeans(n_clusters=4, n_init=10, random_state=42)

# fit_predict fits the model and returns the label of every sample in one pass,
# so a separate KM.fit() call beforehand is not needed.
KM_labels = KM.fit_predict(cc_scaled)
KM_labels

In [None]:
# Shape of the fitted cluster-centre array.
KM.cluster_centers_.shape

In [None]:
# Inertia of the fitted 4-cluster model (the quantity plotted in the elbow chart).
print(KM.inertia_)

In [None]:
# Store each row's K-Means label as a new 'cluster_labels' column; this modifies cc in place.
cc['cluster_labels'] = KM_labels
cc.head()

In [None]:
# Scatter PAYMENTS against PURCHASES with one colour per cluster.
# A single loop replaces four copy-pasted filter/scatter pairs; the
# cluster -> colour assignment is the same as before.
cluster_colors = {0: 'black', 1: 'orange', 2: 'purple', 3: 'blue'}

plt.figure(figsize=(20,15))
for label, color in cluster_colors.items():
    members = cc[cc.cluster_labels == label]
    plt.scatter(members['PAYMENTS'], members['PURCHASES'],
                color=color, label=f'Cluster {label}')

# Axis labels and a legend so the figure can be read on its own.
plt.xlabel('PAYMENTS')
plt.ylabel('PURCHASES')
plt.legend(title='cluster_labels')

plt.show()

In [None]:
# Number of rows assigned to each cluster.
cc['cluster_labels'].value_counts()

In [None]:
# Mean of every column within each cluster, used to profile the clusters.
cc.groupby('cluster_labels').mean()

Cluster-0

Balance is very high and gets updated frequently. The majority of purchases are made by paying cash in advance, and this is done quite frequently. Credit limit is comparatively high. Minimum payments made for the purchases are high compared to the other clusters.

Cluster 1 : 

Low balance, but the balance gets updated frequently, i.e. there is a larger number of transactions. The number of purchases from the account is also quite large, and the majority of purchases are made either in one go or in installments rather than by paying cash in advance.

Cluster-2
Balance is comparatively high and the balance gets updated. The number of purchases is high. Purchases made either in one go or in installments are very high and are made very frequently. Credit limit is very high. Payments made are very high. Full payment refers to the percent of full payment paid by the user.

Cluster-3

Balance is high but the balance doesn't get updated. The number of purchases from the account is very low. Purchases made either in one go or in installments are very low. Purchases are not made frequently. Credit limit is very low. Payments and minimum payments are very low.

In [None]:
# Histogram of every feature, faceted by cluster (one panel per cluster label).
# The 'cluster_labels' column itself is skipped: plotting the label faceted by
# the label carries no information.
feature_columns = [name for name in cc.columns if name != 'cluster_labels']
for c in feature_columns:
    grid = sns.FacetGrid(cc, col='cluster_labels')
    grid.map(plt.hist, c)

In [None]:
# t-SNE provides great visualizations when the individual samples can be labeled

from sklearn.manifold import TSNE
# random_state makes the embedding reproducible; without it the layout changes on every run.
model = TSNE(learning_rate=200, random_state=42)

# Apply fit_transform to samples: tsne_features
tsne_features = model.fit_transform(cc_scaled)

# Select the 0th feature: xs
xs = tsne_features[:,0]
# Select the 1st feature: ys
ys = tsne_features[:,1]

plt.figure(figsize=(20,15))
# Scatter plot, colouring each sample by its K-Means cluster label
plt.scatter(xs, ys, c=KM_labels)
plt.show()

**Visualising Hierarchies**

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Hierarchical clustering of the scaled samples using Ward linkage: mergings
mergings = linkage(cc_scaled, method='ward')

plt.figure(figsize=(20, 15))
# Draw the dendrogram truncated by level (p=5), labelling leaves with the K-Means labels
dendrogram(
    mergings,
    p=5,
    truncate_mode='level',
    labels=KM_labels,
    leaf_rotation=90,
    leaf_font_size=10,
)

plt.show()

**Principal Component Analysis**

In [None]:
from sklearn.decomposition import PCA

# PCA with no component limit, fitted on the scaled data.
model = PCA()

# Fit and project in one call; the projected array is only displayed, not stored.
model.fit_transform(X=cc_scaled)

**Variances of the PCA features**

In [None]:
# Bar chart of the variance explained by each PCA feature
features = range(model.n_components_)
fig, ax = plt.subplots(figsize=(20, 15))
ax.bar(features, model.explained_variance_)
ax.set_xlabel('PCA feature')
ax.set_ylabel('variance')
ax.set_xticks(features)
plt.show()

 It looks like PCA features 0 and 1 have significant variance. The intrinsic dimension of this dataset appears to be 2

**Dimensionality reduction with PCA**

In [None]:
# Fit a two-component PCA on the scaled data, then project the data onto those components.
pca = PCA(n_components=2).fit(cc_scaled)
pca_features = pca.transform(cc_scaled)

print(pca_features.shape)

Notice that PCA reduced the 17 original features to an intrinsic dimension of 2. However, the number of observations remains the same.

In [None]:
# Wrap the two PCA components in a DataFrame with named columns
pca_columns = ['pca1', 'pca2']
pca_df = pd.DataFrame(pca_features, columns=pca_columns)
pca_df.head(n=5)

In [None]:
# Attach the K-Means cluster labels to the PCA frame as a 'cluster' column
df = pca_df.assign(cluster=KM_labels)
df.head(n=5)

In [None]:
# Plot the two principal components, coloured by K-Means cluster.
# The palette is ordered so clusters 0-3 get black / orange / purple / blue,
# the same colours used for them in the PAYMENTS vs PURCHASES scatter plot.
plt.figure(figsize=(18,12))
sns.scatterplot(x='pca1', y='pca2', hue='cluster', data=df, palette=['black','orange','purple','blue'])
plt.xlabel('Principal Component 1', fontsize=13)
plt.ylabel('Principal Component 2', fontsize=13)
plt.show()