In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns # data visualization library  
import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from yellowbrick.cluster import KElbowVisualizer
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LassoCV

import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the credit-card dataset from the Kaggle input directory
# (the path is relative to the notebook's working directory).
data = pd.read_csv('../input/ccdata/CC GENERAL.csv')
# Preview the first rows.
data.head()

## DATA PREPROCESSING - FEATURE ENGINEERING

In [None]:
# Summary statistics, transposed so that each column of `data` becomes a row.
data.describe().transpose()

In [None]:
# Number of missing values per column, largest first.
missing_per_column = data.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Preview the rows that have a missing value in at least one column.
has_missing = data.isna().any(axis=1)
data.loc[has_missing].head()

In [None]:
# Fill missing values.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on a column selection: the chained in-place form
# works on an intermediate object and does not update `data` under pandas
# copy-on-write.
# MINIMUM_PAYMENTS: fall back to the PAYMENTS value of the same row.
data['MINIMUM_PAYMENTS'] = data['MINIMUM_PAYMENTS'].fillna(data['PAYMENTS'])
# CREDIT_LIMIT: fall back to the column median.
data['CREDIT_LIMIT'] = data['CREDIT_LIMIT'].fillna(data['CREDIT_LIMIT'].median())

In [None]:
# Feature engineering: derive ten extra columns. Each one is computed from the
# original columns only, so they are all built first and then attached to `data`
# in the order listed here.
# Ratios with a zero denominator evaluate to NaN (0/0) or +/-inf (x/0).
# NOTE(review): two names do not match their formulas -
#   "new_CASH_ADVANCE_PURCHASES_PURCHASES" is CASH_ADVANCE * CASH_ADVANCE_FREQUENCY,
#   "new_PAYMENTS_CREDIT_LIMIT" is PAYMENTS / MINIMUM_PAYMENTS.
#   Confirm whether the name or the formula is the intended one.
engineered_features = {
    "new_BALANCE_BALANCE_FREQUENCY": data["BALANCE"] * data["BALANCE_FREQUENCY"],
    "new_ONEOFF_PURCHASES_PURCHASES": data["ONEOFF_PURCHASES"] / data["PURCHASES"],
    "new_INSTALLMENTS_PURCHASES_PURCHASES": data["INSTALLMENTS_PURCHASES"] / data["PURCHASES"],
    "new_CASH_ADVANCE_PURCHASES_PURCHASES": data["CASH_ADVANCE"] * data["CASH_ADVANCE_FREQUENCY"],
    "new_PURCHASES_PURCHASES_FREQUENCY": data["PURCHASES"] * data["PURCHASES_FREQUENCY"],
    "new_PURCHASES_ONEOFF_PURCHASES_FREQUENCY": data["PURCHASES"] * data["ONEOFF_PURCHASES_FREQUENCY"],
    "new_PURCHASES_PURCHASES_TRX": data["PURCHASES"] / data["PURCHASES_TRX"],
    "new_CASH_ADVANCE_CASH_ADVANCE_TRX": data["CASH_ADVANCE"] / data["CASH_ADVANCE_TRX"],
    "new_BALANCE_CREDIT_LIMIT": data["BALANCE"] / data["CREDIT_LIMIT"],
    "new_PAYMENTS_CREDIT_LIMIT": data["PAYMENTS"] / data["MINIMUM_PAYMENTS"],
}
for feature_name, feature_values in engineered_features.items():
    data[feature_name] = feature_values

In [None]:
# Re-check missing values after feature engineering (ratio columns may contain
# NaN, e.g. from 0/0); show the columns with the most missing entries.
data.isna().sum().sort_values(ascending=False).head()

In [None]:
# Replace every remaining NaN in the frame with 0 (modifies `data` in place).
# Note: this fills NaN only; infinite values are not affected by fillna.
data.fillna(0, inplace=True)

In [None]:
# Move CUST_ID from the columns into the row index, so it is no longer part of
# `data.columns` in the steps below.
# Note: done in place; re-running this cell fails because the CUST_ID column
# no longer exists after the first run.
data.set_index('CUST_ID', inplace=True)
data.head()

In [None]:
def outlier_thresholds(dataframe, variable, lower_quantile=0.01, upper_quantile=0.99, whisker=1.5):
    """Compute lower and upper capping limits for one column.

    The limits follow the box-plot rule, applied to the chosen pair of
    quantiles rather than the true quartiles:

        low = q_low  - whisker * (q_high - q_low)
        up  = q_high + whisker * (q_high - q_low)

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    variable : str
        Name of the column to analyse.
    lower_quantile, upper_quantile : float, default 0.01 and 0.99
        Quantiles (in [0, 1]) that define the reference range.
    whisker : float, default 1.5
        Multiplier applied to the inter-quantile range.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.

    Raises
    ------
    ValueError
        If the quantiles are outside [0, 1] or given in the wrong order.
    """
    if not 0 <= lower_quantile <= upper_quantile <= 1:
        raise ValueError("quantiles must satisfy 0 <= lower_quantile <= upper_quantile <= 1")

    lower_value = dataframe[variable].quantile(lower_quantile)
    upper_value = dataframe[variable].quantile(upper_quantile)
    interquantile_range = upper_value - lower_value
    low_limit = lower_value - whisker * interquantile_range
    up_limit = upper_value + whisker * interquantile_range
    return low_limit, up_limit


def replace_with_thresholds(dataframe, variable):
    """Cap one column of ``dataframe`` at its outlier limits.

    Values below the lower limit are raised to it and values above the upper
    limit are lowered to it; the limits come from ``outlier_thresholds``.
    The column is replaced on ``dataframe`` itself and nothing is returned.

    ``Series.clip`` is used instead of masked ``.loc`` assignment so that
    integer columns are converted to float as a whole when a fractional limit
    has to be stored, rather than relying on implicit upcasting during item
    assignment.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify.
    variable : str
        Name of the numeric column to cap.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)
    
# Cap outliers in every column of `data`, one column at a time.
for column_name in data.columns:
    replace_with_thresholds(data, column_name)

In [None]:
# Box plot of every column after outlier capping; the x tick labels are rotated
# by 90 degrees.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=data, ax=ax)
ax.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# Min-max scale every column of `data` to the [0, 1] range.
sc = MinMaxScaler(feature_range=(0, 1))
df = sc.fit_transform(data)  # scaled values as returned by the scaler

# Wrap the scaled values in a DataFrame that keeps the original labels.
names = data.columns
indexes = data.index
data_scaled = pd.DataFrame(df, index=indexes, columns=names)
data_scaled.head()

## KMEANS CLUSTERING

In [None]:
# KMeans Clustering - elbow analysis.
# Fit K-Means for k = 1..29 and record the inertia (within-cluster sum of
# squared distances) of each fit. A fixed random_state makes the curve
# reproducible from run to run.
K = range(1, 30)
ssd = []

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(data_scaled)
    ssd.append(kmeans.inertia_)

# k goes on the x axis, the inertia on the y axis.
plt.plot(K, ssd, "bx-")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Distance Residual Sums for K Values (WCSS)")
plt.title("Elbow Method for Optimum Number of Clusters")
plt.show()

# Same analysis with yellowbrick's elbow visualizer for k in (2, 20).
# `df` is the scaled array produced by the MinMaxScaler cell.
kmeans = KMeans(random_state=42)
visu = KElbowVisualizer(kmeans, k=(2, 20))
visu.fit(df)
visu.show()

In [None]:
# Final K-Means model with 7 clusters, fitted on the scaled features.
# random_state is fixed so the cluster assignment is reproducible.
kmeans = KMeans(n_clusters=7, random_state=42).fit(data_scaled)
clusters = kmeans.labels_

# Attach each row's cluster label to the unscaled frame. `data_scaled` was
# built with the index of `data`, so the labels are in the same row order.
data["cluster_no"] = clusters
data.head()

In [None]:
# Shift the labels from 0-based to 1-based. They are derived from `clusters`
# rather than incrementing the column, so re-running this cell cannot shift
# them a second time.
data["cluster_no"] = clusters + 1
# Number of customers in each cluster.
data.groupby("cluster_no").agg({"cluster_no": "count"})

In [None]:
# Mean of every column per cluster. The built-in groupby mean is used instead
# of passing the NumPy function to `agg`, which pandas has deprecated.
data.groupby("cluster_no").mean()

In [None]:
# Plot the histogram of various clusters
for i in data.columns:
  plt.figure(figsize = (35, 5))
  for j in range(1,8):
    plt.subplot(1,8,j+1)
    cluster = data[data['cluster_no'] == j]
    cluster[i].hist(bins = 20)
    plt.title('{}    \nCluster {} '.format(i,j))
  
  plt.show()

## HIERARCHICAL CLUSTERING

In [None]:
# Hierarchical clustering of the scaled features using average linkage.
hc_average = linkage(data_scaled, method="average")

plt.figure(figsize=(20, 10))
plt.title("Hierarchical Clustering")
plt.xlabel("Observations")
plt.ylabel("Distance")
# Truncate the tree by level (p=10) so the plot stays readable.
dendrogram(
    hc_average,
    truncate_mode='level',
    p=10,
    show_contracted=True,
    leaf_font_size=10,
)
plt.show()

In [None]:
# Hierarchical clustering of the scaled features using complete linkage.
hc_complete = linkage(data_scaled, method="complete")

plt.figure(figsize=(15, 10))
plt.title("Hierarchical Clustering")
plt.xlabel("Observations")
plt.ylabel("Distance")
# Truncate with the "lastp" mode (p=10) so the plot stays readable.
dendrogram(
    hc_complete,
    p=10,
    truncate_mode="lastp",
    leaf_font_size=10,
    show_contracted=True,
)
plt.show()

## PRINCIPAL COMPONENT ANALYSIS

In [None]:
# Fit a PCA with all components and plot the cumulative share of variance
# explained as components are added.
pca = PCA()
pca.fit(data_scaled)

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_variance)
plt.xlabel("# of components")
plt.ylabel("Cumulative Variance Ratio")
plt.show()

In [None]:
# Refit the PCA keeping 6 components; `pca_fit` holds the projected data.
pca = PCA(n_components=6)
pca_fit = pca.fit_transform(data_scaled)
# Share of variance explained by each of the kept components.
pca.explained_variance_ratio_

In [None]:
# Running total of the variance explained by the kept components.
pca.explained_variance_ratio_.cumsum()

## FEATURE SELECTION

In [None]:
# Correlation matrix of the scaled features (DataFrame.corr defaults to
# Pearson), drawn as an annotated heat map.
cor = data_scaled.corr()

plt.figure(figsize=(25, 10))
sns.heatmap(cor, cmap=plt.cm.Reds, annot=True)
plt.show()

In [None]:
# Absolute correlation of every feature with BALANCE.
cor_target = cor["BALANCE"].abs()
# Keep only the features whose absolute correlation exceeds 0.5.
strongly_correlated = cor_target > 0.5
relevant_features = cor_target[strongly_correlated]
relevant_features

In [None]:
# Model: Random Forest regression with BALANCE as the target.
# The target itself, the two engineered columns computed from BALANCE and
# BALANCE_FREQUENCY are removed from the feature matrix.
# `columns=` is used because `axis` can no longer be passed positionally to
# DataFrame.drop in pandas >= 2.0.
X = data_scaled.drop(
    columns=["BALANCE", "new_BALANCE_BALANCE_FREQUENCY", "new_BALANCE_CREDIT_LIMIT", "BALANCE_FREQUENCY"]
)   #Feature Matrix
y = data_scaled["BALANCE"]          #Target Variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=46)

rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
# NOTE(review): the RMSE below is computed on the training split; X_test / y_test
# are not used here. Confirm whether a held-out score was intended.
y_pred = rf_model.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred))

In [None]:
# Model tuning: exhaustive grid search with 5-fold cross-validation over the
# random-forest hyper-parameters below (3 * 3 * 2 * 3 = 54 combinations).
rf_params = {
    "max_depth": [5, 8, None],
    "max_features": [3, 5, 15],
    "n_estimators": [200, 500],
    "min_samples_split": [2, 5, 8],
}

rf_model = RandomForestRegressor(random_state=42)
rf_cv_model = GridSearchCV(estimator=rf_model, param_grid=rf_params, cv=5, n_jobs=-1, verbose=1)
rf_cv_model.fit(X_train, y_train)
rf_cv_model.best_params_

In [None]:
# Refit a forest with the best hyper-parameters found by the grid search.
# random_state=42 matches the estimator that was tuned, so the refit is
# reproducible and corresponds to the configuration that was cross-validated.
rf_tuned = RandomForestRegressor(random_state=42, **rf_cv_model.best_params_).fit(X_train, y_train)

# NOTE(review): as in the baseline cell, this RMSE is measured on the training split.
y_pred = rf_tuned.predict(X_train)
np.sqrt(mean_squared_error(y_train, y_pred))

## Random Forest Feature Importances

In [None]:
def plot_importance(model, features, num=None, save=False):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    features : pandas.DataFrame
        Frame whose ``columns`` name the features, in the order used for fitting.
    num : int or None, default None
        Number of top features to show; ``None`` shows all of them.
    save : bool, default False
        When True, the chart is also written to ``importances.png``.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False).iloc[:num]

    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('Features')
    plt.tight_layout()
    # Save before showing: once plt.show() has displayed the figure, a later
    # savefig would write an empty canvas.
    if save:
        plt.savefig('importances.png')
    plt.show()


# Show the 20 most important features of the tuned random forest.
plot_importance(rf_tuned, X, num=20)

## Lasso CV Feature Importances

In [None]:
# Same feature matrix and target as for the random forest.
# `columns=` is used because `axis` can no longer be passed positionally to
# DataFrame.drop in pandas >= 2.0.
X = data_scaled.drop(
    columns=["BALANCE", "new_BALANCE_BALANCE_FREQUENCY", "new_BALANCE_CREDIT_LIMIT", "BALANCE_FREQUENCY"]
)   #Feature Matrix
y = data_scaled["BALANCE"]          #Target Variable

# Lasso regression; LassoCV chooses the regularisation strength by cross-validation.
reg = LassoCV()
reg.fit(X, y)

In [None]:
# Coefficients labelled with their feature names.
coef = pd.Series(reg.coef_, index=X.columns)

# Selected alpha, and the model's score on the data it was fitted on.
best_alpha = reg.alpha_
fit_score = reg.score(X, y)
print("Best alpha using built-in LassoCV: %f" % best_alpha)
print("Best score using built-in LassoCV: %f" % fit_score)

In [None]:
# Count the features Lasso kept (non-zero coefficient) and dropped (zero coefficient).
kept_count = (coef != 0).sum()
dropped_count = (coef == 0).sum()
print("Lasso picked " + str(kept_count) + " variables and eliminated the other " +
      str(dropped_count) + " variables")

In [None]:
# Sort the coefficients in ascending order for plotting.
imp_coef = coef.sort_values()
# `plt.rcParams` is the same global settings object as `matplotlib.rcParams`,
# so no extra import is needed in the middle of the notebook.
# This also changes the default size of any figure created afterwards.
plt.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind="barh")
plt.title("Feature importance using Lasso Model")
plt.show()