![Credit Card](https://milesopedia.com/wp-content/uploads/2019/08/featured-les-meilleures-cartes-de-credit.jpg)

In [None]:
import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import os
# Gather every file path found under the input directory, then print one per line.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for file_path in input_file_paths:
    print(file_path)

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = '../input/ccdata/CC GENERAL.csv'
df = pd.read_csv(DATA_PATH)


# **STEPS:** 

 1.Understanding Dataset

 2.Data Preprocessing

 > 2.1.Delete Some Columns

>2.2.Taking Care Of Missing Data

>2.3.Feature Scaling

3.Analysis Of Relationship

4.Create Model


**1.Understanding Dataset**

In [None]:
# Preview the first rows to see the column names and value formats.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

In [None]:
# Column dtypes and non-null counts: a first look at where values are missing.
df.info()

In [None]:
# Number of distinct values in each column.
df.nunique()

# **1.2.Feature Details**


(These feature descriptions are copied from the [dataset page on Kaggle](https://www.kaggle.com/arjunbhasin2013/ccdata).)


>1.**CUST_ID** : Identification of Credit Card holder (Categorical)


>2.**BALANCE** : Balance amount left in their account to make purchases 


>3.**BALANCEFREQUENCY** : How frequently the Balance is updated, score between 0 and 1 (1 = frequently updated, 0 = not frequently updated)


>4.**PURCHASES** : Amount of purchases made from account


>5.**ONEOFFPURCHASES** : Maximum purchase amount done in one-go


>6.**INSTALLMENTSPURCHASES** : Amount of purchase done in installment


>7.**CASHADVANCE** : Cash in advance given by the user


>8.**PURCHASESFREQUENCY** : How frequently the Purchases are being made, score between 0 and 1 (1 = frequently purchased, 0 = not frequently purchased)


>9.**ONEOFFPURCHASESFREQUENCY** : How frequently Purchases are happening in one-go (1 = frequently purchased, 0 = not frequently purchased)


>10.**PURCHASESINSTALLMENTSFREQUENCY** : How frequently purchases in installments are being done (1 = frequently done, 0 = not frequently done)


>11.**CASHADVANCEFREQUENCY** : How frequently the cash in advance is being paid


>12.**CASHADVANCETRX** : Number of Transactions made with "Cash in Advance"


>13.**PURCHASESTRX** : Number of purchase transactions made


>14.**CREDITLIMIT** : Limit of Credit Card for user


>15.**PAYMENTS** : Amount of Payment done by user


>16.**MINIMUM_PAYMENTS** : Minimum amount of payments made by user


>17.**PRCFULLPAYMENT** : Percent of full payment paid by user


>18.**TENURE** : Tenure of credit card service for user

 **2.Data Preprocessing and feature selection**


> 2.1.Delete Some Columns

In [None]:
# Remove the customer identifier so it is not fed to the clustering models.
# drop(..., errors='ignore') keeps this cell re-runnable: pop() raises KeyError
# on a second execution and also echoes the whole ID column as cell output.
df = df.drop(columns='CUST_ID', errors='ignore')

> 2.2.Taking Care Of Missing Data

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Mean-impute across all columns instead of a hard-coded positional slice
# (previously x[:, -5:-2]). The slice silently depended on the column order and
# on CUST_ID already being removed, and would leave NaNs in any other column
# untouched. Columns without missing values are unaffected, because the imputer
# only replaces entries equal to `missing_values`.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# `x` is the imputed, still unscaled, feature matrix.
x = imputer.fit_transform(df.values)
df = pd.DataFrame(x , columns = df.columns)

# Sanity check: every count should now be zero.
df.isnull().sum()

>2.3.Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column, then rebuild the frame with the original column names.
sc = StandardScaler()
scaled_values = sc.fit_transform(df.to_numpy())
df = pd.DataFrame(scaled_values, columns=df.columns)
df

**3.Analysis Of Relationship**

In [None]:
# Pairwise correlation between all features, drawn as an annotated heatmap.
corelation_matrix = df.corr()
feature_labels = corelation_matrix.columns

plt.figure(figsize=(15, 15))
sns.heatmap(
    corelation_matrix,
    annot=True,
    fmt='.2f',
    cmap='Spectral',
    xticklabels=feature_labels,
    yticklabels=feature_labels,
)

In [None]:
# One histogram per feature, stacked vertically in a single tall figure.
n_features = len(df.columns)

plt.figure(figsize=(12, 55))
for position, feature in enumerate(df.columns, start=1):
    plt.subplot(n_features, 1, position)
    plt.hist(df[feature], bins=30, alpha=.4)
    plt.title(feature)

plt.tight_layout()


**4.Create Model**

I think the best model is DBSCAN, but after training it I also create K-means++ and hierarchical clustering (HC) models for comparison.

> 4.1 DBSCAN

In [None]:
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from math import ceil , log

# Feature matrix shared by the models fitted in the cells below.
X = df.to_numpy()

# Query every point for all len(X) neighbours, i.e. the distance from each
# sample to every sample (itself included).
# NOTE(review): this materialises two n-by-n arrays (`distances`, `indices`);
# `indices` is not used in this cell. Confirm the memory cost is acceptable.
nbrs = NearestNeighbors(n_neighbors=len(X))
nbrs.fit(X)
distances, indices = nbrs.kneighbors(X)

# eps: mean of all those distances, rounded up to an integer.
# min_samples: natural log of the number of columns, rounded up.
# NOTE(review): both are ad-hoc heuristics; an eps near the average pairwise
# distance presumably puts most points in one neighbourhood. Verify against a
# k-distance plot before trusting the resulting clusters.
eps_radius = ceil(distances.mean())
min_pts = ceil(log(df.shape[1]))

dbscan = DBSCAN(eps=eps_radius, min_samples=min_pts)
dbscan.fit(X)

In [None]:
from sklearn.metrics import silhouette_score , davies_bouldin_score

# DBSCAN assigns the label -1 to noise points. The scoring functions treat every
# distinct label as a cluster, so noise must be excluded or it is scored as if
# it were one cluster.
dbscan_labels = dbscan.labels_
in_cluster = dbscan_labels != -1
n_dbscan_clusters = len(set(dbscan_labels[in_cluster]))

# Both scores are undefined for fewer than two clusters (silhouette_score raises
# ValueError), so report that case instead of crashing the notebook.
if n_dbscan_clusters >= 2:
    print('silhouette_score:', silhouette_score(X[in_cluster], dbscan_labels[in_cluster]))
    print('davies_bouldin_score:', davies_bouldin_score(X[in_cluster], dbscan_labels[in_cluster]))
else:
    n_noise = int((~in_cluster).sum())
    print(f'DBSCAN found {n_dbscan_clusters} cluster(s) and {n_noise} noise point(s); '
          'silhouette and Davies-Bouldin scores need at least 2 clusters.')

> 4.2 K-means++

In [None]:
from sklearn.cluster import KMeans

# Candidate cluster counts for the elbow plot.
# NOTE(review): the upper bound is the number of feature columns (df.shape[1]);
# confirm that is the intended sweep range.
cluster_counts = range(1, df.shape[1])

# Within-cluster sum of squares (inertia) for each candidate count.
wcss = []
for n_clusters in cluster_counts:
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42).fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('The Elbow Method')
plt.show()

In [None]:
# Final K-means model with 4 clusters; y_kmeans holds the cluster index of each row of X.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',
    random_state=42,
)
y_kmeans = kmeans.fit_predict(X)

In [None]:
# Score on X, the standardized matrix the model was fitted on. Lower-case `x` is
# the imputed but unscaled array from the preprocessing step, so scoring on it
# measures the clusters in a different feature space than the one K-means used.
print('silhouette_score:', silhouette_score(X,kmeans.labels_))
print('davies_bouldin_score:', davies_bouldin_score(X,kmeans.labels_))

In [None]:
import scipy.cluster.hierarchy as sch

# Build the Ward linkage tree over X first, then draw it on a large canvas.
ward_linkage = sch.linkage(X, method='ward')

plt.figure(figsize=(35, 35))
dendrogram = sch.dendrogram(ward_linkage)
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Ward linkage only works with Euclidean distances, which is already the default,
# so the explicit `affinity='euclidean'` argument is omitted. That keyword was
# deprecated in scikit-learn 1.2 (renamed to `metric`) and removed in 1.4, where
# passing it raises TypeError. Relying on the default works on old and new versions.
hc = AgglomerativeClustering(n_clusters = 4, linkage = 'ward')
y_hc = hc.fit_predict(X)

In [None]:
# Report both cluster-quality metrics for the hierarchical model's labels.
hc_metrics = (
    ('silhouette_score', silhouette_score),
    ('davies_bouldin_score', davies_bouldin_score),
)
for metric_name, metric_fn in hc_metrics:
    print(f'{metric_name}:', metric_fn(X, hc.labels_))