# Credit card customers EDA

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
sns.set_theme(style='whitegrid')
from scipy.stats import pearsonr
import math

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the BankChurners CSV from the Kaggle input directory into the working frame.
DATA_PATH = '/kaggle/input/credit-card-customers/BankChurners.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Index the frame by client number and keep only the columns explored below.
# The re-index is non-inplace and guarded on the column still being present,
# so re-running this cell on an already-processed frame does not raise KeyError.
FEATURE_COLUMNS = [
    'Customer_Age', 'Gender', 'Dependent_count',
    'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
    'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
    'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
]

if 'CLIENTNUM' in df.columns:
    df = df.set_index('CLIENTNUM')
df = df[FEATURE_COLUMNS]

In [None]:
# Display (rows, columns) of the trimmed frame.
n_rows, n_cols = df.shape
n_rows, n_cols

### Customer Age

In [None]:
# Histogram of customer ages.
sns.histplot(x='Customer_Age', data=df)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for customer age.
customer_age = df['Customer_Age']
customer_age.describe()

In [None]:
# Bucket customers into age decades ('20s' ... '70s') as an ordered categorical.
# Floor division is vectorised (no per-row Python call), and the label column is
# built in one step so it never temporarily holds raw decade integers.
AGE_DECADE_LABELS = ['20s', '30s', '40s', '50s', '60s', '70s']
age_decade = (df['Customer_Age'] // 10).map({
    2: '20s', 3: '30s', 4: '40s', 5: '50s', 6: '60s', 7: '70s'
})
# Ages outside 20-79 have no label in the mapping and end up as NaN.
df['Customer_Age_Years'] = pd.Categorical(age_decade, categories=AGE_DECADE_LABELS, ordered=True)
print(df['Customer_Age_Years'].value_counts())

sns.histplot(data=df, x='Customer_Age_Years', hue='Customer_Age_Years', discrete=True)

### Gender

In [None]:
# Convert gender to a categorical, print each level's share of rows (%), then plot the counts.
df['Gender'] = df['Gender'].astype('category')
gender_pct = df['Gender'].value_counts().mul(100.00).div(len(df))
print(gender_pct)
sns.histplot(x='Gender', hue='Gender', data=df)

### Dependent count

In [None]:
# Histogram of the number of dependents per customer.
dependent_counts = df['Dependent_count']
sns.histplot(dependent_counts)

In [None]:
# Dependents against age; hue uses a categorical copy so each count gets its own discrete colour.
dependents_as_category = pd.Categorical(df['Dependent_count'])
sns.scatterplot(data=df, x='Customer_Age', y='Dependent_count', hue=dependents_as_category)

In [None]:
# Dependent counts split by gender, bars drawn side by side.
sns.histplot(
    data=df,
    x='Dependent_count',
    hue='Gender',
    multiple='dodge',
)

### Education level

In [None]:
# Ordered education scale; 'Unknown' is kept as the last (highest-ranked) category.
education_levels = ['Uneducated', 'High School', 'College', 'Graduate', 'Post-Graduate', 'Doctorate', 'Unknown']
education_dtype = pd.CategoricalDtype(categories=education_levels, ordered=True)
df['Education_Level'] = df['Education_Level'].astype(education_dtype)

In [None]:
# Share of rows (%) per education level, followed by a horizontal histogram of the same split.
education_pct = df['Education_Level'].value_counts().mul(100.00).div(len(df))
print(education_pct)
sns.histplot(y='Education_Level', hue='Education_Level', data=df)

### Marital status

In [None]:
# Convert marital status to a categorical and plot it split by gender (bars side by side).
df['Marital_Status'] = df['Marital_Status'].astype('category')
sns.histplot(x='Marital_Status', hue='Gender', multiple='dodge', data=df)

In [None]:
# Share of rows (%) per marital status.
df['Marital_Status'].value_counts().mul(100.00).div(len(df))

### Income

In [None]:
# List the distinct income labels present in the data.
income_labels = df['Income_Category']
income_labels.unique()

In [None]:
# Ordered income bands from lowest to highest; 'Unknown' is kept as the last category.
income_bands = ['Less than $40K', '$40K - $60K', '$60K - $80K', '$80K - $120K', '$120K +', 'Unknown']
income_dtype = pd.CategoricalDtype(categories=income_bands, ordered=True)
df['Income_Category'] = df['Income_Category'].astype(income_dtype)

In [None]:
# Horizontal histogram of income bands with genders stacked within each bar.
sns.histplot(
    data=df,
    y='Income_Category',
    hue='Gender',
    multiple='stack',
)

In [None]:
# Share of rows (%) per income band.
df['Income_Category'].value_counts().mul(100.00).div(len(df))

In [None]:
# Strip plot of credit limit per income band, one panel per card category.
sns.catplot(
    data=df,
    x='Credit_Limit',
    y='Income_Category',
    col='Card_Category',
    kind='strip',
)

### Card category

In [None]:
# Order the card tiers Blue < Silver < Gold < Platinum, print each tier's share (%), and plot the counts.
card_tiers = ['Blue', 'Silver', 'Gold', 'Platinum']
df['Card_Category'] = df['Card_Category'].astype(pd.CategoricalDtype(categories=card_tiers, ordered=True))
card_pct = df['Card_Category'].value_counts().mul(100.00).div(len(df))
print(card_pct)
sns.histplot(x='Card_Category', hue='Card_Category', data=df)

In [None]:
# Grid of card-category histograms: one panel per income band, three panels per row.
g = sns.FacetGrid(
    data=df,
    col='Income_Category',
    hue='Card_Category',
    col_wrap=3,
    height=3,
)
g.map(sns.histplot, 'Card_Category')

### Credit Limit

In [None]:
# Distribution of credit limits (with KDE) and a vertical marker at the mean.
credit_limit_mean = df['Credit_Limit'].mean()
plot = sns.histplot(df['Credit_Limit'], kde=True)
# axvline's ymin/ymax are fractions of the axes height (0-1), not data values;
# the defaults (0 and 1) already draw the line across the full plot.
plot.axvline(credit_limit_mean)
# Label placed slightly to the right of the mean line, at y=1000 in data coordinates.
plot.annotate('Average value: ' + str(round(credit_limit_mean)), xy=(credit_limit_mean + 500, 1000))

In [None]:
# Box plot of credit limit for each card category.
sns.boxplot(x='Credit_Limit', y='Card_Category', data=df)

In [None]:
# Violin plot of credit limit for each card category.
sns.violinplot(x='Credit_Limit', y='Card_Category', data=df)

### Transaction amount

In [None]:
# Histogram of total transaction amount.
sns.histplot(x='Total_Trans_Amt', data=df)

In [None]:
from sklearn.cluster import KMeans

# Split customers into four segments by clustering the total transaction amount.
# random_state pins the centroid initialisation so the segment labels (and every
# plot or table keyed on them) are the same on each run. n_init is set explicitly
# because its default differs between scikit-learn releases.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
model.fit(df[['Total_Trans_Amt']])
# Store the cluster label as a categorical so plots treat it as discrete groups.
df['Total_Trans_Amt_Segment'] = pd.Categorical(model.predict(df[['Total_Trans_Amt']]))

In [None]:
# Transaction-amount histogram coloured by the k-means segment label.
sns.histplot(
    data=df,
    x='Total_Trans_Amt',
    hue='Total_Trans_Amt_Segment',
)

In [None]:
# Min / mean / max transaction amount inside each segment.
# Aggregations are named by string rather than passed as numpy callables, which
# newer pandas versions deprecate. `observed` is spelled out because the grouping
# key is categorical and the implicit default is changing between pandas versions.
(
    df[['Total_Trans_Amt_Segment', 'Total_Trans_Amt']]
    .groupby('Total_Trans_Amt_Segment', observed=False)
    .agg({'Total_Trans_Amt': ['min', 'mean', 'max']})
)

### Transaction count

In [None]:
# Distribution of transaction counts (with KDE) and a vertical marker at the mean.
trans_ct_mean = df['Total_Trans_Ct'].mean()
plot = sns.histplot(data=df, x='Total_Trans_Ct', kde=True)
# axvline's ymin/ymax are fractions of the axes height (0-1), not data values;
# the defaults (0 and 1) already draw the line across the full plot.
plot.axvline(trans_ct_mean)
# Label placed to the left of the mean line, at y=700 in data coordinates.
plot.annotate('Average value: ' + str(round(trans_ct_mean)), xy=(trans_ct_mean - 50, 700))

In [None]:
# Box plot of transaction count for each card category.
sns.boxplot(
    data=df,
    y='Card_Category',
    x='Total_Trans_Ct',
)

### Utilization ratio

In [None]:
# Distribution of the average utilization ratio (with KDE) and a vertical marker at the mean.
utilization_mean = df['Avg_Utilization_Ratio'].mean()
plot = sns.histplot(data=df, x='Avg_Utilization_Ratio', kde=True)
# axvline's ymin/ymax are fractions of the axes height (0-1), not data values;
# the defaults (0 and 1) already draw the line across the full plot.
plot.axvline(utilization_mean)
# Label placed just right of the mean line, at y=2000 in data coordinates.
plot.annotate('Average value: ' + str(round(utilization_mean, 3)), xy=(utilization_mean + 0.02, 2000))

In [None]:
# Summary statistics for the average utilization ratio.
utilization = df['Avg_Utilization_Ratio']
utilization.describe()

In [None]:
# Box plot of utilization ratio for each card category.
sns.boxplot(
    data=df,
    y='Card_Category',
    x='Avg_Utilization_Ratio',
)

### Months on book

In [None]:
# Histogram of the Months_on_book column.
sns.histplot(x='Months_on_book', data=df)

In [None]:
# Whole years derived from Months_on_book, stored as an ordered categorical.
# Floor division is vectorised (no per-row Python call).
# Only 1-4 are declared as categories, so any other year value becomes NaN.
whole_years = df['Months_on_book'] // 12
df['Years_on_book'] = pd.Categorical(whole_years, categories=[1, 2, 3, 4], ordered=True)

In [None]:
# Count of customers per whole year on book (one bar per category).
sns.histplot(x='Years_on_book', discrete=True, data=df)

In [None]:
# Customer age per year on book, with separate boxes for each gender.
sns.boxplot(
    data=df,
    x='Years_on_book',
    y='Customer_Age',
    hue='Gender',
)

In [None]:
# Transaction count per year on book.
sns.boxplot(y='Total_Trans_Ct', x='Years_on_book', data=df)

In [None]:
# Mean transaction count per year on book.
# `observed` is spelled out because the grouping key is categorical and the
# implicit default is changing between pandas versions; False keeps every
# declared category in the result.
(
    df[['Years_on_book', 'Total_Trans_Ct']]
    .groupby('Years_on_book', observed=False)
    .mean()
)

### Total relationship count

In [None]:
# Histogram of the Total_Relationship_Count column.
sns.histplot(x='Total_Relationship_Count', data=df)

### Months inactive

In [None]:
# Histogram of the Months_Inactive_12_mon column, one bar per distinct value.
months_inactive = df['Months_Inactive_12_mon']
sns.histplot(months_inactive, discrete=True)

In [None]:
# Share of rows (%) for each Months_Inactive_12_mon value.
df['Months_Inactive_12_mon'].value_counts().mul(100.00).div(len(df))

In [None]:
# Revolving balance for each Months_Inactive_12_mon value.
# NOTE(review): this overwrites the column with a categorical in place, so every
# later cell (and any re-run of earlier cells) sees the new dtype - confirm that is intended.
df['Months_Inactive_12_mon'] = df['Months_Inactive_12_mon'].astype('category')
sns.boxplot(y='Total_Revolving_Bal', x='Months_Inactive_12_mon', data=df)

### Contact frequency

In [None]:
# Histogram of the Contacts_Count_12_mon column, one bar per distinct value.
sns.histplot(x='Contacts_Count_12_mon', discrete=True, data=df)

### Ad hoc

In [None]:
# Transaction amount per age decade.
# Customer_Age_Years is already an ordered categorical column, so it is referenced
# by name; wrapping it in pd.Categorical again passed an unnamed array and
# dropped the y-axis label.
sns.boxplot(data=df, x='Total_Trans_Amt', y='Customer_Age_Years')

In [None]:
# Customer age for each income band.
sns.boxplot(y='Income_Category', x='Customer_Age', data=df)

In [None]:
# Customer age for each card category.
sns.boxplot(y='Card_Category', x='Customer_Age', data=df)

In [None]:
# Pairwise relationship grid over the frame's columns.
sns.pairplot(df)