# Credit Exploratory Data Analysis

Quick notebook for exploring which features are most strongly correlated with, or indicative of, a good credit rating.

# Essential Imports

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import math
import itertools
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
# Initialise plotly's offline notebook mode once, before any figure is shown.
py.init_notebook_mode(connected=True)

In [None]:
# Load the credit dataset into the notebook-wide `df` that every later cell
# reads; the bare `df` on the last line renders it as this cell's output.
CREDIT_CSV_PATH = "../input/credit-card-balance-data/Credit.csv"
df = pd.read_csv(CREDIT_CSV_PATH)
df

# Quantitative Features

- Income, Limit, Cards, Age, Balance

In [None]:
def quant_compare(feature_x, feature_y="Rating"):
    """Scatter two numeric columns of ``df`` against each other with an OLS trendline.

    The figure title reports the Pearson correlation between the two columns,
    rounded to three decimals.

    Parameters
    ----------
    feature_x : str
        Column of the notebook-level ``df`` plotted on the x-axis.
    feature_y : str, default "Rating"
        Column of the notebook-level ``df`` plotted on the y-axis.

    Returns
    -------
    None
        The plotly figure is displayed with ``fig.show()``.
    """
    corr = round(pearsonr(df[feature_x], df[feature_y])[0], 3)

    # Only the default target gets the friendlier "Credit Rating" label; any
    # other column is labelled with its own name so the axis never misreports
    # what is actually plotted.
    y_label = "Credit Rating" if feature_y == "Rating" else feature_y

    fig = px.scatter(df, x=feature_x, y=feature_y, trendline="ols")
    fig.update_layout(
        title={
            'text': f"{feature_x} vs {feature_y}; Corr: {corr}",
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 20},
        },
        yaxis_title=y_label,
    )
    fig.show()

In [None]:
# Income vs. the default target (Rating): scatter, OLS trendline, Pearson r in the title.
quant_compare(feature_x='Income')

In [None]:
# Limit vs. the default target (Rating): scatter, OLS trendline, Pearson r in the title.
quant_compare(feature_x='Limit')

In [None]:
# Cards vs. the default target (Rating): scatter, OLS trendline, Pearson r in the title.
quant_compare(feature_x='Cards')

In [None]:
# Age vs. the default target (Rating): scatter, OLS trendline, Pearson r in the title.
quant_compare(feature_x='Age')

In [None]:
# Balance vs. the default target (Rating): scatter, OLS trendline, Pearson r in the title.
quant_compare(feature_x='Balance')

# Qualitative Features

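- Education, Gender, Student, Married, Ethnicity (Education is numeric in this dataset, so it is explored with the scatter/correlation helper; the remaining features use distribution comparisons)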

In [None]:
# Education goes through the scatter/correlation helper, i.e. it is treated as
# a numeric column here even though this section groups it with the
# qualitative features.
quant_compare(feature_x='Education')

In [None]:
def qual_compare(feature_compare, feature_x="Rating"):
    """Compare the distribution of a numeric column across levels of a categorical column.

    Produces three outputs, in order:

    1. a plotly histogram of ``feature_x`` coloured by ``feature_compare``
       (``histnorm='probability'``),
    2. a seaborn KDE plot with one curve per level of ``feature_compare``,
    3. one printed ``mean +- 2*std`` line per level.

    Parameters
    ----------
    feature_compare : str
        Categorical column of the notebook-level ``df`` whose levels are compared.
    feature_x : str, default "Rating"
        Numeric column of ``df`` whose distribution is shown for each level.

    Returns
    -------
    None
        Figures are displayed and the summary is printed.

    Notes
    -----
    The printed "Confidence Interval" is the level mean plus/minus twice
    ``np.std`` of the values (numpy's default ``ddof=0``). It describes the
    spread of individual observations within the level; it is not a
    confidence interval for the level mean.
    """
    fig = px.histogram(df, x=feature_x, color=feature_compare, histnorm='probability')
    fig.update_layout(
        title={
            'text': f"{feature_x} with Different {feature_compare} Histogram",
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 20},
        }
    )
    fig.show()

    _, ax = plt.subplots(figsize=(15, 8))
    # Normalise each level's density independently. With seaborn's default
    # (common_norm=True) every curve is scaled by its share of the rows, so a
    # small group is flattened and its shape cannot be compared with the others.
    sns.kdeplot(data=df, x=feature_x, hue=feature_compare, common_norm=False, ax=ax)
    ax.set_title(f"{feature_x} with Different {feature_compare} KDE Plot", fontsize=20)
    plt.show()

    # groupby splits the frame in a single pass and skips rows whose level is
    # missing, so no "nan ... nan +- nan" line is printed for a NaN level.
    # sort=False keeps the levels in order of first appearance.
    for level, values in df.groupby(feature_compare, sort=False)[feature_x]:
        print(f"{level} Confidence Interval: {np.mean(values)} +- {2*np.std(values)}")

In [None]:
# Rating distribution split by Gender: histogram, KDE and per-level mean +- 2*std.
qual_compare(feature_compare='Gender')

In [None]:
# Rating distribution split by Student: histogram, KDE and per-level mean +- 2*std.
qual_compare(feature_compare='Student')

In [None]:
# Rating distribution split by Married: histogram, KDE and per-level mean +- 2*std.
qual_compare(feature_compare='Married')

In [None]:
# Rating distribution split by Ethnicity: histogram, KDE and per-level mean +- 2*std.
qual_compare(feature_compare='Ethnicity')

# Linear Regression

The predictors that are most closely associated with credit rating are probably income, limit, balance, and perhaps also married and ethnicity. For the purposes of this exercise (so I don't have to use one-hot encoding), I decided to use only the quantitative ones: income, limit, and balance.

In [None]:
# Fit a linear regression of Rating on the three chosen quantitative
# predictors, then report the mean squared error measured on the same rows
# the model was fitted on (there is no train/test split here).
predictors = df[['Income', 'Limit', 'Balance']]
target = df['Rating']

model = LinearRegression()
model.fit(X=predictors, y=target)

squared_errors = np.power(model.predict(predictors) - target, 2)
print("MSE of the model is: ", np.round(np.average(squared_errors), 3))

In [None]:
# Show the fitted parameters; `coef_` follows the column order used when
# fitting (Income, Limit, Balance).
for label, value in (("Intercept: ", model.intercept_), ("Coefficients: ", model.coef_)):
    print(label, value)

We implemented a linear regression model using income, limit, and balance as the input features, with credit rating as the target. Our model can be defined as:

```Rating = 0.134 * Income + 0.062 * Limit + 0.015 * Balance + 44.425```

The Mean Squared Error of this model is ~142.674 (an RMSE of roughly 11.9 rating points), which is surprisingly high given that the model was evaluated on the same data it was trained on; there is no held-out test set. In reality, the error on unseen data would likely be even higher had the model been trained on only a subset of the data.