In [None]:
import pandas as pd

In [None]:
# Load the application table from 'application_train.csv'; the path is relative,
# so the file is looked up in the kernel's current working directory.
application_train_df = pd.read_csv('application_train.csv')

In [None]:
# Derive age in years: flip the sign of DAYS_BIRTH, then divide by 365.
# NOTE(review): the negation suggests DAYS_BIRTH is stored as a negative day
# count — confirm against the data dictionary.
application_train_df['AGE_YEARS'] = application_train_df['DAYS_BIRTH'].mul(-1).div(365)


In [None]:
# Bin edges every five years from 20 through 70.
bins = list(range(20, 71, 5))
# One 'low-high' label per pair of adjacent edges ('20-25', ..., '65-70').
labels = [f'{low}-{high}' for low, high in zip(bins[:-1], bins[1:])]
# right=False makes every interval closed on the left and open on the right,
# i.e. [low, high); ages outside the edges get a missing bracket.
application_train_df['AGE_BRACKET'] = pd.cut(
    application_train_df['AGE_YEARS'],
    bins=bins,
    labels=labels,
    right=False,
)



In [None]:
# Mean of TARGET for every (age bracket, gender) pair, with the grouping keys
# turned back into ordinary columns.
grouped_avg = (
    application_train_df
    .groupby(['AGE_BRACKET', 'CODE_GENDER'])['TARGET']
    .mean()
    .reset_index()
)


In [None]:
import altair as alt
# Grouped bar chart: within each age bracket there is one bar per gender, and
# the bar height is the mean of TARGET for that group.
# grouped_avg already carries its grouping keys as columns, so it is passed
# as-is; resetting the index a second time would only add a meaningless
# 'index' column to the chart data.
alt.Chart(grouped_avg).mark_bar().encode(
    x=alt.X('AGE_BRACKET', axis=alt.Axis(title='Age Bracket', labelAngle=45)),  # tilt the bracket labels
    y=alt.Y('TARGET', axis=alt.Axis(title='Proportion', titleAngle=0)),  # horizontal, renamed y-axis title
    color=alt.Color('CODE_GENDER',  scale=alt.Scale(domain=['M', 'F'], range=['blue', 'pink']), legend=alt.Legend(title='Gender')),  # fixed colours, renamed legend
    xOffset='CODE_GENDER'  # place the gender bars side by side inside a bracket
).properties(
    title='Proportion of Loan Defaults by Age Bracket and Gender'
).interactive()

In [None]:
# Distribution of AMT_INCOME_TOTAL for each TARGET class, one box per class.
# TARGET is declared nominal (':N') so the classes are drawn as discrete
# categories rather than as positions on a continuous numeric axis.
# NOTE(review): assumes TARGET is a class label stored as a number — confirm.
alt.Chart(application_train_df).mark_boxplot().encode(
    x=alt.X('TARGET:N', axis=alt.Axis(title='Default', labelAngle=45)),
    y=alt.Y('AMT_INCOME_TOTAL')
)

In [None]:
# Load the bureau table from 'bureau.csv' (relative path, resolved against the
# kernel's current working directory).
bureau_df = pd.read_csv("bureau.csv")

In [None]:
# Join every bureau row onto its application row (default inner join on the
# shared SK_ID_CURR key), then total four bureau columns per SK_ID_CURR.
application_bureau_df = application_train_df.merge(bureau_df, on="SK_ID_CURR")
per_applicant = application_bureau_df.groupby("SK_ID_CURR")

total_overdue = per_applicant["AMT_CREDIT_SUM_OVERDUE"].sum()
total_debt = per_applicant["AMT_CREDIT_SUM_DEBT"].sum()
times_prolonged = per_applicant["CNT_CREDIT_PROLONG"].sum()
days_overdue = per_applicant["CREDIT_DAY_OVERDUE"].sum()

In [None]:
# Attach the four per-applicant bureau totals to the application table.
# Left joins keep every application row, including those with no bureau total.
application_train_merged_df = (
    application_train_df
    .merge(total_overdue, on='SK_ID_CURR', how='left')
    .merge(total_debt, on='SK_ID_CURR', how='left')
    .merge(times_prolonged, on='SK_ID_CURR', how='left')
    .merge(days_overdue, on='SK_ID_CURR', how='left')
)

In [None]:
# Replace missing bureau aggregates with zero in all four engineered columns.
bureau_feature_columns = [
    'AMT_CREDIT_SUM_OVERDUE',
    'AMT_CREDIT_SUM_DEBT',
    'CNT_CREDIT_PROLONG',
    'CREDIT_DAY_OVERDUE',
]
application_train_merged_df[bureau_feature_columns] = (
    application_train_merged_df[bureau_feature_columns].fillna(0)
)

In [None]:
import altair as alt

# Scatter plot of total overdue credit against total credit debt, one circle
# per application row, coloured by TARGET on a blue-to-red scale.
overdue_channel = alt.X('AMT_CREDIT_SUM_OVERDUE', title='Total Overdue Credit')
debt_channel = alt.Y('AMT_CREDIT_SUM_DEBT', title='Total Credit Debt')
target_channel = alt.Color('TARGET', title='Target', scale=alt.Scale(range=['blue', 'red']))

(
    alt.Chart(application_train_merged_df)
    .mark_circle()
    .encode(x=overdue_channel, y=debt_channel, color=target_channel)
    .properties(title='Total Overdue Credit vs. Total Credit Debt')
    .interactive()
)

In [None]:
%pip install "vl-convert-python>=1.6.0"

In [None]:
import altair as alt

# Switch Altair to the "vegafusion" data transformer for this and later charts.
alt.data_transformers.enable("vegafusion")

# Scatter plot of total overdue credit against the number of credit
# prolongations, coloured by TARGET on a blue-to-red scale.
overdue_channel = alt.X('AMT_CREDIT_SUM_OVERDUE', title='Total Overdue Credit')
prolonged_channel = alt.Y('CNT_CREDIT_PROLONG', title='Times Credit Prolonged')
target_channel = alt.Color('TARGET', title='Target', scale=alt.Scale(range=['blue', 'red']))

(
    alt.Chart(application_train_merged_df)
    .mark_circle()
    .encode(x=overdue_channel, y=prolonged_channel, color=target_channel)
    .properties(title='Total Overdue Credit vs. Times Credit Prolonged by Target')
    .interactive()
)

In [None]:
# Scatter plot of total overdue credit against total days overdue, coloured by
# TARGET on a blue-to-red scale.
# NOTE(review): the y channel previously had an empty field name (''), which
# cannot resolve to a column. CREDIT_DAY_OVERDUE is used here because it is the
# one engineered bureau aggregate not plotted by the other scatter charts, and
# the titles were changed to match — confirm this was the intended column.
alt.Chart(application_train_merged_df).mark_circle().encode(
    x=alt.X('AMT_CREDIT_SUM_OVERDUE', title='Total Overdue Credit'),
    y=alt.Y('CREDIT_DAY_OVERDUE', title='Total Days Overdue'),
    color=alt.Color('TARGET', title='Target', scale=alt.Scale(range=['blue', 'red']))
).properties(
    title='Total Overdue Credit vs. Total Days Overdue by Target'
).interactive()

In [None]:
# Installs the vegafusion package into the kernel's environment.
# NOTE(review): an earlier cell already calls
# alt.data_transformers.enable("vegafusion"); on a fresh environment the
# package has to be installed before that cell runs.
%pip install vegafusion

In [None]:
# Z-score standardise three bureau aggregates in place: (value - mean) / std.
# Each column is scaled with its OWN statistics and written back to ITSELF.
# The earlier version stored the scaled debt in AMT_CREDIT_SUM_OVERDUE and
# scaled the debt values with CNT_CREDIT_PROLONG's mean/std, which overwrote
# two features with the wrong data.
# NOTE: this mutates the frame, so re-running the cell standardises the
# already-standardised values again.
for column in ('AMT_CREDIT_SUM_DEBT', 'CNT_CREDIT_PROLONG', 'CREDIT_DAY_OVERDUE'):
    mean = application_train_merged_df[column].mean()
    std = application_train_merged_df[column].std()
    application_train_merged_df[column] = (application_train_merged_df[column] - mean) / std

In [None]:
# Z-score standardise AMT_CREDIT_SUM_OVERDUE in place using its own mean and
# standard deviation.
overdue_values = application_train_merged_df['AMT_CREDIT_SUM_OVERDUE']
mean = overdue_values.mean()
std = overdue_values.std()
application_train_merged_df['AMT_CREDIT_SUM_OVERDUE'] = overdue_values.sub(mean).div(std)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Build the feature matrix and label vector explicitly. X and y are not
# defined in any visible cell above, so the split raised NameError on a fresh
# kernel.
# NOTE(review): the feature list is an assumption — it uses the four bureau
# aggregates engineered earlier in the notebook. Confirm the intended features.
feature_columns = [
    'AMT_CREDIT_SUM_OVERDUE',
    'AMT_CREDIT_SUM_DEBT',
    'CNT_CREDIT_PROLONG',
    'CREDIT_DAY_OVERDUE',
]
X = application_train_merged_df[feature_columns]
y = application_train_merged_df['TARGET']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Logistic regression with an L2 penalty, the lbfgs solver and an iteration
# cap of 1000, fitted on the training split.
log_reg = LogisticRegression(max_iter=1000, solver='lbfgs', penalty='l2')
log_reg.fit(X=X_train, y=y_train)


In [None]:
# Support-vector classifier using the RBF kernel and regularisation
# parameter C=1.0, fitted on the training split.
svc = SVC(C=1.0, kernel='rbf')
svc.fit(X=X_train, y=y_train)


In [None]:
# Linear discriminant analysis with the estimator's default settings, fitted
# on the training split.
lda = LinearDiscriminantAnalysis()
lda.fit(X=X_train, y=y_train)


In [None]:
# Score the held-out rows with each fitted model, in the order
# logistic regression, SVM, LDA.
y_pred_log, y_pred_svc, y_pred_lda = (
    fitted_model.predict(X_test) for fitted_model in (log_reg, svc, lda)
)


In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of a repr full of '\n' escapes.
print(classification_report(y_test, y_pred_log))

In [None]:
import numpy as np
def age_stratified_split(data, age_bracket_column, test_size=0.2, random_state=42):
    """Split ``data`` into train and test frames, sampling each bracket separately.

    For every distinct value in ``age_bracket_column`` a fraction ``test_size``
    of that bracket's rows (rounded down) is drawn at random for the test
    frame; all remaining rows form the training frame. Rows whose bracket is
    missing never compare equal to a bracket value, so they are not sampled
    and stay in the training frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Its index must be unique, because test rows are
        selected and removed by index label.
    age_bracket_column : str
        Name of the column holding the bracket each row belongs to.
    test_size : float, default 0.2
        Fraction of each bracket to place in the test frame, between 0 and 1.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for every bracket.

    Returns
    -------
    (x_train, x_test) : tuple of pandas.DataFrame
        ``x_test`` lists the sampled rows bracket by bracket; ``x_train`` is
        ``data`` with those rows removed, in its original order.

    Raises
    ------
    ValueError
        If ``test_size`` is outside [0, 1] or ``data`` has duplicate index
        labels.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")
    # Label-based .loc/.drop would pull in or remove every row sharing a
    # duplicated label, silently corrupting both halves of the split.
    if not data.index.is_unique:
        raise ValueError("data must have a unique index to be split by label")

    x_test_indices = []
    for age_bracket in data[age_bracket_column].unique():
        age_bracket_data = data[data[age_bracket_column] == age_bracket]
        # int() truncates, so small brackets may contribute no test rows.
        n_test = int(test_size * len(age_bracket_data))
        sampled = age_bracket_data.sample(n=n_test, random_state=random_state)
        x_test_indices.extend(sampled.index.tolist())

    x_test = data.loc[x_test_indices]
    x_train = data.drop(x_test_indices)

    return x_train, x_test

In [None]:
# Split the merged frame bracket by bracket on AGE_BRACKET.
# Note: this rebinds X_train / X_test, names the model-training cells also
# assign, to full frames returned by the splitter.
X_train, X_test = age_stratified_split(
    data=application_train_merged_df,
    age_bracket_column='AGE_BRACKET',
)

In [None]:
# Share of rows falling in each age bracket across the whole merged frame.
bracket_sizes = application_train_merged_df.groupby('AGE_BRACKET').size()
total = bracket_sizes.sum()
group_frequencies = bracket_sizes / total
print(group_frequencies)

In [None]:
# Share of rows falling in each age bracket within the training frame.
bracket_sizes = X_train.groupby('AGE_BRACKET').size()
total = bracket_sizes.sum()
group_frequencies = bracket_sizes / total
print(group_frequencies)

In [None]:
# Share of rows falling in each age bracket within the test frame.
bracket_sizes = X_test.groupby('AGE_BRACKET').size()
total = bracket_sizes.sum()
group_frequencies = bracket_sizes / total
print(group_frequencies)