In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

In [None]:
# Load the training and reserved (hold-out) tables; the 'ID' column is
# dropped from both straight after reading.
database = pd.read_csv('Bank_Personal_Loan_Modelling_train.csv').drop(columns=['ID'])
database_test = pd.read_csv('Bank_Personal_Loan_Modelling_reserved.csv').drop(columns=['ID'])

In [None]:
# Pairwise correlation of every column in the training frame.
corr_matrix = database.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

# For each column, average the absolute correlation with all columns
# (the column itself included) and pick the one with the smallest mean.
mean_abs_corr = corr_matrix.abs().mean(axis=1)
least_correlated_feature = mean_abs_corr.idxmin()
print(least_correlated_feature)

# Remove that column from both the training and the reserved frame.
database = database.drop(columns=[least_correlated_feature])
database_test = database_test.drop(columns=[least_correlated_feature])

In [None]:
# Shift 'Experience' so that the training minimum becomes zero.
# The offset is taken from the training frame only and reused for the
# reserved frame: shifting each frame by its own minimum would map the same
# raw value to different feature values in the two frames whenever their
# minima differ. As a consequence the reserved frame may keep values below
# zero if it contains an Experience smaller than the training minimum.
experience_offset = database['Experience'].min()
database['Experience'] -= experience_offset
database_test['Experience'] -= experience_offset
print(database['Experience'].mean())

# Scale 'CCAvg' by 12 in both frames
# (presumably monthly -> yearly spending; confirm against the data dictionary).
# NOTE: this multiplication is applied in place, so re-running the cell
# without reloading the data multiplies the column again.
database['CCAvg'] *= 12
database_test['CCAvg'] *= 12
print(database['CCAvg'].mean())

In [None]:
# One box per column of the training frame, drawn on a single wide figure.
fig, ax = plt.subplots(figsize=(16, 10))
sns.boxplot(data=database, ax=ax)
plt.show()

In [None]:
# Distribution of 'Mortgage' in the training frame: 30-bin histogram with a KDE overlay.
sns.histplot(data=database, x='Mortgage', bins=30, kde=True)
plt.show()

In [None]:
# Z-score of 'Mortgage' for every training row.
# scipy.stats.zscore is only documented to return an array-like; when it
# returns a plain ndarray there is no `.abs()` method. Wrapping the result in
# a Series aligned to the frame's index keeps the code below working
# regardless of what zscore hands back.
zscores = pd.Series(
    zscore(database['Mortgage'].to_numpy()),
    index=database.index,
)
abs_zscores = zscores.abs()

# Rows whose |z| exceeds 3 are counted as outliers.
num_outliers = (abs_zscores > 3).sum()
print(num_outliers)

# Keep only the rows with |z| <= 3 and report how many remain.
data_cleaned = database[abs_zscores <= 3]
print(data_cleaned.shape[0])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

In [None]:
# Target is the 'Personal Loan' column; every other column is a predictor.
y = data_cleaned['Personal Loan']
X = data_cleaned.drop(columns=['Personal Loan'])

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=11,
)

# Decision tree with the Gini criterion, fitted on the training split.
model = DecisionTreeClassifier(criterion='gini', random_state=11).fit(X_train, y_train)
y_pred = model.predict(X_test)

# F1 score on the held-out split.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f1)

In [None]:
# Decision tree using the entropy criterion, fitted on the training split.
tree_params = {'criterion': 'entropy', 'random_state': 11}
model = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f1)

In [None]:
# Gini tree with class_weight='balanced', fitted on the training split.
model = DecisionTreeClassifier(
    criterion='gini',
    class_weight='balanced',
    random_state=11,
)
model.fit(X_train, y_train)

# Score the predictions for the held-out split.
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print(f1)

In [None]:
# Entropy tree with class_weight='balanced', fitted on the training split.
tree_params = {
    'criterion': 'entropy',
    'class_weight': 'balanced',
    'random_state': 11,
}
model = DecisionTreeClassifier(**tree_params).fit(X_train, y_train)

# Score the predictions for the held-out split.
y_pred = model.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f1)

In [None]:
# Number of training rows per target class.
class_counts = y_train.value_counts()
print(class_counts)

# Bar chart of the per-class counts.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=class_counts.index, y=class_counts.values, ax=ax)
plt.show()

# NOTE(review): value_counts() is indexed by the class labels, so with integer
# labels `class_counts[0]` is the count of label 0. The variable name suggests
# class 1 was intended; confirm which share is wanted.
class_1_ratio = class_counts[0] / y_train.shape[0]
print(class_1_ratio)

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample the training split with SMOTE (fixed seed).
# Only X_train / y_train are resampled; the held-out split is left untouched.
smote = SMOTE(random_state=11)
resampled = smote.fit_resample(X_train, y_train)
X_train_b, y_train_b = resampled

# Per-class row counts after resampling.
class_counts_b = y_train_b.value_counts()
print(class_counts_b)

In [None]:
# Gini tree fitted on the SMOTE-resampled training data.
model = DecisionTreeClassifier(criterion='gini', random_state=11)
model.fit(X_train_b, y_train_b)

# Evaluation still uses the original (non-resampled) held-out split.
y_pred = model.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f1)

In [None]:
# Entropy tree fitted on the SMOTE-resampled training data.
tree_params = {'criterion': 'entropy', 'random_state': 11}
model = DecisionTreeClassifier(**tree_params).fit(X_train_b, y_train_b)

# Evaluation still uses the original (non-resampled) held-out split.
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print(f1)

In [None]:
# Predict labels for the reserved frame with whichever `model` was fitted
# most recently (this cell relies on the preceding training cell having run).
y_pred = model.predict(database_test)

print(len(y_pred))

# Build the comma-separated list explicitly. str(ndarray) wraps long arrays
# over several lines and abbreviates arrays above NumPy's print threshold
# with '...', so replacing spaces in that repr can yield a broken or
# truncated list of labels.
prints = '[' + ','.join(str(label) for label in y_pred.tolist()) + ']'
print(prints)