# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# --- 1. Data Loading and Exploration ---
# Read the training CSV from the current working directory.
train_data = pd.read_csv("train_ctrUa4K.csv")

# Quick structural overview: first rows, column dtypes / non-null counts,
# and the number of missing values per column.
print(train_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train_data.info()
print(train_data.isnull().sum())

# Class balance of the target column.
sns.countplot(x='Loan_Status', data=train_data)
plt.show()

# Target counts broken down by gender.
sns.countplot(x='Gender', hue='Loan_Status', data=train_data)
plt.show()

# Distribution of applicant income, 50 bins.
sns.histplot(train_data['ApplicantIncome'], bins=50)
plt.show()

# --- 2. Data Preprocessing ---
# NOTE: the imputation statistics below are computed on the full frame, i.e.
# before the train/test split that happens later in this script.

# Categorical-like columns: fill gaps with the most frequent value.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on the column selection; that chained in-place form is
# deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    train_data[col] = train_data[col].fillna(train_data[col].mode()[0])

# Numeric columns: fill gaps with the column median.
for col in ['LoanAmount', 'Loan_Amount_Term']:
    train_data[col] = train_data[col].fillna(train_data[col].median())

# Binary target: 'Y' -> 1, 'N' -> 0 (any other value would become NaN).
train_data['Loan_Status'] = train_data['Loan_Status'].map({'Y': 1, 'N': 0})
# One-hot encode the nominal columns; drop_first removes one level per column
# so the resulting indicators are not perfectly collinear.
train_data = pd.get_dummies(train_data, columns=['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area'], drop_first=True)

# Integer-code the remaining two categorical columns. The same encoder object
# is refit for each column, so afterwards it only remembers the classes of
# 'Credit_History'.
label_encoder = LabelEncoder()
train_data['Dependents'] = label_encoder.fit_transform(train_data['Dependents'])
train_data['Credit_History'] = label_encoder.fit_transform(train_data['Credit_History'])

# The row identifier is not used as a feature.
train_data.drop('Loan_ID', axis=1, inplace=True)

# --- 3. Feature Engineering ---
# Sum of applicant and co-applicant income as a single feature.
train_data['Total_Income'] = train_data['ApplicantIncome'].add(train_data['CoapplicantIncome'])

# Natural-log copies of the loan amount and the combined income, stored
# next to the raw columns as '<column>_log'.
for source_col in ('LoanAmount', 'Total_Income'):
    train_data[f'{source_col}_log'] = np.log(train_data[source_col])

# --- 4. Model Training ---
# Separate the predictors from the binary target.
X = train_data.drop('Loan_Status', axis=1)
y = train_data['Loan_Status']

# Hold out 20% of the rows for evaluation BEFORE scaling, so that no statistic
# of the held-out rows leaks into the preprocessing of the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same learned
# mean/variance to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Fixed seed so the fitted tree (and therefore the reported metrics and
# feature importances) is reproducible from run to run.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# --- 5. Model Evaluation ---
# Score the held-out split with both fitted models up front.
y_pred_lr = lr_model.predict(X_test)
y_pred_dt = dt_model.predict(X_test)

# Print the same three-part report (accuracy, per-class metrics, confusion
# matrix) for each model, logistic regression first.
for heading, predictions in (
    ("Logistic Regression Results:", y_pred_lr),
    ("\nDecision Tree Results:", y_pred_dt),
):
    print(heading)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))

# --- 6. Interpretation and Insights ---
# Both rankings are labelled with the predictor column names (everything in
# the frame except the target).
feature_names = train_data.drop('Loan_Status', axis=1).columns

# Logistic regression: rank features by the absolute value of their
# coefficient.
feature_importances = pd.DataFrame({
    'feature': feature_names,
    'importance': np.abs(lr_model.coef_[0]),
}).sort_values('importance', ascending=False)

# Decision tree: rank features by the importances reported by the fitted tree.
feature_importances_dt = pd.DataFrame({
    'feature': feature_names,
    'importance': dt_model.feature_importances_,
}).sort_values('importance', ascending=False)

# One horizontal bar chart per model, logistic regression first.
for ranking, chart_title in (
    (feature_importances, "Feature Importance in Logistic Regression Model"),
    (feature_importances_dt, "Feature Importance in Decision Tree Model"),
):
    plt.figure(figsize=(10, 6))
    plt.barh(ranking['feature'], ranking['importance'])
    plt.xlabel("Feature Importance")
    plt.ylabel("Feature")
    plt.title(chart_title)
    plt.show()