Cell 1: Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


Cell 2: Load dataset

In [None]:
# Read the raw loan dataset; edit the relative path below if the CSV lives elsewhere.
raw_csv_path = "../data/loan_data.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows to sanity-check the load.
df.head()


Cell 3: Basic info

In [None]:
# Column dtypes and non-null counts (info() prints its report directly).
df.info()

# Summary statistics. Printed explicitly because a notebook cell only
# auto-displays its final expression, so a bare df.describe() in the middle
# of the cell would be computed and then silently discarded.
print(df.describe())

# Missing-value count per column; kept as the last expression so the
# notebook renders it as the cell's output.
df.isnull().sum()


Cell 4: Data Cleaning

In [None]:
# Fill missing loan amounts with the column median.
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

# Encode the target as 1 (approved, 'Y') / 0 (rejected, 'N').
# The guard makes the cell safe to re-run: Series.map turns every value that
# is not a key of the mapping into NaN, so mapping an already-encoded 1/0
# column a second time would blank the whole target and the dropna() below
# would then delete every row (and overwrite the cleaned CSV with nothing).
status_codes = {'Y': 1, 'N': 0}
if df['Loan_Status'].isin(list(status_codes)).any():
    df['Loan_Status'] = df['Loan_Status'].map(status_codes)

# Drop rows that still contain a missing value in any column.
df = df.dropna()

# Persist the cleaned dataset next to the raw file.
df.to_csv("../data/loan_data_cleaned.csv", index=False)
print("✅ Cleaned data saved")


Cell 5: Exploratory Data Analysis (EDA)

In [None]:
# Bar chart: number of rows per Loan_Status value.
status_ax = sns.countplot(data=df, x="Loan_Status")
status_ax.set_title("Loan Status Distribution")
plt.show()

# Scatter plot: applicant income against loan amount, coloured by loan status.
income_ax = sns.scatterplot(
    data=df,
    x="ApplicantIncome",
    y="LoanAmount",
    hue="Loan_Status",
)
income_ax.set_title("Income vs Loan Amount by Status")
plt.show()


Cell 6: Model Training

In [None]:
# Predictors are the three monetary columns; the target is Loan_Status.
feature_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
X = df[feature_columns]
y = df['Loan_Status']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows and report plain accuracy.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("✅ Accuracy:", test_accuracy)
