In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- 1. Load and Explore the Data ---
print("Loading data...")
# The relative path means 'train.csv' is looked up in the current working
# directory of the process running this script/notebook.
df = pd.read_csv('train.csv')

# Quick look at the raw table: a sample of rows, the column summary produced
# by DataFrame.info(), and the number of missing entries in each column.
print("--- Initial Data Exploration ---")
print("\nFirst 5 Rows:", df.head(), sep="\n")

print("\nData Info:")
df.info()  # prints its own report; nothing to pass to print()

print("\nMissing Values Count:", df.isna().sum(), sep="\n")


# --- 2. Data Cleaning and Preprocessing ---
print("\n--- Starting Data Cleaning and Preprocessing ---")

# Drop Loan_ID as it's not useful for prediction; leaving it in would also
# make get_dummies below expand it into indicator columns.
df = df.drop(columns='Loan_ID')

# Columns whose gaps are filled with the mode (most frequent value).
mode_imputed_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Credit_History',
    'Loan_Amount_Term',
]
fill_values = {col: df[col].mode().iloc[0] for col in mode_imputed_cols}

# LoanAmount is filled with the median to avoid outlier influence.
fill_values['LoanAmount'] = df['LoanAmount'].median()

# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split done later in this file, so held-out rows influence the
# imputed values. Confirm whether that is acceptable for the evaluation.

# Fill every column in one DataFrame-level call. The previous pattern,
# `df[col].fillna(value, inplace=True)`, operates on a column selection rather
# than on `df` itself: pandas 2.x warns about it and with Copy-on-Write enabled
# the fill is not written back to `df`, leaving the NaNs in place.
df = df.fillna(value=fill_values)

print("\nMissing values after imputation (should all be zero):")
print(df.isnull().sum())

# Convert categorical columns to numerical using one-hot encoding.
# drop_first=True removes the first level of each encoded column, which helps
# avoid multicollinearity. The modeling code later in this file looks up the
# target as 'Loan_Status_Y' (presumably Loan_Status holds N/Y -- verify
# against the data).
df_processed = pd.get_dummies(df, drop_first=True)

print("\nData processed and ready for modeling.")


# --- 3. Build Prediction System ---
print("\n--- Building Prediction System ---")

# Target (y) is the 'Loan_Status_Y' indicator column; the features (X) are
# all remaining columns of the processed frame.
y = df_processed['Loan_Status_Y']
X = df_processed.drop(columns='Loan_Status_Y')

# Hold out 20% of the rows for testing. Passing stratify=y keeps the class
# proportions the same in the training and testing parts, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Random forest of 100 trees. class_weight='balanced' is used to help the
# model cope with the imbalanced target variable.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)

# Fit on the training portion only.
print("Training the model...")
model.fit(X_train, y_train)
print("Model training complete.")


# --- 4. Evaluate the Model ---
print("\n--- Model Evaluation ---")

# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Overall accuracy, shown with two decimals.
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.2f}")

# Per-class precision/recall/F1 report, using readable names for the classes.
print("\nClassification Report:")
class_labels = ['Rejected (0)', 'Approved (1)']
report_text = classification_report(y_test, y_pred, target_names=class_labels)
print(report_text)


# --- 5. Feature Importance ---
print("\n--- Top 10 Feature Importances ---")

# Pair each importance score from the fitted model with its feature name,
# order the scores from highest to lowest, and show the first ten.
importance_by_feature = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)
top_features = feature_importances.head(10)
print(top_features)