# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


import os
from sklearn.decomposition import PCA


# Load training and test datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# EDA: Overview of the data
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
print("Train Data Overview:")
train_data.info()
print("Test Data Overview:")
test_data.info()

# Per-column count of missing values in each dataset
print("\nMissing Values in Train Data:\n", train_data.isnull().sum())
print("\nMissing Values in Test Data:\n", test_data.isnull().sum())

# EDA: Target distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='Default', data=train_data)
plt.title("Distribution of Target Variable (Default)")
plt.show()

# EDA: Correlation heatmap
# The correlation matrix is computed on numeric columns only; the frame has
# not been encoded yet at this point.
plt.figure(figsize=(12, 8))
numeric_data = train_data.select_dtypes(include=[np.number])
corr = numeric_data.corr()

sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
plt.title("Correlation Heatmap")
plt.show()


# EDA: Correlation heatmap (exclude non-numeric columns)
def plot_correlation_heatmap(data, target_col):
    """Draw a correlation heatmap of the numeric columns of ``data``.

    After the plot is shown, the correlations of every numeric column with
    ``target_col`` are printed in descending order. If ``target_col`` is not
    part of the correlation matrix, a notice is printed instead.

    Args:
        data: DataFrame to analyse; non-numeric columns are left out.
        target_col: Name of the column whose correlations are listed.
    """
    # Keep numeric dtypes only, then correlate them pairwise.
    numeric_only = data.select_dtypes(include=[np.number])
    plt.figure(figsize=(12, 8))
    corr_matrix = numeric_only.corr()

    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", cbar=True)
    plt.title("Correlation Heatmap")
    plt.show()

    # Guard clause: nothing to rank when the target is absent from the matrix.
    if target_col not in corr_matrix.columns:
        print("Target column not found in the correlation matrix.")
        return

    ranked = corr_matrix[target_col].sort_values(ascending=False)
    print(f"Correlations with {target_col}:\n", ranked)

# Preprocessing function
def preprocess_data(data, target_col=None, drop_cols=None):
    """Encode categorical columns and standardize numeric columns.

    Args:
        data: DataFrame holding the binary, categorical and numerical columns
            named in the body. The caller's frame is left untouched; all
            work happens on the new frame returned by ``DataFrame.drop``.
        target_col: Optional name of the label column. When truthy, the
            result is split into features and label.
        drop_cols: Optional column label(s) to discard before encoding.
            ``None`` (the default) drops nothing.

    Returns:
        ``(X, y)`` when ``target_col`` is given, otherwise the transformed
        DataFrame.

    Note:
        The label encoder and the scaler are fitted on whichever frame is
        passed in, so separate calls (for example one for the training set
        and one for the test set) are fitted independently of each other.
    """
    # A None sentinel replaces the former mutable ``[]`` default, which would
    # be shared between calls.
    columns_to_drop = [] if drop_cols is None else drop_cols
    data = data.drop(columns=columns_to_drop)  # Drop unnecessary columns
    le = LabelEncoder()

    # Encoding binary categorical variables.
    # Values other than 'Yes'/'No' are not in the mapping and become NaN.
    binary_cols = ['HasMortgage', 'HasDependents', 'HasCoSigner']
    for col in binary_cols:
        data[col] = data[col].map({'Yes': 1, 'No': 0})

    # Encoding multi-class categorical variables (encoder is refit per column)
    categorical_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'LoanPurpose']
    for col in categorical_cols:
        data[col] = le.fit_transform(data[col])

    # Scaling numerical features
    scaler = StandardScaler()
    numerical_cols = ['Age', 'Income', 'LoanAmount', 'CreditScore',
                      'MonthsEmployed', 'InterestRate', 'LoanTerm', 'DTIRatio', 'NumCreditLines']
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Truthiness check kept as in the original so an empty name still means
    # "no target column".
    if target_col:
        X = data.drop(columns=[target_col])
        y = data[target_col]
        return X, y
    return data

# Correlation overview on the raw training frame, then preprocess both sets
# (LoanID is dropped from the features in each case)
plot_correlation_heatmap(train_data, "Default")
X_train, y_train = preprocess_data(
    train_data,
    target_col="Default",
    drop_cols=["LoanID"],
)
X_test = preprocess_data(test_data, drop_cols=["LoanID"])

# Hold out 20% of the training rows for validation, stratified on the label
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Baseline classifiers, keyed by the display name used in reports and files
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "LightGBM": LGBMClassifier(random_state=42),
}

# Fit each baseline, store its validation accuracy and print its report
model_performance = {}
for name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val)
    model_performance[name] = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)
    print(f"{name} Classification Report:\n{report}")

# Hyperparameter tuning for Random Forest: 3-fold grid search on accuracy
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=3,
    scoring="accuracy",
)
rf_grid.fit(X_train_split, y_train_split)
best_rf = rf_grid.best_estimator_
print("Best Random Forest Params:", rf_grid.best_params_)
rf_val_pred = best_rf.predict(X_val)
model_performance["Tuned Random Forest"] = accuracy_score(y_val, rf_val_pred)

# Hyperparameter tuning for LightGBM: same search setup, different grid
lgbm_params = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[-1, 10, 20],
)
lgbm_grid = GridSearchCV(
    estimator=LGBMClassifier(random_state=42),
    param_grid=lgbm_params,
    cv=3,
    scoring="accuracy",
)
lgbm_grid.fit(X_train_split, y_train_split)
best_lgbm = lgbm_grid.best_estimator_
print("Best LightGBM Params:", lgbm_grid.best_params_)
lgbm_val_pred = best_lgbm.predict(X_val)
model_performance["Tuned LightGBM"] = accuracy_score(y_val, lgbm_val_pred)

# Final predictions on test data: baselines first, then the tuned models
test_predictions = {
    name: model.predict(X_test) for name, model in models.items()
}
test_predictions.update({
    "Tuned Random Forest": best_rf.predict(X_test),
    "Tuned LightGBM": best_lgbm.predict(X_test),
})

# Save predictions to CSV
output_dir = "./output_predictions/"
os.makedirs(output_dir, exist_ok=True)
for name, preds in test_predictions.items():
    # Model names contain spaces; file names use underscores instead
    safe_name = name.replace(' ', '_')
    file_name = f"predictions_{safe_name}.csv"
    file_path = f"{output_dir}{file_name}"

    output = pd.DataFrame({"LoanID": test_data["LoanID"], "Default": preds})
    output.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")
    # A second copy is also written to the current working directory
    output.to_csv(file_name, index=False)

# Print final model performance
print("Model Performance:", model_performance)



# Train Neural Network using Keras
def train_neural_network(X_train, y_train, X_val, y_val):
    """Build, compile and fit a small feed-forward binary classifier.

    The network stacks two ReLU hidden layers (64 and 32 units), each
    followed by 30% dropout, and ends in a single sigmoid unit. It is
    trained for 20 epochs with batch size 32, using Adam and binary
    cross-entropy, and reports accuracy on the validation data each epoch.

    Args:
        X_train: Training features; the second dimension sets the input width.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    n_features = X_train.shape[1]

    layer_stack = [
        Dense(64, input_dim=n_features, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(layer_stack)

    optimizer = Adam(learning_rate=0.001)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=20,
        batch_size=32,
        verbose=1,
    )
    return model

# Train SVM
def train_svm(X_train, y_train, X_val, y_val):
    """Fit a PCA + linear SVM pipeline and print its validation report.

    PCA and the classifier are chained in one pipeline so the projection
    learned on the training data is applied again at prediction time. The
    earlier version fitted the SVM on PCA-reduced features, then predicted
    with and returned an undefined name.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features, in the same column layout as ``X_train``.
        y_val: Validation labels.

    Returns:
        The fitted pipeline. Its ``predict`` accepts unreduced features in
        the same layout as ``X_train``.
    """
    # Imported locally because the file's import block does not provide it.
    from sklearn.pipeline import make_pipeline

    # A fractional n_components asks PCA to keep enough components to
    # explain that share of the variance.
    svm_model = make_pipeline(
        PCA(n_components=0.95),
        SVC(kernel='linear'),
    )
    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_val)
    print("SVM Classification Report:\n", classification_report(y_val, y_pred))
    return svm_model

# Train and evaluate Neural Network
# The sigmoid output is thresholded at 0.5 to obtain 0/1 class labels
nn_model = train_neural_network(X_train_split, y_train_split, X_val, y_val)
nn_val_scores = nn_model.predict(X_val)
y_val_pred_nn = (nn_val_scores > 0.5).astype(int).flatten()
nn_report = classification_report(y_val, y_val_pred_nn)
print("Neural Network Classification Report:\n", nn_report)
model_performance["Neural Network"] = accuracy_score(y_val, y_val_pred_nn)

# Train and evaluate SVM
svm_model = train_svm(X_train_split, y_train_split, X_val, y_val)
svm_val_pred = svm_model.predict(X_val)
model_performance["SVM"] = accuracy_score(y_val, svm_val_pred)

# Final predictions
nn_test_scores = nn_model.predict(X_test)
test_predictions["Neural Network"] = (nn_test_scores > 0.5).astype(int).flatten()
test_predictions["SVM"] = svm_model.predict(X_test)

# Save predictions for Neural Network and SVM
for name in ["Neural Network", "SVM"]:
    safe_name = name.replace(' ', '_')
    file_path = f"{output_dir}predictions_{safe_name}.csv"
    output = pd.DataFrame(
        {"LoanID": test_data["LoanID"], "Default": test_predictions[name]}
    )
    output.to_csv(file_path, index=False)
    print(f"Saved: {file_path}")

# Final performance summary
print("Model Performance:", model_performance)