In [1]:
import pandas as pd
from google.colab import drive
# Mount Google Drive so the CSV files stored there are readable from this Colab runtime.
drive.mount('/content/drive')

# Folder on the mounted drive that holds both CSV files.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/CSI'

# Load the datasets
train_data = pd.read_csv(f'{DATA_DIR}/TrainingDataset.csv')
test_data = pd.read_csv(f'{DATA_DIR}/TestDataset.csv')

# Display the first few rows of the datasets
print("Train Data:")
print(train_data.head())
print("\nTest Data:")
print(test_data.head())

# Check for missing values and data types in both datasets.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nTrain Data Info:")
train_data.info()
print("\nTest Data Info:")
test_data.info()


Mounted at /content/drive
Train Data:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No   
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0   
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Ru

In [2]:
# Impute missing values.
# Every fill value is computed from the TRAINING data only and then reused for
# the test data, so both frames are preprocessed with identical values and no
# test-set statistics enter the pipeline.
fill_values = {
    'LoanAmount': train_data['LoanAmount'].median(),
    'Loan_Amount_Term': train_data['Loan_Amount_Term'].mode()[0],
    'Credit_History': train_data['Credit_History'].mode()[0],
}

# Categorical (object) columns are filled with their most frequent training value.
for column in train_data.select_dtypes(include=[object]).columns:
    fill_values[column] = train_data[column].mode()[0]

# Fill missing values in the training data
for column, value in fill_values.items():
    train_data[column] = train_data[column].fillna(value)

# Fill missing values in the test data with the same training-derived values.
# Columns absent from the test frame (e.g. the target) are skipped, and the
# 'Loan_ID' identifier is never overwritten with an ID taken from the train set.
for column, value in fill_values.items():
    if column in test_data.columns and column != 'Loan_ID':
        test_data[column] = test_data[column].fillna(value)


In [3]:
import numpy as np

# Derive the same engineered columns on both frames:
#   Total_Income     - applicant income plus co-applicant income
#   Log_LoanAmount   - log(1 + LoanAmount), to reduce skew
#   Log_Total_Income - log(1 + Total_Income), to reduce skew
for frame in (train_data, test_data):
    frame['Total_Income'] = frame['ApplicantIncome'] + frame['CoapplicantIncome']
    frame['Log_LoanAmount'] = np.log1p(frame['LoanAmount'])
    frame['Log_Total_Income'] = np.log1p(frame['Total_Income'])


In [4]:
from sklearn.model_selection import train_test_split

# Binary target: 'Y' -> 1, 'N' -> 0.
y = train_data['Loan_Status'].map({'Y': 1, 'N': 0})

# Feature matrices: drop the identifier (and, for train, the target), then
# one-hot encode the categorical columns.
X = pd.get_dummies(train_data.drop(columns=['Loan_ID', 'Loan_Status']))
X_test = pd.get_dummies(test_data.drop(columns=['Loan_ID']))

# Give the test matrix exactly the training columns: join='left' keeps the
# training column set, and any column the test frame lacks is filled with 0.
X, X_test = X.align(X_test, join='left', axis=1, fill_value=0)

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate classifiers; the tree ensembles use a fixed seed so runs are repeatable.

# Linear baseline, with a raised iteration cap for the solver.
log_reg = LogisticRegression(
    max_iter=1000,
)

# Random forest of 100 trees.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Gradient boosting with 100 stages and a learning rate of 0.1.
gb_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)


In [6]:
# Fit every candidate on the same training split, in the order:
# Logistic Regression, Random Forest, Gradient Boosting.
for estimator in (log_reg, rf_clf, gb_clf):
    estimator.fit(X_train, y_train)


In [7]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Score each fitted model on the validation split.
# Accuracy uses the hard class predictions; ROC AUC uses the predicted
# probability of the positive class (column 1 of predict_proba).

# Logistic Regression
log_reg_pred = log_reg.predict(X_val)
log_reg_proba = log_reg.predict_proba(X_val)[:, 1]
log_reg_acc = accuracy_score(y_val, log_reg_pred)
log_reg_auc = roc_auc_score(y_val, log_reg_proba)

# Random Forest
rf_clf_pred = rf_clf.predict(X_val)
rf_clf_proba = rf_clf.predict_proba(X_val)[:, 1]
rf_clf_acc = accuracy_score(y_val, rf_clf_pred)
rf_clf_auc = roc_auc_score(y_val, rf_clf_proba)

# Gradient Boosting
gb_clf_pred = gb_clf.predict(X_val)
gb_clf_proba = gb_clf.predict_proba(X_val)[:, 1]
gb_clf_acc = accuracy_score(y_val, gb_clf_pred)
gb_clf_auc = roc_auc_score(y_val, gb_clf_proba)

# Report the validation metrics, one line per model.
print(f"Logistic Regression Accuracy: {log_reg_acc}, AUC: {log_reg_auc}")
print(f"Random Forest Accuracy: {rf_clf_acc}, AUC: {rf_clf_auc}")
print(f"Gradient Boosting Accuracy: {gb_clf_acc}, AUC: {gb_clf_auc}")


Logistic Regression Accuracy: 0.7560975609756098, AUC: 0.7465116279069768
Random Forest Accuracy: 0.7723577235772358, AUC: 0.7462209302325581
Gradient Boosting Accuracy: 0.7642276422764228, AUC: 0.7447674418604651


In [8]:
# Choose the best model from the validation scores computed above instead of
# hard-coding a winner. The submission consists of hard Y/N labels, so accuracy
# is the primary criterion and AUC breaks ties; on an exact tie the earlier
# entry in the list wins.
candidates = [
    ('Logistic Regression', log_reg, log_reg_acc, log_reg_auc),
    ('Random Forest', rf_clf, rf_clf_acc, rf_clf_auc),
    ('Gradient Boosting', gb_clf, gb_clf_acc, gb_clf_auc),
]
best_name, best_model, best_acc, best_auc = max(
    candidates,
    key=lambda candidate: (candidate[2], candidate[3]),
)
print(f"Selected model: {best_name} (validation accuracy {best_acc:.4f}, AUC {best_auc:.4f})")

# Make predictions on the test set
test_predictions = best_model.predict(X_test)


In [9]:
# Translate the numeric predictions back to the original labels: 1 -> 'Y', anything else -> 'N'.
loan_status_labels = np.where(test_predictions == 1, 'Y', 'N')

# Pair every test Loan_ID with its predicted label.
submission = pd.DataFrame(
    {
        'Loan_ID': test_data['Loan_ID'],
        'Loan_Status': loan_status_labels,
    }
)

# Write the result to CSV without the index column, then confirm.
submission.to_csv('submission.csv', index=False)
print("Submission file created successfully!")


Submission file created successfully!
