In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load dataset
# Raw strings keep the Windows backslashes literal. In a normal string,
# sequences such as "\S", "\G" and "\V" are invalid escapes that newer Python
# versions warn about; the resulting path text is unchanged.
train_data = pd.read_excel(r"D:\Stream\GPT 4 code\VC_Train file.xlsx")  # replace with your actual train file path
test_data = pd.read_excel(r"D:\Stream\GPT 4 code\VC_Test file.xlsx")  # replace with your actual test file path

# EDA
print(train_data.head())
print(train_data.describe())
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
train_data.info()

# Preprocessing
def preprocess_data(df):
    """Impute missing values and label-encode the non-numeric columns of ``df``.

    Steps:
      1. If a ``Dependents`` column exists, its ``'3+'`` entries become 3 and
         the column is converted to a numeric dtype.
      2. Numeric columns: missing values are filled with the column mean.
      3. Non-numeric columns: missing values are filled with the column's
         most frequent value, then every column is integer-encoded with a
         ``LabelEncoder``.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Input ``pandas.DataFrame``.

    Returns:
        A new DataFrame holding the numeric columns first, followed by the
        encoded non-numeric columns.

    Raises:
        ValueError: If ``Dependents`` holds text other than ``'3+'`` that
            cannot be parsed as a number.
    """
    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    if 'Dependents' in df.columns:
        # Assign the result back instead of calling replace(..., inplace=True)
        # on the selected column: the chained in-place form may act on a
        # temporary object and leave the frame unchanged.
        dependents = df['Dependents'].replace('3+', '3')
        # Convert explicitly so the column's dtype does not depend on any
        # implicit downcasting performed by replace().
        df['Dependents'] = pd.to_numeric(dependents)

    # Numeric columns: fill gaps with the per-column mean.
    numeric_cols = df.select_dtypes(include=[np.number])
    numeric_cols = numeric_cols.fillna(numeric_cols.mean())

    # Non-numeric columns: fill gaps with the mode (most frequent value).
    non_numeric_cols = df.select_dtypes(exclude=[np.number])
    if non_numeric_cols.shape[1]:
        modes = non_numeric_cols.mode()
        # mode() yields no rows when there are no non-missing values, in
        # which case there is nothing to fill with.
        if not modes.empty:
            non_numeric_cols = non_numeric_cols.fillna(modes.iloc[0])

        # Apply Label Encoding column by column; fit_transform refits the
        # encoder on each column it is given.
        non_numeric_cols = non_numeric_cols.apply(LabelEncoder().fit_transform)

    # Combine numeric and non-numeric dataframes (numeric columns first).
    return pd.concat([numeric_cols, non_numeric_cols], axis=1)


# Preprocess both train and test data
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Separate features and target from train data
# (columns= already selects the axis, so no separate axis argument is needed)
X_train = train_data.drop(columns=["Loan_Status", "Loan_ID"])
y_train = train_data["Loan_Status"]

# Modelling and fine tuning
parameters = {'n_estimators': [50, 100, 200], 'max_depth': [2, 5, 10]}
# A fixed seed makes the grid search and the resulting predictions repeatable
# from run to run.
rf = RandomForestClassifier(random_state=42)
clf = GridSearchCV(rf, parameters)
clf.fit(X_train, y_train)

# Get the best model
best_model = clf.best_estimator_
print(best_model)

# Predict for test data
test_data = test_data.drop(columns="Loan_ID")  # remove "Loan_ID" as it's not a feature for prediction
# preprocess_data orders columns by dtype group, so the test frame can come
# back in a different column order than the training frame; select the
# training columns explicitly so the model sees the layout it was fitted on.
test_data = test_data[X_train.columns]
predictions = best_model.predict(test_data)

# Save the predictions into a CSV file
# NOTE(review): preprocess_data label-encodes every non-numeric column,
# including Loan_Status, so these predictions are integer codes rather than
# the original text labels - confirm which format the submission expects.
# Raw string: keeps the backslashes literal without invalid-escape warnings.
submission = pd.read_excel(r"D:\Stream\GPT 4 code\VC_Sample submission.xlsx")  # replace with your actual sample submission file path
submission['Loan_Status'] = predictions
submission.to_csv("predictions.csv", index=False)


    Loan_ID Gender Married Dependents     Education Self_Employed   
0  LP001002   Male      No          0      Graduate            No  \
1  LP001003   Male     Yes          1      Graduate            No   
2  LP001005   Male     Yes          0      Graduate           Yes   
3  LP001006   Male     Yes          0  Not Graduate            No   
4  LP001008   Male      No          0      Graduate            No   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term   
0             5849                0.0         NaN             360.0  \
1             4583             1508.0       128.0             360.0   
2             3000                0.0        66.0             360.0   
3             2583             2358.0       120.0             360.0   
4             6000                0.0       141.0             360.0   

   Credit_History Property_Area Loan_Status  
0             1.0         Urban           Y  
1             1.0         Rural           N  
2             1.0   