In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [15]:
# Load the training set, the unlabeled test set and the submission template
# from the current working directory, in that order.
train_data, test_data, submission_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


In [16]:
def preprocess_data(data):
    """Clean, impute, encode and scale a raw loan DataFrame.

    Steps, in order:

    1. ``Dependents``: the marker ``'3+'`` is treated as 3 and the column is
       cast to float.
    2. ``LoanAmount``, ``Loan_Amount_Term``, ``Credit_History``: missing
       values are replaced with the column median.
    3. ``Gender``, ``Married``, ``Dependents``, ``Self_Employed``: missing
       values are replaced with the column mode.
    4. ``Gender``, ``Married``, ``Education``, ``Self_Employed``,
       ``Property_Area``: label-encoded to integer codes.
    5. ``ApplicantIncome``, ``CoapplicantIncome``, ``LoanAmount``:
       standardized with ``StandardScaler``.

    NOTE: every statistic (medians, modes, label classes, mean/std) is
    computed from ``data`` itself. Two frames processed by separate calls are
    therefore imputed, encoded and scaled with their own statistics, not with
    shared ones.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        A processed copy of ``data``; the frame passed in is not modified.
    """
    # Work on a copy so the caller's raw frame stays intact and the call is
    # safe to repeat on the same input.
    data = data.copy()

    # '3+' is the only non-numeric marker handled here; rewrite it as '3' so
    # the column stays homogeneous until the float cast.
    data['Dependents'] = data['Dependents'].replace('3+', '3').astype(float)

    # Median imputation; SimpleImputer computes one median per column, so the
    # three columns can be handled in a single fit.
    median_columns = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
    data[median_columns] = SimpleImputer(strategy='median').fit_transform(
        data[median_columns])

    # Mode imputation. The filled column is assigned back instead of calling
    # fillna(inplace=True) on a column selection: that chained form acts on a
    # temporary under pandas Copy-on-Write and would leave the frame unfilled.
    for column in ['Gender', 'Married', 'Dependents', 'Self_Employed']:
        data[column] = data[column].fillna(data[column].mode()[0])

    # Label-encode the categorical columns; each column gets its own fit.
    for column in ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area']:
        data[column] = LabelEncoder().fit_transform(data[column])

    # Standardize the monetary columns.
    scaled_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
    data[scaled_columns] = StandardScaler().fit_transform(data[scaled_columns])

    return data


In [17]:
# Run the same preprocessing on each frame independently (train first, then
# test) and rebind both names to the processed results.
train_data, test_data = (
    preprocess_data(frame) for frame in (train_data, test_data)
)

In [18]:
# Feature matrix: every column except the row identifier and the target.
X = train_data.drop(['Loan_ID', 'Loan_Status'], axis=1)
# Target vector: 'Y' becomes 1 and 'N' becomes 0.
y = train_data['Loan_Status'].map({'Y': 1, 'N': 0})

In [19]:
# info() writes its summary to stdout itself and returns None, so it is called
# bare; wrapping it in print() emits a stray "None" line after each summary.
X.info()
y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    int32  
 1   Married            614 non-null    int32  
 2   Dependents         614 non-null    float64
 3   Education          614 non-null    int32  
 4   Self_Employed      614 non-null    int32  
 5   ApplicantIncome    614 non-null    float64
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    int32  
dtypes: float64(6), int32(5)
memory usage: 40.9 KB
None
<class 'pandas.core.series.Series'>
RangeIndex: 614 entries, 0 to 613
Series name: Loan_Status
Non-Null Count  Dtype
--------------  -----
614 non-null    int64
dtypes: int64(1)
memory usage: 4.9 KB
None


In [20]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [21]:
# Random forest with library defaults; only the random seed is pinned.
model = RandomForestClassifier(random_state=42)
# Fit on the training portion of the split (validation rows are held out).
model.fit(X=X_train, y=y_train)

In [22]:
# Score the fitted model on the held-out validation rows.
y_pred = model.predict(X_val)
print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
# Confusion matrix first, then the per-class report, each on its own line(s).
print(
    confusion_matrix(y_val, y_pred),
    classification_report(y_val, y_pred),
    sep='\n',
)


Accuracy: 0.7560975609756098
[[18 25]
 [ 5 75]]
              precision    recall  f1-score   support

           0       0.78      0.42      0.55        43
           1       0.75      0.94      0.83        80

    accuracy                           0.76       123
   macro avg       0.77      0.68      0.69       123
weighted avg       0.76      0.76      0.73       123



In [23]:
# The test frame has no target column, so only the row identifier is removed
# before predicting.
X_test = test_data.drop('Loan_ID', axis=1)
test_predictions = model.predict(X_test)

In [24]:
# Convert the 0/1 predictions back to the 'N'/'Y' labels and store them
# positionally in the submission template, then write the file without the
# index column.
submission_data['Loan_Status'] = (
    pd.Series(test_predictions).map({1: 'Y', 0: 'N'}).to_numpy()
)
submission_data.to_csv('submission.csv', index=False)