In [None]:
# Attach Google Drive to the Colab runtime at a fixed mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the loan dataset that the rest of the notebook cleans, splits and trains on.
training_csv = '/content/bank_loan_IN2.csv'
df = pd.read_csv(training_csv)

In [None]:
# Convert 'CCAvg' to float64. Non-numeric characters are stripped first, but
# only when the column was read as text: a numeric column (clean CSV, or this
# cell being re-run after a successful conversion) does not support the `.str`
# accessor and the original unconditional call raised AttributeError.
# NOTE(review): the stripping removes separators entirely (a raw value such as
# "1/60" would become 160.0, not 1.60) — confirm the raw format in the CSV.
if not pd.api.types.is_numeric_dtype(df['CCAvg']):
    df['CCAvg'] = df['CCAvg'].str.replace(r'[^0-9.]', '', regex=True)
df['CCAvg'] = df['CCAvg'].astype('float64')

In [None]:
# Replace every 'Experience' entry by its magnitude so no negative years remain.
experience_magnitude = abs(df['Experience'])
df['Experience'] = experience_magnitude

In [None]:
# Separate the label column from the predictor columns.
target = 'Personal Loan'
y = df[target]
features = df.drop(target, axis='columns')
X = features

In [None]:
# 'Experience' was already converted to non-negative values before X was
# built. Repeating the conversion on `df` here was a no-op that could never
# reach X (X is a separate frame produced by `df.drop`), so verify the
# invariant on the feature matrix instead.
assert not (X['Experience'] < 0).any(), "Negative 'Experience' values remain in X"

In [None]:
# Count NaN entries per column and print them as a single-row table.
missing_per_column = df.isna().sum()
missing_table = missing_per_column.to_frame().T
print("Missing Values:\n", missing_table)

Missing Values:
    Age  Experience  Income  Family  CCAvg  Education  Mortgage  Personal Loan  \
0    0           0       0       0      0          0         0              0   

   Securities Account  CD Account  Online  CreditCard  
0                   0           0       0           0  


In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit a 100-tree random forest on the training split (seeded for repeatability).
forest_settings = {'n_estimators': 100, 'random_state': 42}
rf_model = RandomForestClassifier(**forest_settings)
rf_model.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test rows with the fitted forest.
y_pred = rf_model.predict(X_test)

In [None]:
# Score the test-set predictions: overall accuracy plus the per-class report.
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report_text)

Accuracy: 0.98
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      1156
           1       0.94      0.87      0.90       119

    accuracy                           0.98      1275
   macro avg       0.96      0.93      0.94      1275
weighted avg       0.98      0.98      0.98      1275



In [None]:
# Re-split into roughly 70% train / 15% validation / 15% test: carve off 30% of
# the rows first, then divide that held-out part in half. This rebinds
# X_train, y_train, X_test and y_test.
holdout_split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_temp, y_train, y_temp = holdout_split

val_test_split = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_val, X_test, y_val, y_test = val_test_split

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current forest on the training split.
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
mean_cv_score = cv_scores.mean()
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean CV score: {mean_cv_score}')

Cross-validation scores: [0.98655462 0.98319328 0.97310924 0.98151261 0.97647059]
Mean CV score: 0.9801680672268909


In [None]:
# Rebuild the forest with explicit tree-growth limits, then compare accuracy on
# the training rows against accuracy on the current test rows.
constrained_settings = dict(
    n_estimators=100,
    max_depth=10,          # cap on the depth of each tree
    min_samples_split=10,  # a node needs at least this many samples to be split
    min_samples_leaf=5,    # every leaf keeps at least this many samples
    random_state=42,
)
rf_model = RandomForestClassifier(**constrained_settings)
rf_model.fit(X_train, y_train)

# Predictions are regenerated here so they match the current X_test.
y_pred = rf_model.predict(X_test)
train_pred = rf_model.predict(X_train)

train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")
print(f"Testing Accuracy: {test_accuracy:.2f}")

Training Accuracy: 0.98
Testing Accuracy: 0.98


In [None]:
# Rank the predictor columns by the forest's `feature_importances_`, highest first.
importances = rf_model.feature_importances_
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)
print(feature_importance)


               Feature  Importance
2               Income    0.506304
5            Education    0.201443
3               Family    0.098849
8           CD Account    0.072778
6             Mortgage    0.044701
0                  Age    0.021766
1           Experience    0.020636
4                CCAvg    0.020262
10          CreditCard    0.007113
9               Online    0.003824
7   Securities Account    0.002325


In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over 3 * 3 * 3 * 3 = 81 hyper-parameter combinations, each
# scored by 3-fold cross-validated accuracy on the training split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,  # the 81 * 3 = 243 fits are independent; run them on all cores
)
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
# NOTE: this search does not modify `rf_model`. The refitted winner is
# available as `grid_search.best_estimator_` if it should be used downstream.


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


In [None]:
# Load the unseen data and apply the same cleaning steps as the training data.
unseen_data = pd.read_csv('/content/Testing2.csv')

# Strip non-numeric characters from 'CCAvg' only when it was read as text; a
# column that is already numeric does not support the `.str` accessor and the
# unconditional call would raise AttributeError.
if not pd.api.types.is_numeric_dtype(unseen_data['CCAvg']):
    unseen_data['CCAvg'] = unseen_data['CCAvg'].str.replace(r'[^0-9.]', '', regex=True)
unseen_data['CCAvg'] = unseen_data['CCAvg'].astype('float64')
unseen_data['Experience'] = unseen_data['Experience'].abs()

# Select exactly the training feature columns, in the training order
# (`features.columns` comes from the training step).
X_unseen = unseen_data[features.columns]


In [None]:
# Score the unseen rows with the current forest and store the predicted labels
# next to the inputs.
prediction_column = 'Predicted Loan Approval'
unseen_predictions = rf_model.predict(X_unseen)
unseen_data[prediction_column] = unseen_predictions


In [None]:
# Compare the predictions with the 'Personal Loan' labels carried by the unseen
# file (this assumes that column is present there).
y_unseen_true = unseen_data['Personal Loan']
accuracy_unseen = accuracy_score(y_unseen_true, unseen_predictions)
unseen_report = classification_report(y_unseen_true, unseen_predictions)
print(f"Accuracy on Unseen Data: {accuracy_unseen:.2f}")
print("Classification Report:\n", unseen_report)

Accuracy on Unseen Data: 0.99
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       690
           1       0.96      0.85      0.90        60

    accuracy                           0.99       750
   macro avg       0.97      0.92      0.95       750
weighted avg       0.99      0.99      0.98       750



In [None]:
# Save the unseen data together with its predicted labels as a CSV under the
# mounted Google Drive (this writes the predictions, not the model).
unseen_data.to_csv('/content/drive/MyDrive/coop_ml/unseen_predictions22.csv', index=False)

In [None]:
import joblib

# Persist the fitted forest under the mounted Google Drive for later reuse.
model_path = '/content/drive/MyDrive/COOP_random_forest_model22.pkl'
joblib.dump(rf_model, model_path)
print("Model saved to Google Drive!")


Model saved to Google Drive!


In [None]:
import pandas as pd
import joblib

# Load the trained model from the path it was saved to earlier in this notebook.
# NOTE: joblib files are pickle-based — only load files from a trusted source.
rf_model = joblib.load('/content/drive/MyDrive/COOP_random_forest_model22.pkl')

def _prompt_value(prompt, cast, is_valid=None):
    """Ask `prompt` repeatedly until the reply is acceptable.

    Args:
        prompt: Text shown to the user.
        cast: Callable converting the raw reply; a ValueError raised by it is
            treated as an invalid reply.
        is_valid: Optional predicate applied to the converted value.

    Returns:
        The first converted value that passes `is_valid`.
    """
    while True:
        try:
            value = cast(input(prompt))
        except ValueError:
            print("Invalid input, please try again.")
            continue
        if is_valid is None or is_valid(value):
            return value
        print("Invalid input, please try again.")


def predict_loan(model=None):
    """Collect applicant details interactively and print the model's verdict.

    Each answer is validated and re-requested until it is usable, instead of
    crashing with a ValueError or passing out-of-range codes to the model.

    Args:
        model: Fitted classifier exposing `predict`. Defaults to the
            module-level `rf_model`.

    Returns:
        None. The outcome is printed.
    """
    if model is None:
        model = rf_model

    def non_negative(value):
        # Also rejects NaN and infinity, which `float()` happily parses.
        return 0 <= value < float('inf')

    def yes_no(value):
        return value in (0, 1)

    # NOTE(review): the prompts ask for "monthly income" and "spending per
    # month" — confirm these units match the 'Income' and 'CCAvg' columns the
    # model was trained on.
    age = _prompt_value("Enter your age: ", int, non_negative)
    experience = _prompt_value("Enter your work experience in years: ", int, non_negative)
    income = _prompt_value("Enter your monthly income: ", float, non_negative)
    family = _prompt_value("Enter the number of family members: ", int, non_negative)
    cavg = _prompt_value("Enter your average credit card spending per month: ", float, non_negative)
    education = _prompt_value(
        "Enter education level (1: Undergrad, 2: Graduate, 3: Advanced/Professional): ",
        int,
        lambda level: level in (1, 2, 3),
    )
    mortgage = _prompt_value("Enter your mortgage value (0 if none): ", float, non_negative)
    securities_account = _prompt_value("Do you have a securities account? (1: Yes, 0: No): ", int, yes_no)
    cd_account = _prompt_value(
        "Do you have a certificate of deposit (CD) account? (1: Yes, 0: No): ", int, yes_no
    )
    online = _prompt_value("Do you use online banking? (1: Yes, 0: No): ", int, yes_no)
    credit_card = _prompt_value("Do you have a credit card? (1: Yes, 0: No): ", int, yes_no)

    # Single-row frame whose columns follow the training feature order.
    input_data = pd.DataFrame({
        'Age': [age],
        'Experience': [experience],
        'Income': [income],
        'Family': [family],
        'CCAvg': [cavg],
        'Education': [education],
        'Mortgage': [mortgage],
        'Securities Account': [securities_account],
        'CD Account': [cd_account],
        'Online': [online],
        'CreditCard': [credit_card]
    })

    # When the model recorded its training column names, align to them so the
    # column order is enforced rather than assumed.
    expected_order = getattr(model, 'feature_names_in_', None)
    if expected_order is not None:
        input_data = input_data[list(expected_order)]

    # Predict using the model
    prediction = model.predict(input_data)[0]

    # Print the result
    if prediction == 1:
        print("Congratulations! You are eligible for a loan.")
    else:
        print("Unfortunately, you are not eligible for a loan at this time.")

# Run the interactive loan-eligibility prompt once; the result is printed.
predict_loan()

Enter your age: 42
Enter your work experience in years: 16
Enter your monthly income: 1000
Enter the number of family members: 2
Enter your average credit card spending per month: 4700
Enter education level (1: Undergrad, 2: Graduate, 3: Advanced/Professional): 1
Enter your mortgage value (0 if none): 3
Do you have a securities account? (1: Yes, 0: No): 1
Do you have a certificate of deposit (CD) account? (1: Yes, 0: No): 1
Do you use online banking? (1: Yes, 0: No): 0
Do you have a credit card? (1: Yes, 0: No): 1
Unfortunately, you are not eligible for a loan at this time.
