In [62]:
import pandas as pd

# Loading the financial dataset
file_path = r"C:\Users\irosh\Downloads\financial_data.csv"
df = pd.read_csv(file_path)

# first few rows  of the dataset
df.head()


Unnamed: 0,Age,Employment_Status,Marital_Status,Dependents,Monthly_Income,Savings,Food_Grocery_Expenses,Internet_Mobile_Expenses,Electricity_Bills,Shopping_Expenses,...,Loan_1_Amount,Loan_1_EMI,Loan_2_Amount,Loan_2_EMI,Credit_Score,Investments,Stock_Quantity,Loan_Approval_Chance,Investment_Recommendation,Portfolio_Optimization
0,50,Employed,Married,3,173240,388828,15807,3002,4000,8434,...,201150,17529,827150,0,300,,0,0,Stocks,No
1,33,Employed,Single,2,114450,259668,17698,1981,3548,4899,...,253622,5165,903891,21253,300,,0,0,Stocks,No
2,41,Employed,Single,1,92399,105064,13133,3101,2624,5610,...,604834,19930,485773,0,300,,0,0,SIP,No
3,40,Employed,Single,0,195658,420181,13274,3144,3641,5770,...,992766,27365,305376,17038,300,Stocks,450,0,Stocks,Yes
4,48,Employed,Married,2,83044,21025,16012,3894,3224,3794,...,722938,29143,628776,0,300,Stocks,0,0,,No


In [63]:
# Checking for missing values
print(df.isnull().sum())

Age                             0
Employment_Status               0
Marital_Status                  0
Dependents                      0
Monthly_Income                  0
Savings                         0
Food_Grocery_Expenses           0
Internet_Mobile_Expenses        0
Electricity_Bills               0
Shopping_Expenses               0
Personal_Expenses               0
Miscellaneous_Expenses          0
Rent                            0
Own_House                       0
Active_Loans                    0
Loan_1_Amount                   0
Loan_1_EMI                      0
Loan_2_Amount                   0
Loan_2_EMI                      0
Credit_Score                    0
Investments                  2975
Stock_Quantity                  0
Loan_Approval_Chance            0
Investment_Recommendation    2564
Portfolio_Optimization          0
dtype: int64


In [64]:
# Filling missing values in Investments column with None
df['Investments'].fillna('None', inplace=True)

# Filling missing values in Investment_Recommendation with No Recommendation
df['Investment_Recommendation'].fillna('No Recommendation', inplace=True)

# checking the missing values again
print(df.isnull().sum())


Age                          0
Employment_Status            0
Marital_Status               0
Dependents                   0
Monthly_Income               0
Savings                      0
Food_Grocery_Expenses        0
Internet_Mobile_Expenses     0
Electricity_Bills            0
Shopping_Expenses            0
Personal_Expenses            0
Miscellaneous_Expenses       0
Rent                         0
Own_House                    0
Active_Loans                 0
Loan_1_Amount                0
Loan_1_EMI                   0
Loan_2_Amount                0
Loan_2_EMI                   0
Credit_Score                 0
Investments                  0
Stock_Quantity               0
Loan_Approval_Chance         0
Investment_Recommendation    0
Portfolio_Optimization       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Investments'].fillna('None', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Investment_Recommendation'].fillna('No Recommendation', inplace=True)


In [65]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Converting categorical columns to one-hot encoding
categorical_cols = ['Employment_Status', 'Marital_Status', 'Investments']

# One-hot encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

#  numerical columns to scale
numerical_cols = ['Monthly_Income', 'Savings', 'Food_Grocery_Expenses', 'Internet_Mobile_Expenses', 
                  'Electricity_Bills', 'Shopping_Expenses', 'Personal_Expenses', 'Miscellaneous_Expenses', 
                  'Rent', 'Loan_1_Amount', 'Loan_1_EMI', 'Loan_2_Amount', 'Loan_2_EMI', 'Credit_Score']

#  scaling
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

# splitting the data into train and test sets
X = df_encoded.drop(['Loan_Approval_Chance', 'Investment_Recommendation', 'Portfolio_Optimization'], axis=1)
y_loan_approval = df_encoded['Loan_Approval_Chance']


# splitting the dataset for loan approval model
X_train_loan, X_test_loan, y_train_loan, y_test_loan = train_test_split(X, y_loan_approval, test_size=0.2, random_state=42)


# Checking the shapes to ensure correct splitting
print(X_train_loan.shape, X_test_loan.shape)



(8000, 26) (2000, 26)


In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Training the Random Forest Classifier for Loan Approval Prediction
rf_loan_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fitting the model
rf_loan_model.fit(X_train_loan, y_train_loan)

# Predictions
y_pred_loan = rf_loan_model.predict(X_test_loan)

# Evaluation
print("Loan Approval Prediction Accuracy: ", accuracy_score(y_test_loan, y_pred_loan))
print("Classification Report:\n", classification_report(y_test_loan, y_pred_loan))


Loan Approval Prediction Accuracy:  0.886
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.82      0.90      1186
           1       0.79      0.98      0.87       814

    accuracy                           0.89      2000
   macro avg       0.89      0.90      0.89      2000
weighted avg       0.90      0.89      0.89      2000



In [67]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_loan, y_train_loan)
y_pred_lr = lr_model.predict(X_test_loan)
print("Logistic Regression Accuracy:", accuracy_score(y_test_loan, y_pred_lr))
print("Classification Report for Logistic Regression:")
print(classification_report(y_test_loan, y_pred_lr))


Logistic Regression Accuracy: 0.8715
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      1186
           1       0.83      0.86      0.84       814

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000



In [68]:
from sklearn.svm import SVC
# Support Vector Classifier
svc_model = SVC(probability=True)
svc_model.fit(X_train_loan, y_train_loan)
y_pred_svc = svc_model.predict(X_test_loan)
print("SVC Accuracy:", accuracy_score(y_test_loan, y_pred_svc))
print("Classification Report for SVC:")
print(classification_report(y_test_loan, y_pred_svc))

SVC Accuracy: 0.593
Classification Report for SVC:
              precision    recall  f1-score   support

           0       0.59      1.00      0.74      1186
           1       0.00      0.00      0.00       814

    accuracy                           0.59      2000
   macro avg       0.30      0.50      0.37      2000
weighted avg       0.35      0.59      0.44      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [69]:
from sklearn.neural_network import MLPClassifier

# Deep Learning: Multilayer Perceptron (MLP)
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
mlp_model.fit(X_train_loan, y_train_loan)
y_pred_mlp = mlp_model.predict(X_test_loan)
print("MLP Accuracy:", accuracy_score(y_test_loan, y_pred_mlp))
print("Classification Report for MLP:")
print(classification_report(y_test_loan, y_pred_mlp))


MLP Accuracy: 0.879
Classification Report for MLP:
              precision    recall  f1-score   support

           0       0.94      0.85      0.89      1186
           1       0.81      0.92      0.86       814

    accuracy                           0.88      2000
   macro avg       0.87      0.89      0.88      2000
weighted avg       0.89      0.88      0.88      2000



In [70]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Hyperparameter grid for MLP
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (100, 50), (150, 100, 50)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive'],
}

# Random Forest tuning
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
rf_grid_search.fit(X_train_loan, y_train_loan)

# MLP tuning
mlp_grid_search = GridSearchCV(MLPClassifier(max_iter=500, random_state=42), mlp_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
mlp_grid_search.fit(X_train_loan, y_train_loan)

# Best parameters and accuracy for Random Forest
best_rf_params = rf_grid_search.best_params_
best_rf_accuracy = rf_grid_search.best_score_

# Best parameters and accuracy for MLP
best_mlp_params = mlp_grid_search.best_params_
best_mlp_accuracy = mlp_grid_search.best_score_

print(f"Best Parameters for Random Forest: {best_rf_params}")
print(f"Best Random Forest Accuracy: {best_rf_accuracy}")

print(f"Best Parameters for MLP: {best_mlp_params}")
print(f"Best MLP Accuracy: {best_mlp_accuracy}")


Best Parameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best Random Forest Accuracy: 0.9013750000000001
Best Parameters for MLP: {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}
Best MLP Accuracy: 0.8838750000000001


In [71]:
# Retraining Random Forest with best parameters
best_rf_model = RandomForestClassifier(
    max_depth=best_rf_params['max_depth'],
    min_samples_leaf=best_rf_params['min_samples_leaf'],
    min_samples_split=best_rf_params['min_samples_split'],
    n_estimators=best_rf_params['n_estimators'],
    random_state=42
)
best_rf_model.fit(X_train_loan, y_train_loan)

# Predictions and evaluation for Random Forest
y_pred_rf = best_rf_model.predict(X_test_loan)
rf_accuracy = accuracy_score(y_test_loan, y_pred_rf)
rf_classification_report = classification_report(y_test_loan, y_pred_rf)

print(f"Random Forest Test Accuracy: {rf_accuracy}")
print("Random Forest Classification Report:")
print(rf_classification_report)

# Retraining MLP with best parameters
best_mlp_model = MLPClassifier(
    activation=best_mlp_params['activation'],
    hidden_layer_sizes=best_mlp_params['hidden_layer_sizes'],
    learning_rate=best_mlp_params['learning_rate'],
    solver=best_mlp_params['solver'],
    max_iter=500,
    random_state=42
)
best_mlp_model.fit(X_train_loan, y_train_loan)

# Predictions and evaluation for MLP
y_pred_mlp = best_mlp_model.predict(X_test_loan)
mlp_accuracy = accuracy_score(y_test_loan, y_pred_mlp)
mlp_classification_report = classification_report(y_test_loan, y_pred_mlp)

print(f"MLP Test Accuracy: {mlp_accuracy}")
print("MLP Classification Report:")
print(mlp_classification_report)


Random Forest Test Accuracy: 0.8865
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.82      0.90      1186
           1       0.79      0.98      0.88       814

    accuracy                           0.89      2000
   macro avg       0.89      0.90      0.89      2000
weighted avg       0.91      0.89      0.89      2000

MLP Test Accuracy: 0.875
MLP Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      1186
           1       0.84      0.86      0.85       814

    accuracy                           0.88      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.88      0.88      0.88      2000



In [72]:
#import joblib

# Saving the trained Random Forest model
#joblib.dump(best_rf_model, 'random_forest_loan_approval_model.pkl')

#print("Random Forest model saved successfully!")


Random Forest model saved successfully!
