In [25]:
# !pip install markupsafe==2.0.1
# !pip install werkzeug==2.0.3
# !pip install xgboost pandas-profiling
# !pip install tensorflow
# !pip install --upgrade markupsafe
# !pip install werkzeug==2.0.3


In [14]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import numpy as np

# Load the dataset
file_path = 'D:/Jupyter_projects/RFM_loan/simulated_lendingclub_dataset_modified.csv'
data = pd.read_csv(file_path)

# Data Preprocessing
# Encode categorical variables
label_enc = LabelEncoder()
data['TransactionType'] = label_enc.fit_transform(data['TransactionType'])
data['EngagementLevel'] = label_enc.fit_transform(data['EngagementLevel'])

# Handle missing values by filling with mean
imputer = SimpleImputer(strategy='mean')
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_columns] = imputer.fit_transform(data[numerical_columns])

# Scale numerical features
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Define target and features
X = data.drop(['CustomerID', 'Mortgage', 'Personal_Loan', 'Credit_Card'], axis=1)
y = data[['Mortgage', 'Personal_Loan', 'Credit_Card']]

# Convert target columns to binary (0 or 1)
y_binary = y.apply(lambda x: (x > 0).astype(int))

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Function to evaluate model performance
def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

# Model 1: Random Forest
print("Random Forest Model Evaluation")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
evaluate_model(y_test, y_pred_rf)

# Model 2: XGBoost
print("\nXGBoost Model Evaluation")
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
evaluate_model(y_test, y_pred_xgb)

# Model 3: Neural Network with binary cross-entropy for each product
print("\nNeural Network Model Evaluation")

# Convert each product type into its own binary classification task
y_train_nn = np.array(y_train)
y_test_nn = np.array(y_test)

# Build Neural Network
nn_model = Sequential()
nn_model.add(Input(shape=(X_train.shape[1],)))
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dense(3, activation='sigmoid'))  # Using 'sigmoid' for binary outputs per product

# Compile Neural Network with binary cross-entropy loss for each product category
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train Neural Network
nn_model.fit(X_train, y_train_nn, epochs=10, batch_size=32, verbose=1)

# Predict with Neural Network
y_pred_nn = nn_model.predict(X_test)
y_pred_nn_classes = (y_pred_nn > 0.5).astype(int)

# Evaluate Neural Network using the function defined previously
evaluate_model(y_test, y_pred_nn_classes)


Random Forest Model Evaluation
Accuracy: 0.26
Precision: 0.66
Recall: 0.94
F1 Score: 0.77

XGBoost Model Evaluation
Accuracy: 0.18
Precision: 0.65
Recall: 0.75
F1 Score: 0.70

Neural Network Model Evaluation


Parameters: { "use_label_encoder" } are not used.



Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.2472 - loss: 0.7033 
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.3996 - loss: 0.6461
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4519 - loss: 0.6414
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4453 - loss: 0.6358
Epoch 5/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4659 - loss: 0.6207
Epoch 6/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4804 - loss: 0.6356
Epoch 7/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4509 - loss: 0.6269
Epoch 8/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4454 - loss: 0.6306
Epoch 9/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [21]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import numpy as np

# Load the dataset
file_path = 'D:/Jupyter_projects/RFM_loan/simulated_lendingclub_dataset_modified.csv'
data = pd.read_csv(file_path)

# Data Preprocessing
# Encode categorical variables
label_enc = LabelEncoder()
data['TransactionType'] = label_enc.fit_transform(data['TransactionType'])
data['EngagementLevel'] = label_enc.fit_transform(data['EngagementLevel'])

# Handle missing values by filling with mean
imputer = SimpleImputer(strategy='mean')
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_columns] = imputer.fit_transform(data[numerical_columns])

# Scale numerical features
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Define target and features
X = data.drop(['CustomerID', 'Mortgage', 'Personal_Loan', 'Credit_Card'], axis=1)
y = data[['Mortgage', 'Personal_Loan', 'Credit_Card']]

# Convert target columns to binary (0 or 1)
y_binary = y.apply(lambda x: (x > 0).astype(int))

# Split data into training, validation, and unseen test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y_binary, test_size=0.3, random_state=42)
X_test, X_unseen, y_test, y_unseen = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Function to evaluate model performance
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

# Model 1: Random Forest
print("Random Forest Model Evaluation")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
evaluate_model(y_test, y_pred_rf)

# Model 2: XGBoost
print("\nXGBoost Model Evaluation")
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
evaluate_model(y_test, y_pred_xgb)

# Model 3: Neural Network with binary cross-entropy for each product
print("\nNeural Network Model Evaluation")

# Convert each product type into its own binary classification task
y_train_nn = np.array(y_train)
y_test_nn = np.array(y_test)

# Build Neural Network
nn_model = Sequential()
nn_model.add(Input(shape=(X_train.shape[1],)))
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dense(3, activation='sigmoid'))  # Using 'sigmoid' for binary outputs per product

# Compile Neural Network with binary cross-entropy loss for each product category
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train Neural Network
nn_model.fit(X_train, y_train_nn, epochs=10, batch_size=32, verbose=1)

# Predict with Neural Network on test data
y_pred_nn = nn_model.predict(X_test)
y_pred_nn_classes = (y_pred_nn > 0.5).astype(int)
evaluate_model(y_test, y_pred_nn_classes)

# Evaluation on Unseen Data
print("\nEvaluation on Unseen Data")
# Random Forest on Unseen Data
y_pred_rf_unseen = rf_model.predict(X_unseen)
print("Random Forest on Unseen Data:")
evaluate_model(y_unseen, y_pred_rf_unseen)

# XGBoost on Unseen Data
y_pred_xgb_unseen = xgb_model.predict(X_unseen)
print("XGBoost on Unseen Data:")
evaluate_model(y_unseen, y_pred_xgb_unseen)

# Neural Network on Unseen Data
y_pred_nn_unseen = nn_model.predict(X_unseen)
y_pred_nn_unseen_classes = (y_pred_nn_unseen > 0.5).astype(int)
print("Neural Network on Unseen Data:")
evaluate_model(y_unseen, y_pred_nn_unseen_classes)


Random Forest Model Evaluation
Accuracy: 0.25
Precision: 0.68
Recall: 0.89
F1 Score: 0.77

XGBoost Model Evaluation
Accuracy: 0.16
Precision: 0.67
Recall: 0.73
F1 Score: 0.70

Neural Network Model Evaluation


Parameters: { "use_label_encoder" } are not used.



Epoch 1/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.2719 - loss: 0.6656
Epoch 2/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4272 - loss: 0.6327
Epoch 3/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4601 - loss: 0.6454 
Epoch 4/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4405 - loss: 0.6273
Epoch 5/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4412 - loss: 0.6327
Epoch 6/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4890 - loss: 0.6302
Epoch 7/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4661 - loss: 0.6218
Epoch 8/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4530 - loss: 0.6188
Epoch 9/10
[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [22]:
# Predict with each model on the unseen data
# Random Forest predictions
print("Random Forest Recommendations on Unseen Data:")
rf_recommendations = rf_model.predict(X_unseen)

# XGBoost predictions
print("\nXGBoost Recommendations on Unseen Data:")
xgb_recommendations = xgb_model.predict(X_unseen)

# Neural Network predictions (using thresholding)
print("\nNeural Network Recommendations on Unseen Data:")
nn_recommendations = nn_model.predict(X_unseen)
nn_recommendations = (nn_recommendations > 0.5).astype(int)  # Convert probabilities to binary (0 or 1)

# Display recommendations for each customer in unseen data
for i, customer in enumerate(X_unseen.index):
    print(f"\nCustomer ID: {data.loc[customer, 'CustomerID']}")
    
    # Random Forest recommendation
    rf_rec = rf_recommendations[i]
    print(f"Random Forest Prediction: Mortgage={rf_rec[0]}, Personal Loan={rf_rec[1]}, Credit Card={rf_rec[2]}")
    
    # XGBoost recommendation
    xgb_rec = xgb_recommendations[i]
    print(f"XGBoost Prediction: Mortgage={xgb_rec[0]}, Personal Loan={xgb_rec[1]}, Credit Card={xgb_rec[2]}")
    
    # Neural Network recommendation
    nn_rec = nn_recommendations[i]
    print(f"Neural Network Prediction: Mortgage={nn_rec[0]}, Personal Loan={nn_rec[1]}, Credit Card={nn_rec[2]}")
    
    # Interpreted Recommendations
    rf_product_rec = [product for product, pred in zip(['Mortgage', 'Personal Loan', 'Credit Card'], rf_rec) if pred == 1]
    xgb_product_rec = [product for product, pred in zip(['Mortgage', 'Personal Loan', 'Credit Card'], xgb_rec) if pred == 1]
    nn_product_rec = [product for product, pred in zip(['Mortgage', 'Personal Loan', 'Credit Card'], nn_rec) if pred == 1]

    print(f"Recommended Products (Random Forest): {', '.join(rf_product_rec) if rf_product_rec else 'None'}")
    print(f"Recommended Products (XGBoost): {', '.join(xgb_product_rec) if xgb_product_rec else 'None'}")
    print(f"Recommended Products (Neural Network): {', '.join(nn_product_rec) if nn_product_rec else 'None'}")


Random Forest Recommendations on Unseen Data:

XGBoost Recommendations on Unseen Data:

Neural Network Recommendations on Unseen Data:
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Customer ID: CUST0558
Random Forest Prediction: Mortgage=1, Personal Loan=0, Credit Card=0
XGBoost Prediction: Mortgage=0.0, Personal Loan=1.0, Credit Card=0.0
Neural Network Prediction: Mortgage=1, Personal Loan=0, Credit Card=1
Recommended Products (Random Forest): Mortgage
Recommended Products (XGBoost): Personal Loan
Recommended Products (Neural Network): Mortgage, Credit Card

Customer ID: CUST0799
Random Forest Prediction: Mortgage=1, Personal Loan=1, Credit Card=1
XGBoost Prediction: Mortgage=1.0, Personal Loan=1.0, Credit Card=1.0
Neural Network Prediction: Mortgage=1, Personal Loan=1, Credit Card=1
Recommended Products (Random Forest): Mortgage, Personal Loan, Credit Card
Recommended Products (XGBoost): Mortgage, Personal Loan, Credit Card
Recommended Products (Neural Netw

In [23]:
# (Assuming all previous code has been run and models are trained)

# --- Predicting and Generating Recommendations for Unseen Data ---

# Reset index for unseen data
X_unseen = X_unseen.reset_index(drop=True)
y_unseen = y_unseen.reset_index(drop=True)

# If CustomerID is available, extract it for identification
if 'CustomerID' in data.columns:
    customer_ids = data.loc[X_unseen.index, 'CustomerID'].reset_index(drop=True)
else:
    customer_ids = pd.Series(range(len(X_unseen)), name='CustomerID')

# Predict using the Neural Network model on unseen data
y_pred_nn_unseen = nn_model.predict(X_unseen)
y_pred_nn_unseen_classes = (y_pred_nn_unseen > 0.5).astype(int)

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(y_pred_nn_unseen_classes, columns=['Mortgage', 'Personal_Loan', 'Credit_Card'])

# Combine CustomerID and predictions
unseen_predictions = pd.concat([customer_ids, predictions_df], axis=1)

# Map binary predictions to product recommendations
product_mapping = {
    'Mortgage': 'Mortgage',
    'Personal_Loan': 'Personal Loan',
    'Credit_Card': 'Credit Card'
}

def get_recommendations(row):
    recommended_products = []
    for product in ['Mortgage', 'Personal_Loan', 'Credit_Card']:
        if row[product] == 1:
            recommended_products.append(product_mapping[product])
    return recommended_products if recommended_products else ['No products recommended']

unseen_predictions['Recommendations'] = unseen_predictions.apply(get_recommendations, axis=1)

# Display the recommendations for each customer
print(unseen_predictions[['CustomerID', 'Recommendations']])


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
    CustomerID                         Recommendations
0     CUST0001                 [Mortgage, Credit Card]
1     CUST0002  [Mortgage, Personal Loan, Credit Card]
2     CUST0003  [Mortgage, Personal Loan, Credit Card]
3     CUST0004  [Mortgage, Personal Loan, Credit Card]
4     CUST0005  [Mortgage, Personal Loan, Credit Card]
..         ...                                     ...
145   CUST0146  [Mortgage, Personal Loan, Credit Card]
146   CUST0147  [Mortgage, Personal Loan, Credit Card]
147   CUST0148  [Mortgage, Personal Loan, Credit Card]
148   CUST0149  [Mortgage, Personal Loan, Credit Card]
149   CUST0150  [Mortgage, Personal Loan, Credit Card]

[150 rows x 2 columns]



Evaluation on Unseen Data
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
Neural Network on Unseen Data:
Accuracy: 0.26
Precision: 0.62
Recall: 1.00
F1 Score: 0.77
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
    CustomerID                         Recommendations
0     CUST0001                 [Mortgage, Credit Card]
1     CUST0002  [Mortgage, Personal Loan, Credit Card]
2     CUST0003  [Mortgage, Personal Loan, Credit Card]
3     CUST0004  [Mortgage, Personal Loan, Credit Card]
4     CUST0005  [Mortgage, Personal Loan, Credit Card]
..         ...                                     ...
145   CUST0146  [Mortgage, Personal Loan, Credit Card]
146   CUST0147  [Mortgage, Personal Loan, Credit Card]
147   CUST0148  [Mortgage, Personal Loan, Credit Card]
148   CUST0149  [Mortgage, Personal Loan, Credit Card]
149   CUST0150  [Mortgage, Personal Loan, Credit Card]

[150 rows x 2 columns]
