In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load the original dataset
file_path = 'D:/Jupyter_projects/RFM_loan/simulated_lendingclub_dataset_modified.csv'
data = pd.read_csv(file_path)

# Feature Engineering
# 1. Income per Engagement: Adjusting Income based on EngagementLevel
# The mapping has to be defined in this cell. It previously survived only in
# commented-out code, so the cell raised NameError on a fresh kernel
# (Restart & Run All) and only worked when the name leaked from an older run.
engagement_map = {'Low': 1, 'Medium': 2, 'High': 3}

# .map() returns NaN for any label that is not a key of engagement_map; those
# gaps are filled by the mean imputer further down. The explicit float cast
# keeps the dtype stable whether or not NaNs are present.
data['EngagementLevel'] = data['EngagementLevel'].map(engagement_map).astype(float)

# Calculate Income_per_Engagement from the numeric engagement level
data['Income_per_Engagement'] = data['Income'] / data['EngagementLevel']

# 2. Age Group: Segmenting Age into categories
# Bins are right-inclusive: (0, 30], (30, 50], (50, 100]; ages outside that
# range become NaN.
data['Age_Group'] = pd.cut(data['Age'], bins=[0, 30, 50, 100], labels=['Young', 'Middle-Aged', 'Senior'])

# 3. High-Value Customer: Identifying customers with high income
median_income = data['Income'].median()
data['Is_High_Value_Customer'] = (data['Income'] > median_income).astype(int)

# Encoding Categorical Variables
# EngagementLevel is already numeric at this point, so it needs no second
# mapping pass here (the former .replace(engagement_map) call was a no-op).
# The same encoder object is refitted for each column, so after this section
# it only remembers the Age_Group classes.
label_enc = LabelEncoder()
data['TransactionType'] = label_enc.fit_transform(data['TransactionType'])
data['Age_Group'] = label_enc.fit_transform(data['Age_Group'])  # Encode Age Group

# Handle Missing Values by filling with mean
imputer = SimpleImputer(strategy='mean')
numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numerical_columns] = imputer.fit_transform(data[numerical_columns])

# Scale Numerical Features
# NOTE(review): numerical_columns is every float64/int64 column. If the product
# columns that later cells use as targets (Mortgage, Personal_Loan, Credit_Card)
# are numeric, they are standardised here as well, and a later `> 0` test on
# them means "above the column mean" rather than "non-zero" -- confirm that
# this is intended.
scaler = StandardScaler()
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Save the enhanced dataset
output_file_path = 'D:/Jupyter_projects/RFM_loan/simulated_lendingclub_dataset_modified_with_features_new.csv'
data.to_csv(output_file_path, index=False)

print("Enhanced dataset saved to:", output_file_path)


Enhanced dataset saved to: D:/Jupyter_projects/RFM_loan/simulated_lendingclub_dataset_modified_with_features_new.csv


In [4]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Read the feature-engineered CSV written by the preprocessing cell
file_path = 'D:/Jupyter_projects/RFM_loan/simulated_lendingclub_dataset_modified_with_features_new.csv'
data = pd.read_csv(file_path)

# Targets are the three product columns; features are everything else except the ID
y = data[['Mortgage', 'Personal_Loan', 'Credit_Card']]
X = data.drop(columns=['CustomerID', *y.columns])

# Binarise the targets: 1 where the stored value is strictly positive, else 0
y_binary = (y > 0).astype(int)

# Hold out 30% of the rows, then halve the hold-out into a test set and an
# "unseen" set (same seed for both splits)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_binary, test_size=0.3, random_state=42
)
X_test, X_unseen, y_test, y_unseen = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Model Evaluation Function
def evaluate_model(y_true, y_pred):
    """Print accuracy plus macro-averaged precision, recall and F1.

    Parameters
    ----------
    y_true, y_pred
        Ground-truth and predicted labels; both are handed unchanged to the
        scikit-learn metric functions.

    Returns
    -------
    None
        The four scores are only printed, each rounded to two decimals.

    Notes
    -----
    Per scikit-learn's documentation, ``accuracy_score`` on multi-label input
    is *subset* accuracy (every label of a row must match), so the figure
    printed for a multi-output model is not directly comparable with the
    figure for a single-target model.
    """
    macro = {"average": "macro"}
    # Compute every score before printing anything, so a failing metric
    # leaves no partial report behind.
    report = (
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("Precision", precision_score(y_true, y_pred, **macro)),
        ("Recall", recall_score(y_true, y_pred, **macro)),
        ("F1 Score", f1_score(y_true, y_pred, **macro)),
    )
    for label, value in report:
        print(f"{label}: {value:.2f}")

# Model 1: Random Forest -- one estimator fitted on all three targets at once
print("Random Forest Model Evaluation")
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
evaluate_model(y_test, y_pred_rf)

# Model 2: XGBoost -- one binary classifier per product column
# (Mortgage, Personal_Loan, Credit_Card); fitted models and their test-set
# predictions are kept in dicts keyed by target name.
xgb_models = {}
xgb_predictions = {}

for target in y.columns:
    print(f"\nXGBoost Model Evaluation for {target}")
    xgb_model = XGBClassifier(eval_metric='logloss', n_estimators=50)
    xgb_model.fit(X_train, y_train[target])

    y_pred_xgb = xgb_model.predict(X_test)
    evaluate_model(y_test[target], y_pred_xgb)

    xgb_models[target], xgb_predictions[target] = xgb_model, y_pred_xgb

# Score the per-target XGBoost models on the unseen split
print("\nEvaluation on Unseen Data for XGBoost (per target)")
for target in y.columns:
    print(f"\nXGBoost on Unseen Data for {target}:")
    fitted_xgb = xgb_models[target]
    y_pred_xgb_unseen = fitted_xgb.predict(X_unseen)
    evaluate_model(y_unseen[target], y_pred_xgb_unseen)

# Score the multi-output Random Forest on the unseen split
print("\nRandom Forest on Unseen Data:")
y_pred_rf_unseen = rf_model.predict(X_unseen)
evaluate_model(y_unseen, y_pred_rf_unseen)


Random Forest Model Evaluation
Accuracy: 0.30
Precision: 0.68
Recall: 0.90
F1 Score: 0.77

XGBoost Model Evaluation for Mortgage
Accuracy: 0.58
Precision: 0.46
Recall: 0.47
F1 Score: 0.46

XGBoost Model Evaluation for Personal_Loan
Accuracy: 0.55
Precision: 0.48
Recall: 0.48
F1 Score: 0.47

XGBoost Model Evaluation for Credit_Card
Accuracy: 0.58
Precision: 0.43
Recall: 0.44
F1 Score: 0.43

Evaluation on Unseen Data for XGBoost (per target)

XGBoost on Unseen Data for Mortgage:
Accuracy: 0.55
Precision: 0.50
Recall: 0.50
F1 Score: 0.47

XGBoost on Unseen Data for Personal_Loan:
Accuracy: 0.55
Precision: 0.47
Recall: 0.48
F1 Score: 0.47

XGBoost on Unseen Data for Credit_Card:
Accuracy: 0.57
Precision: 0.51
Recall: 0.51
F1 Score: 0.50

Random Forest on Unseen Data:
Accuracy: 0.25
Precision: 0.62
Recall: 0.93
F1 Score: 0.74


In [6]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pandas as pd

# Reload the feature-engineered dataset
file_path = 'D:/Jupyter_projects/RFM_loan/simulated_lendingclub_dataset_modified_with_features_new.csv'
data = pd.read_csv(file_path)

# The three product columns are the targets; all remaining columns except the
# customer ID are features
y = data[['Mortgage', 'Personal_Loan', 'Credit_Card']]
X = data.drop(columns=['CustomerID'] + list(y.columns))

# 1 where the stored target value is strictly positive, 0 otherwise
y_binary = (y > 0).astype(int)

# 70% train; the remaining 30% is split evenly into test and unseen sets
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_binary, test_size=0.3, random_state=42
)
X_test, X_unseen, y_test, y_unseen = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Random Forest: a single model fitted on all three target columns together
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# XGBoost: one classifier per target column, stored under the column name
xgb_models = {}
for target in y.columns:
    xgb_model = XGBClassifier(eval_metric='logloss', n_estimators=50)
    xgb_model.fit(X_train, y_train[target])
    xgb_models[target] = xgb_model

# Function to print cross-sell/up-sell recommendations for each customer
def generate_recommendations(models, model_name, multi_output=False):
    """Print, for every row of ``X_unseen``, the products predicted as 1.

    Reads the notebook-level globals ``X_unseen`` (feature rows to score),
    ``y`` (its column names are the product names) and ``data`` (looked up by
    index label to obtain ``CustomerID``).

    Parameters
    ----------
    models
        If ``multi_output`` is true, a single fitted estimator whose
        ``predict`` yields one row of per-product predictions per customer,
        in the order of ``y.columns``. Otherwise a mapping from product name
        to a fitted single-target estimator.
    model_name : str
        Label used in the printed header.
    multi_output : bool, default False
        Selects which of the two ``models`` layouts above is expected.

    Returns
    -------
    None
        Output is printed only.

    Notes
    -----
    Predictions are made in one batched ``predict`` call per estimator rather
    than one call per customer; the printed lines are the same, but the
    per-call overhead is no longer paid once per row.
    """
    print(f"\n{model_name} Recommendations on Unseen Data:")

    # Nothing to score: skip predict(), matching the old per-row loop, which
    # simply made no calls for an empty unseen set.
    if len(X_unseen) == 0:
        return

    targets = list(y.columns)
    if multi_output:  # For multi-output models like Random Forest
        # One call; each element is the prediction row for one customer.
        prediction_rows = models.predict(X_unseen)
    else:  # For single-output models like XGBoost
        # One call per target, then regroup column-wise results per customer.
        per_target = [models[target].predict(X_unseen) for target in targets]
        prediction_rows = zip(*per_target)

    for customer, row in zip(X_unseen.index, prediction_rows):
        recommendations = [target for target, pred in zip(targets, row) if pred == 1]

        # Print recommendation for this customer
        customer_id = data.loc[customer, 'CustomerID']
        if recommendations:
            print(f"Customer ID: {customer_id} - Recommend: {', '.join(recommendations)}")
        else:
            print(f"Customer ID: {customer_id} - No recommendations")

# Print recommendations from the multi-output Random Forest first, then from
# the per-target XGBoost models
for _models, _label, _is_multi in (
    (rf_model, "Random Forest", True),
    (xgb_models, "XGBoost", False),
):
    generate_recommendations(_models, _label, multi_output=_is_multi)



Random Forest Recommendations on Unseen Data:
Customer ID: CUST0558 - Recommend: Mortgage
Customer ID: CUST0799 - Recommend: Mortgage, Credit_Card
Customer ID: CUST0978 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0137 - Recommend: Mortgage, Credit_Card
Customer ID: CUST0576 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0545 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0333 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0918 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0679 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0364 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0964 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0011 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0908 - Recommend: Mortgage, Personal_Loan, Credit_Card
Customer ID: CUST0278 - Recommend: Mortgage, Personal_Loan, Credit_Card
Cus