In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

df = pd.read_csv('data_final.csv')

# Features (X) and target (y)
X = df.drop(columns=["Monthly_Savings"])
y = df["Monthly_Savings"]



In [3]:
from sklearn.preprocessing import LabelEncoder
# Instantiate the LabelEncoder
le = LabelEncoder()

# Identify non-numeric columns
non_numeric_columns = df.select_dtypes(exclude=['number']).columns.tolist()

# Apply LabelEncoder to each non-numeric column and print the results
for column in non_numeric_columns:
    # Apply LabelEncoder
    df[column] = le.fit_transform(df[column])
    
    # Print the Label Encoding mapping for the current column
    print(f"Label Encoding for column: {column}")
    print(f"Mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}\n")

Q1 = df.select_dtypes(include=[np.number]).quantile(0.25)
Q3 = df.select_dtypes(include=[np.number]).quantile(0.75)
IQR = Q3 - Q1

# Define the outlier bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers
df_cleaned_iqr = df[~((df.select_dtypes(include=[np.number]) < lower_bound) | 
                                  (df.select_dtypes(include=[np.number]) > upper_bound)).any(axis=1)]

X = df_cleaned_iqr.drop(columns=["Monthly_Savings"])
y = df_cleaned_iqr['Monthly_Savings']


Label Encoding for column: Education_Level
Mapping: {"Bachelor's": 0, 'Doctorate': 1, 'High School': 2, "Master's": 3}

Label Encoding for column: Occupation
Mapping: {'Education': 0, 'Finance': 1, 'Healthcare': 2, 'Others': 3, 'Technology': 4}

Label Encoding for column: Location
Mapping: {'Rural': 0, 'Suburban': 1, 'Urban': 2}

Label Encoding for column: Marital_Status
Mapping: {'Divorced': 0, 'Married': 1, 'Single': 2}

Label Encoding for column: Employment_Status
Mapping: {'Full-time': 0, 'Part-time': 1, 'Self-employed': 2}

Label Encoding for column: Homeownership_Status
Mapping: {'Own': 0, 'Rent': 1}

Label Encoding for column: Type_of_Housing
Mapping: {'Apartment': 0, 'Single-family home': 1, 'Townhouse': 2}

Label Encoding for column: Gender
Mapping: {'Female': 0, 'Male': 1}

Label Encoding for column: Primary_Mode_of_Transportation
Mapping: {'Biking': 0, 'Car': 1, 'Public transit': 2, 'Walking': 3}

Label Encoding for column: Career_Level
Mapping: {'Entry-Level': 0, 'Late-Care

In [4]:
from sklearn.ensemble import RandomForestRegressor
# Step 2: Scale features (X) and target (y) using StandardScaler
scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Scale X (features)
X_scaled = scaler_X.fit_transform(X)

# Scale y (target) (reshape y for scaling)
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).ravel()  # Convert to 2D for scaling

# Step 3: Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Step 4: Define the Random Forest Regressor model
rf_model = RandomForestRegressor(max_depth = 20, min_samples_split=2, n_estimators =200,random_state=42)


from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(rf_model, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error')



# Step 5: Train the Random Forest model
rf_model.fit(X_train, y_train.ravel())  # ravel() is used to flatten y_train for fitting

# Step 6: Make predictions on the test set (scaled values)
y_pred_scaled = rf_model.predict(X_test)

# Inverse transform the predictions back to the original scale
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Inverse transform the actual values from the test set
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1))

# Step 7: Evaluate the model's performance using MSE and RMSE
mse = mean_squared_error(y_test_original, y_pred)
rmse = math.sqrt(mse)

# Step 8: Display results
print("Predicted (original scale):", y_pred)
print("Actual (original scale):", y_test_original)
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

Predicted (original scale): [[13720.48278296]
 [17301.7378773 ]
 [22191.35533518]
 ...
 [ 9341.90592504]
 [26458.99718941]
 [14964.0330178 ]]
Actual (original scale): [[12984.05366094]
 [13898.77      ]
 [20172.04670723]
 ...
 [10802.68597283]
 [19604.46394006]
 [16076.52741378]]
Mean Squared Error (MSE): 30111836.507880904
Root Mean Squared Error (RMSE): 5487.42530772683


In [5]:
import joblib

# Save the model and scalers
joblib.dump(rf_model, 'monthlySavings_model.pkl')
joblib.dump(scaler_X, 'scaler_X.pkl')
joblib.dump(scaler_y, 'scaler_y.pkl')

['scaler_y.pkl']

### How to use the Model

In [None]:
# Load the model and scalers
model = joblib.load('random_forest_model.pkl')
scaler_X = joblib.load('scaler_X.pkl')
scaler_y = joblib.load('scaler_y.pkl')

# Assuming X_test is your new test data
X_test_scaled = scaler_X.transform(X_test)

# Make predictions
y_pred_scaled = model.predict(X_test_scaled)

# Inverse scale the predictions
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Now y_pred contains the predictions in the original scale