In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

df = pd.read_csv('realistic_dataset_final.csv')

# Features (X) and target (y)
X = df.drop(columns=["Monthly_Savings"])
y = df["Monthly_Savings"]



In [2]:
from sklearn.preprocessing import LabelEncoder
# Instantiate the LabelEncoder
le = LabelEncoder()

# Identify non-numeric columns
non_numeric_columns = df.select_dtypes(exclude=['number']).columns.tolist()

# Apply LabelEncoder to each non-numeric column and print the results
for column in non_numeric_columns:
    # Apply LabelEncoder
    df[column] = le.fit_transform(df[column])
    
    # Print the Label Encoding mapping for the current column
    print(f"Label Encoding for column: {column}")
    print(f"Mapping: {dict(zip(le.classes_, le.transform(le.classes_)))}\n")

Q1 = df.select_dtypes(include=[np.number]).quantile(0.25)
Q3 = df.select_dtypes(include=[np.number]).quantile(0.75)
IQR = Q3 - Q1

# Define the outlier bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers
df_cleaned_iqr = df[~((df.select_dtypes(include=[np.number]) < lower_bound) | 
                                  (df.select_dtypes(include=[np.number]) > upper_bound)).any(axis=1)]

X = df_cleaned_iqr.drop(columns=["Monthly_Savings"])
y = df_cleaned_iqr['Monthly_Savings']


Label Encoding for column: Career_Level
Mapping: {'Entry-Level': 0, 'Late-Career': 1, 'Mid-Career': 2}

Label Encoding for column: Employment_Status
Mapping: {'Employed': 0, 'Unemployed': 1}

Label Encoding for column: Financial_Status
Mapping: {'High Income': 0, 'Low Income': 1, 'Middle Income': 2}

Label Encoding for column: Location
Mapping: {'Rural': 0, 'Suburban': 1, 'Urban': 2}



In [11]:
from sklearn.ensemble import RandomForestRegressor
# Step 2: Scale features (X) and target (y) using StandardScaler
scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Scale X (features)
X_scaled = scaler_X.fit_transform(X)

# Scale y (target) (reshape y for scaling)
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).ravel()  # Convert to 2D for scaling

# Step 3: Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Step 4: Define the Random Forest Regressor model
rf_model = RandomForestRegressor(max_depth = 15, min_samples_split=2, n_estimators =100,random_state=42)


from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(rf_model, X_scaled, y_scaled, cv=5, scoring='neg_mean_squared_error')



# Step 5: Train the Random Forest model
rf_model.fit(X_train, y_train.ravel())  # ravel() is used to flatten y_train for fitting

# Step 6: Make predictions on the test set (scaled values)
print(X_train.shape)
y_pred_scaled = rf_model.predict(X_test)

# Inverse transform the predictions back to the original scale
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Inverse transform the actual values from the test set
y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1))

# Step 7: Evaluate the model's performance using MSE and RMSE
mse = mean_squared_error(y_test_original, y_pred)
rmse = math.sqrt(mse)

# Step 8: Display results
print("Predicted (original scale):", y_pred)
print("Actual (original scale):", y_test_original)
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')

(526, 23)
Predicted (original scale): [[ 785.1132    ]
 [1428.6727    ]
 [1245.98032   ]
 [1235.9631    ]
 [1154.5577    ]
 [3259.7586    ]
 [1835.437     ]
 [ 385.7217    ]
 [3014.0573    ]
 [1570.0558    ]
 [1290.0407    ]
 [ 512.0610119 ]
 [1018.531     ]
 [1090.5829    ]
 [1540.9484    ]
 [ 917.98      ]
 [ 974.2342    ]
 [1965.293     ]
 [ 923.3807    ]
 [2919.40126667]
 [ 964.2745    ]
 [ 970.6661    ]
 [ 926.926295  ]
 [1858.9168    ]
 [2385.1723    ]
 [ 523.8984    ]
 [2153.2248    ]
 [1290.7379    ]
 [1280.8752    ]
 [ 610.9098    ]
 [2655.4682    ]
 [1529.5353    ]
 [ 849.74523333]
 [ 664.1429    ]
 [3447.4739    ]
 [1969.7351    ]
 [3104.2631    ]
 [ 341.49095   ]
 [ 625.85176667]
 [1416.0916    ]
 [2804.2219    ]
 [ 570.530225  ]
 [ 394.88048214]
 [ 330.020675  ]
 [1870.3046    ]
 [1357.5061    ]
 [3323.2693    ]
 [ 544.41395   ]
 [2249.451725  ]
 [ 678.810175  ]
 [ 512.3781    ]
 [2289.1384    ]
 [3244.5753    ]
 [ 840.5761    ]
 [1063.3801    ]
 [ 537.7403    ]
 [1540.134

In [4]:
import joblib

# Save the model and scalers
joblib.dump(rf_model, 'monthlySavings_model.pkl')
joblib.dump(scaler_X, 'scaler_X.pkl')
joblib.dump(scaler_y, 'scaler_y.pkl')

['scaler_y.pkl']

### How to use the Model

In [13]:
# Load the model and scalers
model = joblib.load('monthlySavings_model.pkl')
scaler_X = joblib.load('scaler_X.pkl')
scaler_y = joblib.load('scaler_y.pkl')

# Assuming X_test is your new test data
#X_test_scaled = scaler_X.transform(X_test)

# Make predictions
y_pred_scaled = model.predict(X_test)

# Inverse scale the predictions
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))

# Now y_pred contains the predictions in the original scale

(132, 23)
