In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
df = pd.read_csv('data_final.csv')

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Detect categorical columns (those of type 'object')
categorical_columns = df.select_dtypes(include=['object']).columns

# Apply LabelEncoder to each categorical column to change all to numerical
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])



## Check for any multicollinearity for the chosen features.

In [3]:
#Based on the EDA "Income","Food","Total_Invested_Amount","Emergency_Funds","Tax_Rate","Cost_of_Living","Expected_ROI",
#"Healthcare_Cost","Debt","Savings_Rate","Desired_Expenses" have the largest correlation
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

columns = ["Income", "Tax_Rate","Cost_of_Living", "Healthcare_Cost", "Debt", "Desired_Expenses"]

# Subset your DataFrame to include only the relevant columns
X = df[columns]

# Add a constant to the model for intercept calculation
X = sm.add_constant(X)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["Variable"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Display the VIF values
print(vif_data)

# VIF > 10: High multicollinearity (consider removing or combining the variables).
# VIF between 5 and 10: Moderate multicollinearity (possible issue, but may be acceptable).
# VIF < 5: Low multicollinearity (no issues).

           Variable        VIF
0             const  15.781902
1            Income  66.397127
2          Tax_Rate   1.281060
3    Cost_of_Living   7.373007
4   Healthcare_Cost  15.633030
5              Debt  11.886220
6  Desired_Expenses  35.490483


### Will choose 
- Income (required feature)
- Debt (required feature)
- Tax_rate (low multicollinearity)
- Cost_of_Living (low multicollinearity)

## Filtering and Splitting data for training

In [4]:
# Define features (X) and target (y)
#X = df.drop(columns=['Monthly_Savings'])
X = df[["Income","Tax_Rate","Cost_of_Living","Debt"]]
y = df['Monthly_Savings']


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Splitting data and Training Model

In [5]:
# Initialize the XGBoost model
model = xgb.XGBRegressor()# Hyperparameter grid
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
     'max_depth': [3, 5, 7],
     'min_child_weight': [1, 3, 5]
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters
print("Best hyperparameters found: ", grid_search.best_params_)
# Fit the model on training data
#y_train = label_encoder.fit_transform(y_train)
model.fit(X_train,y_train )



Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.3s
[CV] END learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=200; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=3, min_

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)

## Testing model accuracy

In [6]:
from sklearn.metrics import mean_squared_error, r2_score
# Predict on the test set
y_pred = grid_search.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")

Mean Squared Error: 1520386974.6080
R-squared: 0.9937


## Saving model

In [7]:
model.save_model('Monthly_Savings_model.json')

In [8]:
#xgb version (important when loading the model)
xgb.__version__

'2.1.3'

### Method to use the model

In [9]:

# Load the saved model
loaded_model = xgb.XGBRegressor()
loaded_model.load_model('Monthly_Savings_model.json')

# Now you can use the loaded model for predictions
predictions = loaded_model.predict(X_test)  # Assuming you have test data
