In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

In [3]:
# Load the dataset
file_path = '/kaggle/input/oss-dataset/data_final.csv'
data = pd.read_csv(file_path)


In [4]:
# Data Cleaning
data['Emergency_Funds'].fillna(data['Emergency_Funds'].median(), inplace=True)
data['Monthly_Savings'].fillna(0, inplace=True)
data['Cost_of_Living'].fillna(3000, inplace=True)  # Approximate avg living cost in Singapore
data['Healthcare_Cost'].fillna(data['Healthcare_Cost'].mean(), inplace=True)
data['Desired_Expenses'].fillna(data['Cost_of_Living'] + data['Healthcare_Cost'], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Emergency_Funds'].fillna(data['Emergency_Funds'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Monthly_Savings'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obje

In [5]:
# Remove logically inconsistent rows
data = data[(data['Emergency_Funds'] > 0) & 
            (data['Cost_of_Living'] > 0) & 
            (data['Healthcare_Cost'] > 0) & 
            (data['Desired_Expenses'] >= data['Cost_of_Living'] + data['Healthcare_Cost'])]

In [6]:
# Feature Engineering
data['Total_Monthly_Expenses'] = data['Cost_of_Living'] + data['Healthcare_Cost']
data['Savings_to_Expenses_Ratio'] = data['Monthly_Savings'] / data['Total_Monthly_Expenses']


In [7]:
# Target Variable
data['Fund_Duration'] = data['Emergency_Funds'] / (data['Desired_Expenses'] * data['Total_Monthly_Expenses'])


In [8]:
# Handle Outliers
q1 = data[['Emergency_Funds', 'Healthcare_Cost', 'Cost_of_Living']].quantile(0.25)
q3 = data[['Emergency_Funds', 'Healthcare_Cost', 'Cost_of_Living']].quantile(0.75)
iqr = q3 - q1
outlier_mask = ~((data[['Emergency_Funds', 'Healthcare_Cost', 'Cost_of_Living']] < (q1 - 1.5 * iqr)) |
                 (data[['Emergency_Funds', 'Healthcare_Cost', 'Cost_of_Living']] > (q3 + 1.5 * iqr))).any(axis=1)
data = data[outlier_mask]

In [9]:
# Split Data
X = data[['Emergency_Funds', 'Monthly_Savings', 'Cost_of_Living', 'Healthcare_Cost', 'Desired_Expenses']]
y = data['Fund_Duration']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Scale Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Train Linear Regression Model
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)
linear_predictions = linear_model.predict(X_test_scaled)

# Evaluate Linear Regression
linear_rmse = np.sqrt(mean_squared_error(y_test, linear_predictions))
linear_mae = mean_absolute_error(y_test, linear_predictions)
linear_r2 = r2_score(y_test, linear_predictions)
linear_explained_variance = explained_variance_score(y_test, linear_predictions)

print("Linear Regression Metrics:")
print("  RMSE:", linear_rmse)
print("  MAE:", linear_mae)
print("  R² Score:", linear_r2)
print("  Explained Variance:", linear_explained_variance)


Linear Regression Metrics:
  RMSE: 3.709187203404586e-05
  MAE: 2.192328906522166e-05
  R² Score: 0.24169242257801782
  Explained Variance: 0.2422777595227119


In [12]:
# Train Ridge Regression Model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)
ridge_predictions = ridge_model.predict(X_test_scaled)

# Evaluate Ridge Regression
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_predictions))
ridge_mae = mean_absolute_error(y_test, ridge_predictions)
ridge_r2 = r2_score(y_test, ridge_predictions)
ridge_explained_variance = explained_variance_score(y_test, ridge_predictions)

print("Ridge Regression Metrics:")
print("  RMSE:", ridge_rmse)
print("  MAE:", ridge_mae)
print("  R² Score:", ridge_r2)
print("  Explained Variance:", ridge_explained_variance)


Ridge Regression Metrics:
  RMSE: 3.7091433257475884e-05
  MAE: 2.1923449715990406e-05
  R² Score: 0.24171036320013106
  Explained Variance: 0.24229562596717313


In [13]:
# Train Lasso Regression Model
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train_scaled, y_train)
lasso_predictions = lasso_model.predict(X_test_scaled)

# Evaluate Lasso Regression
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_predictions))
lasso_mae = mean_absolute_error(y_test, lasso_predictions)
lasso_r2 = r2_score(y_test, lasso_predictions)
lasso_explained_variance = explained_variance_score(y_test, lasso_predictions)

print("Lasso Regression Metrics:")
print("  RMSE:", lasso_rmse)
print("  MAE:", lasso_mae)
print("  R² Score:", lasso_r2)
print("  Explained Variance:", lasso_explained_variance)


Lasso Regression Metrics:
  RMSE: 4.2608854756723926e-05
  MAE: 2.689999437389409e-05
  R² Score: -0.0006624723445445291
  Explained Variance: 0.0


In [14]:
# Evaluate Models
linear_rmse = np.sqrt(mean_squared_error(y_test, linear_predictions))
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_predictions))
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_predictions))


In [15]:
# Print Results
print("Linear Regression RMSE:", linear_rmse)
print("Ridge Regression RMSE:", ridge_rmse)
print("Lasso Regression RMSE:", lasso_rmse)

Linear Regression RMSE: 3.709187203404586e-05
Ridge Regression RMSE: 3.7091433257475884e-05
Lasso Regression RMSE: 4.2608854756723926e-05


In [16]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Decision Tree Regressor
decision_tree = DecisionTreeRegressor(random_state=42)
decision_tree.fit(X_train_scaled, y_train)
dt_predictions = decision_tree.predict(X_test_scaled)

# Gradient Boosting Regressor
gradient_boosting = GradientBoostingRegressor(random_state=42)
gradient_boosting.fit(X_train_scaled, y_train)
gb_predictions = gradient_boosting.predict(X_test_scaled)

# Evaluate Decision Tree
dt_rmse = np.sqrt(mean_squared_error(y_test, dt_predictions))
dt_mae = mean_absolute_error(y_test, dt_predictions)
dt_r2 = r2_score(y_test, dt_predictions)
dt_explained_variance = explained_variance_score(y_test, dt_predictions)

print("Decision Tree Regression Metrics:")
print("  RMSE:", dt_rmse)
print("  MAE:", dt_mae)
print("  R² Score:", dt_r2)
print("  Explained Variance:", dt_explained_variance)

# Evaluate Gradient Boosting
gb_rmse = np.sqrt(mean_squared_error(y_test, gb_predictions))
gb_mae = mean_absolute_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)
gb_explained_variance = explained_variance_score(y_test, gb_predictions)

print("\nGradient Boosting Regression Metrics:")
print("  RMSE:", gb_rmse)
print("  MAE:", gb_mae)
print("  R² Score:", gb_r2)
print("  Explained Variance:", gb_explained_variance)


Decision Tree Regression Metrics:
  RMSE: 1.679829865323076e-05
  MAE: 4.8886794498333235e-06
  R² Score: 0.8444684548667141
  Explained Variance: 0.8444733481687374

Gradient Boosting Regression Metrics:
  RMSE: 1.2168250610618181e-05
  MAE: 4.240484689616917e-06
  R² Score: 0.9183898490116096
  Explained Variance: 0.9183899784083909


In [17]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Ridge and Lasso
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Grid Search for Ridge
ridge_grid = GridSearchCV(Ridge(), param_grid, cv=5, scoring='neg_mean_squared_error')
ridge_grid.fit(X_train_scaled, y_train)
best_ridge = ridge_grid.best_estimator_

# Grid Search for Lasso
lasso_grid = GridSearchCV(Lasso(), param_grid, cv=5, scoring='neg_mean_squared_error')
lasso_grid.fit(X_train_scaled, y_train)
best_lasso = lasso_grid.best_estimator_

# Print best alpha values
print("Best Ridge alpha:", ridge_grid.best_params_['alpha'])
print("Best Lasso alpha:", lasso_grid.best_params_['alpha'])

# Train models with the best alpha values
ridge_predictions = best_ridge.predict(X_test_scaled)
lasso_predictions = best_lasso.predict(X_test_scaled)

# Evaluate Ridge
ridge_rmse = np.sqrt(mean_squared_error(y_test, ridge_predictions))
ridge_mae = mean_absolute_error(y_test, ridge_predictions)
ridge_r2 = r2_score(y_test, ridge_predictions)
ridge_explained_variance = explained_variance_score(y_test, ridge_predictions)

print("\nOptimized Ridge Regression Metrics:")
print("  RMSE:", ridge_rmse)
print("  MAE:", ridge_mae)
print("  R² Score:", ridge_r2)
print("  Explained Variance:", ridge_explained_variance)

# Evaluate Lasso
lasso_rmse = np.sqrt(mean_squared_error(y_test, lasso_predictions))
lasso_mae = mean_absolute_error(y_test, lasso_predictions)
lasso_r2 = r2_score(y_test, lasso_predictions)
lasso_explained_variance = explained_variance_score(y_test, lasso_predictions)

print("\nOptimized Lasso Regression Metrics:")
print("  RMSE:", lasso_rmse)
print("  MAE:", lasso_mae)
print("  R² Score:", lasso_r2)
print("  Explained Variance:", lasso_explained_variance)


Best Ridge alpha: 10
Best Lasso alpha: 0.001

Optimized Ridge Regression Metrics:
  RMSE: 3.70875673429981e-05
  MAE: 2.1924941496099225e-05
  R² Score: 0.24186842288445987
  Explained Variance: 0.24245302334985463

Optimized Lasso Regression Metrics:
  RMSE: 4.2608854756723926e-05
  MAE: 2.689999437389409e-05
  R² Score: -0.0006624723445445291
  Explained Variance: 0.0
