# In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Define the Gradient Boosting Regressor class
class GradientBoostingRegressorScratch:
    """Gradient boosting regressor assembled from shallow regression trees.

    The ensemble starts from the mean of the training target and then adds
    ``n_estimators`` trees one at a time. Each tree is fitted to the residuals
    (target minus current prediction) of the ensemble built so far, and its
    output is scaled by ``learning_rate`` before being added.

    Parameters:
        n_estimators: Number of boosting rounds, i.e. trees to fit.
        learning_rate: Factor applied to every tree's contribution.
        max_depth: ``max_depth`` passed to each ``DecisionTreeRegressor``.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        # Fitted trees in boosting order; rebuilt on every call to fit().
        self.trees = []
        # Mean of the training target; None until fit() has been called.
        self.initial_prediction = None

    def fit(self, X, y):
        """Fit the ensemble on ``X`` / ``y`` and return ``self``.

        Any trees left over from a previous call are discarded first, so
        refitting never mixes old and new estimators.

        Parameters:
            X: Feature matrix, passed unchanged to each tree.
            y: Target values (list, ndarray or pandas Series); converted to
                a float ndarray so arithmetic is positional.

        Returns:
            The fitted estimator, to allow ``Model(...).fit(X, y)`` chaining.
        """
        targets = np.asarray(y, dtype=float)

        # Start from a clean slate: without this a second fit() would keep
        # appending to the trees of the first one.
        self.trees = []
        self.initial_prediction = np.mean(targets)

        predictions = np.full(targets.shape, self.initial_prediction)
        for _ in range(self.n_estimators):
            residuals = targets - predictions
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)
            self.trees.append(tree)
            predictions += self.learning_rate * tree.predict(X)
        return self

    def predict(self, X):
        """Return predictions for ``X`` as a float ndarray.

        Parameters:
            X: Feature matrix with one row per sample.

        Raises:
            RuntimeError: If the model has not been fitted yet.
        """
        if self.initial_prediction is None:
            raise RuntimeError(
                "GradientBoostingRegressorScratch is not fitted yet; "
                "call fit() before predict()."
            )

        # np.shape also handles plain nested lists, unlike X.shape.
        n_samples = np.shape(X)[0]
        predictions = np.full(n_samples, self.initial_prediction)
        for tree in self.trees:
            predictions += self.learning_rate * tree.predict(X)
        return predictions

# Load the dataset
data = pd.read_csv("cost_of_living_us.csv")

# Define the selected features and target variable
features = ['food_cost', 'other_necessities_cost', 'childcare_cost', 'taxes', 'healthcare_cost']
target = 'total_cost'

# Select the model inputs and the target
X = data[features]
y = data[target]

# Split BEFORE any statistics are computed, so the imputation medians and the
# scaler parameters are derived from the training rows only and the held-out
# rows cannot leak into preprocessing. With the same random_state and row
# count, the rows assigned to each split are unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Handle missing values using medians learned from the training split
train_medians = X_train_raw.median()
X_train_filled = X_train_raw.fillna(train_medians)
X_test_filled = X_test_raw.fillna(train_medians)

# Standardize the features: fit on the training split, apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_filled)
X_test = scaler.transform(X_test_filled)

# Keep the full imputed/scaled matrices available under their original names
# in case later cells refer to them (they now use training-only statistics).
X = X.fillna(train_medians)
X_scaled = scaler.transform(X)

# Train the Gradient Boosting model
model = GradientBoostingRegressorScratch(n_estimators=100, learning_rate=0.1, max_depth=3)
model.fit(X_train, y_train)

# Save the model and scaler
# NOTE(review): the model is pickled by reference to its class, so whatever
# loads "cost_of_living_model.pkl" must be able to import/define
# GradientBoostingRegressorScratch under the same module path. Inference code
# also has to impute missing inputs itself; the medians are not persisted.
joblib.dump(model, "cost_of_living_model.pkl")
joblib.dump(scaler, "scaler.pkl")

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R2 Score: {r2:.2f}")

print("Model and scaler saved successfully!")


# Output (recorded from the notebook run):
# Mean Squared Error: 2416151.70
# R2 Score: 0.99
# Model and scaler saved successfully!
