In [5]:
import pandas as pd
import json
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import statsmodels.api as sm
from xgboost import XGBRegressor

In [9]:
# Step 1: Load data
# Read the reimbursement CSV once, then separate the three model-input columns
# from the column used as the regression target.
feature_columns = ["trip_duration_days", "miles_traveled", "total_receipts_amount"]
target_column = "expected_output"

df = pd.read_csv("reimbursements.csv")
y = df[target_column]    # regression target
X = df[feature_columns]  # model inputs, in the column order listed above

# ========== Evaluation setup ==========
# Each "<model> MAE:" line below is an in-sample score: the model is fitted on
# (X, y) and then scored on those same rows. That flatters every model, and a
# boosted-tree ensemble most of all, so those numbers cannot be used to compare
# models. A seeded hold-out split is therefore carved out here, and every
# section additionally reports "<model> hold-out MAE:", computed on rows the
# scored model never saw during fitting.
SEED = 42               # fixed so the split (and XGBoost) are repeatable across runs
HOLDOUT_FRACTION = 0.2  # share of rows reserved for out-of-sample scoring
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, random_state=SEED
)

# ========== Linear Regression ==========
# Full-data fit: lr_model / lr_preds keep their original meaning.
lr_model = LinearRegression()
lr_model.fit(X, y)
lr_preds = lr_model.predict(X)
print("Linear Regression MAE:", mean_absolute_error(y, lr_preds))

# Out-of-sample check: a separate estimator fitted on the training rows only.
lr_holdout_model = LinearRegression().fit(X_train, y_train)
lr_holdout_mae = mean_absolute_error(y_test, lr_holdout_model.predict(X_test))
print("Linear Regression hold-out MAE:", lr_holdout_mae)

# ========== Generalized Linear Model (GLM) ==========
# NOTE(review): a Gaussian-family GLM with its default link is an ordinary
# least-squares fit, so these numbers should closely match the
# LinearRegression section above. Confirm that this redundancy is intended.
X_glm = sm.add_constant(X)  # statsmodels needs explicit intercept
glm_model = sm.GLM(y, X_glm, family=sm.families.Gaussian())
glm_results = glm_model.fit()
glm_preds = glm_results.predict(X_glm)
print("GLM MAE:", mean_absolute_error(y, glm_preds))

# has_constant="add" forces the intercept column on both splits. The default
# ("skip") would omit it if a feature happened to be constant within a split,
# leaving the train and test design matrices with different columns.
X_train_glm = sm.add_constant(X_train, has_constant="add")
X_test_glm = sm.add_constant(X_test, has_constant="add")
glm_holdout_results = sm.GLM(
    y_train, X_train_glm, family=sm.families.Gaussian()
).fit()
glm_holdout_mae = mean_absolute_error(y_test, glm_holdout_results.predict(X_test_glm))
print("GLM hold-out MAE:", glm_holdout_mae)
print(glm_results.summary())  # Optional: view detailed model summary (full-data fit)

# ========== XGBoost Regressor ==========
# One parameter dict shared by the full-data and hold-out models so the two
# cannot drift apart. random_state is pinned so the fit stays repeatable even
# if row/column subsampling is enabled later.
xgb_params = dict(n_estimators=100, learning_rate=0.1, verbosity=0, random_state=SEED)

# Full-data fit: xgb_model / xgb_preds keep their original meaning.
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X, y)
xgb_preds = xgb_model.predict(X)
print("XGBoost MAE:", mean_absolute_error(y, xgb_preds))

# Out-of-sample check: same hyperparameters, fitted on the training rows only.
xgb_holdout_model = XGBRegressor(**xgb_params)
xgb_holdout_model.fit(X_train, y_train)
xgb_holdout_mae = mean_absolute_error(y_test, xgb_holdout_model.predict(X_test))
print("XGBoost hold-out MAE:", xgb_holdout_mae)

Intercept: 266.70768050486413
Coefficients: [50.05048622  0.44564529  0.38286076]
Training MSE: 47734.51387704825
Training MAE: 175.49208949851337
