In [9]:
from split_get_scale import SplitGetScale
from prepare import Prepare
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the nutrition data from the project's Prepare helper and show its shape.
nutrition_facts = Prepare().get_food_prep()
print(nutrition_facts.shape)

# Partition into train / validate / test, then separate the features from the
# "calories" target. "food_group" is dropped from the features as well.
sgs = SplitGetScale()
train, validate, test = sgs.split(nutrition_facts)
(X_train, y_train), (X_validate, y_validate), (X_test, y_test) = sgs.get_Xy(
    train,
    validate,
    test,
    target_col="calories",
    cols_drop=["calories", "food_group"],
)

(14036, 46)


In [10]:
# Report the feature / target shapes of every split in one pass.
for _split, _features, _target in (
    ("train", X_train, y_train),
    ("validate", X_validate, y_validate),
    ("test", X_test, y_test),
):
    print(f"{_split.upper()}\nX_{_split} Shape: {_features.shape}")
    print(f"y_{_split} Shape: {_target.shape}")

TRAIN
X_train Shape: (9544, 44)
y_train Shape: (9544,)
VALIDATE
X_validate Shape: (2386, 44)
y_validate Shape: (2386,)
TEST
X_test Shape: (2106, 44)
y_test Shape: (2106,)


In [11]:
# Scale the three feature matrices with the SplitGetScale helper; only the
# features are passed in, so the targets stay on their original scale.
# NOTE(review): presumably the scaler is fit on X_train only — confirm in SplitGetScale.scale.
X_train_scaled, X_validate_scaled, X_test_scaled = sgs.scale(
    X_train,
    X_validate,
    X_test,
)

### Baseline

In [12]:
from pandas import DataFrame

# Baseline "model": predict the mean of the training target for every training row.
act_pred_error = DataFrame({"actual": y_train}).assign(
    baseline_prediction=y_train.mean()
)

In [13]:
# Preview the first five actual / baseline pairs.
act_pred_error.head(n=5)

Unnamed: 0,actual,baseline_prediction
5653,44.0,192.341745
9675,223.0,192.341745
8162,207.0,192.341745
7994,67.0,192.341745
10728,267.0,192.341745


In [14]:
# RMSE of the constant mean prediction on the training targets: the score every
# fitted model has to beat.
# The square root is taken explicitly because the `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6; this form works on old and
# new releases alike.
baseline_rmse = mean_squared_error(
    act_pred_error["actual"],
    act_pred_error["baseline_prediction"],
) ** 0.5
baseline_rmse

149.2582021605062

In [15]:
def grid_search(X, y, model, params_dic, n_iter=5, random_state=123, cv=None, scoring=None):
    """Run a randomized hyper-parameter search for ``model`` and return the fitted search.

    Despite the name, this samples ``n_iter`` candidate settings with
    ``RandomizedSearchCV``; it does not try every combination in ``params_dic``.

    Parameters
    ----------
    X, y
        Training features and target handed to ``RandomizedSearchCV.fit``.
    model
        Estimator to tune.
    params_dic : dict
        Mapping of parameter name to the list / distribution to sample from.
    n_iter : int, default 5
        Number of parameter settings sampled (the value previously hard-coded).
    random_state : int or None, default 123
        Seed for the candidate sampling, so repeated runs pick the same
        candidates. 123 matches the seed used for the models in this notebook;
        pass ``None`` for non-deterministic sampling.
    cv, scoring
        Forwarded unchanged to ``RandomizedSearchCV``; ``None`` keeps the
        scikit-learn defaults.

    Returns
    -------
    The value returned by ``RandomizedSearchCV.fit`` (the fitted search object,
    exposing ``best_estimator_`` and ``best_params_``).
    """
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params_dic,
        n_iter=n_iter,
        n_jobs=-1,  # use every available core for the cross-validation fits
        random_state=random_state,
        cv=cv,
        scoring=scoring,
    )
    return search.fit(X, y)

In [16]:
# Disabled hyper-parameter search for the random forest (presumably left commented out
# because it is slow to re-run — TODO confirm). Every value hard-coded in the
# RandomForestRegressor fit in the next cell appears in this grid.
# NOTE(review): 2.5 is not a valid min_samples_leaf — scikit-learn expects an int >= 1 or a
# float fraction below 1 — so replace it (e.g. with 2 or 3) before re-enabling this cell.
# rfr_grid = {"n_estimators": [10, 25, 50, 75, 100] , "max_depth": [None, 10, 25, 50] , "min_samples_split": [2, 4, 6, 8, 10], "min_samples_leaf": [1, 2.5, 5,], "bootstrap": [True, False], "warm_start": [False, True]}
# rfr = grid_search(X_train_scaled, y_train, RandomForestRegressor(random_state=123, n_jobs=-1), rfr_grid)

# print(f"random forest regressor: {rfr.best_estimator_}")
# print(f"random forest regressor: {rfr.best_params_}")

In [17]:
# Build the four candidate regressors first ...
lasso = LassoCV(random_state=123)
ridge = RidgeCV()
rfr = RandomForestRegressor(
    n_estimators=10,
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    warm_start=True,
    random_state=123,
    n_jobs=-1,
)
lr = LinearRegression()

# ... then fit each one, in the same order, on the scaled training features.
# fit() trains the estimator in place, so the names above refer to fitted models.
for _estimator in (lasso, ridge, rfr, lr):
    _estimator.fit(X_train_scaled, y_train)

In [18]:
# Predictions of every fitted model on the scaled training features.
lass_pred_train = lasso.predict(X_train_scaled)
ridge_pred_train = ridge.predict(X_train_scaled)
rfr_pred_train = rfr.predict(X_train_scaled)
lr_pred_train = lr.predict(X_train_scaled)

# Predictions of every fitted model on the scaled validation features.
lass_pred_val = lasso.predict(X_validate_scaled)
ridge_pred_val = ridge.predict(X_validate_scaled)
rfr_pred_val = rfr.predict(X_validate_scaled)
lr_pred_val = lr.predict(X_validate_scaled)

# Print one RMSE line per model for each split, in the same text format as before.
# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it fails on current releases.
for _header, _label, _actual, _preds in (
    (
        "TRAIN",
        "train",
        y_train,
        {"lasso": lass_pred_train, "ridge": ridge_pred_train, "rfr": rfr_pred_train, "lr": lr_pred_train},
    ),
    (
        "VALIDATE",
        "validation",
        y_validate,
        {"lasso": lass_pred_val, "ridge": ridge_pred_val, "rfr": rfr_pred_val, "lr": lr_pred_val},
    ),
):
    _report = [_header]
    for _name, _pred in _preds.items():
        _report.append(f"{_name} {_label} rmse: {mean_squared_error(_actual, _pred) ** 0.5}")
    print("\n".join(_report))

TRAIN
lasso train rmse: 7.426538832963972
ridge train rmse: 7.12315109427869
rfr train rmse: 4.168578206046564
lr train rmse: 7.122945291633488
VALIDATE
lasso validation rmse: 8.789195467455302
ridge validation rmse: 7.994577223116341
rfr validation rmse: 9.980837696552372
lr validation rmse: 7.992957636825487


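### Move forward with Linear Regression
- lasso is close to overfitting (train RMSE 7.43 vs. validation RMSE 8.79)
- ridge looks good (train RMSE 7.12 vs. validation RMSE 7.99)
- rfr overfit (train RMSE 4.17 vs. validation RMSE 9.98)
- lr beat ridge, though only marginally (validation RMSE 7.993 vs. 7.995)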

### Clustering Model
<p>TRAIN</p>
<p>lasso train rmse: 7.407998823002653</p>
<p>ridge train rmse: 7.10763970624999</p>
<p>rfr train rmse: 4.268055366061249</p>
<p>lr train rmse: 7.107435935804069</p>
<p>VALIDATE</p>
<p>lasso validation rmse: 8.745662599770194</p>
<p>ridge validation rmse: 7.961317627625173</p>
<p>rfr validation rmse: 10.686753744206765</p>
<p>lr validation rmse: 7.960004839921713</p>

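### Clustering Model is slightly better
- With the clustering features, validation RMSE improves for the linear models (lr: 7.993 → 7.960; lasso and ridge improve similarly), while rfr gets worse (9.98 → 10.69).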