In [1]:
from sklearn.cluster import KMeans, DBSCAN
from prepare import Prepare
from split_get_scale import SplitGetScale
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Load the prepared food table from the project-local Prepare helper.
nutrition_facts = Prepare().get_food_prep()

# A single SplitGetScale instance handles splitting, X/y extraction and scaling.
sgs = SplitGetScale()
train, validate, test = sgs.split(nutrition_facts)

# "calories" is the regression target; "food_group" and "calories" are passed
# as cols_drop (presumably removed from the feature frames -- verify in get_Xy).
(
    (X_train, y_train),
    (X_validate, y_validate),
    (X_test, y_test),
) = sgs.get_Xy(
    train,
    validate,
    test,
    target_col="calories",
    cols_drop=["food_group", "calories"],
)

X_train_scaled, X_validate_scaled, X_test_scaled = sgs.scale(
    X_train, X_validate, X_test
)

# Preview the first rows of the scaled training features.
X_train_scaled.head()

Unnamed: 0,fat,protein,carbohydrate,sugars,fiber,cholesterol,saturated_fats,calcium,iron,potassium,magnesium,vitamin_a,vitamin_c,vitamin_b12,vitamin_d,vitamin_e_alphatocopherol,water,omega_3s,omega_6s,pral_score,phosphorus,sodium,zinc,copper,selenium,thiamin_b1,riboflavin_b2,niacin_b3,vitamin_b6,folate_b9,folic_acid,food_folate,folate_dfe,choline,retinol,carotene_beta,carotene_alpha,lycopene,lutein_plus_zeaxanthin,vitamin_k,fatty_acids_total_monounsaturated,fatty_acids_total_polyunsaturated,alcohol,caffeine,theobromine
5653,-0.679827,-1.036063,-0.300931,0.492466,-0.537367,-0.350203,-0.558501,-0.416059,-0.450038,-0.811608,-0.650832,-0.135566,-0.138123,0.409945,-0.148007,-0.296869,1.018601,-0.175639,-0.43252,-0.755126,-0.823591,-0.303481,-0.522053,-0.316597,-0.479409,-0.438419,-0.54928,-0.708121,1.770296,-0.376956,-0.17665,-0.482113,-0.312864,-0.574692,-0.092949,-0.237682,-0.140486,-0.15551,-0.202845,-0.232067,-0.572393,-0.452614,-0.079248,0.403413,-0.072008
9675,0.211305,0.401395,-0.153366,-0.206112,-0.018763,0.009872,0.32121,0.606675,-0.046611,-0.0982,-0.047258,-0.081624,-0.106019,0.030653,-0.08544,-0.159159,-0.136939,-0.172645,-0.023525,0.021437,0.24368,0.257249,0.384729,-0.128166,0.158436,-0.069342,-0.069183,0.07487,-0.087652,-0.092188,-0.125722,0.052281,-0.097005,0.146798,-0.049777,-0.169886,-0.105463,0.56036,-0.168559,-0.115501,0.12509,-0.030031,-0.079248,-0.030786,-0.072008
8162,0.469108,-0.850584,-0.046867,0.789298,-0.293318,0.081888,1.337468,-0.214037,-0.411291,-0.389454,-0.44964,0.11427,0.049436,-0.239124,0.070978,-0.12965,0.15662,-0.163661,-0.278106,-0.591666,-0.626832,-0.368234,-0.469864,-0.244431,-0.416367,-0.386475,-0.269224,-0.646132,-0.422313,-0.285423,-0.17665,-0.281715,-0.250196,-0.382806,0.154568,-0.199594,-0.13173,-0.15551,-0.193141,-0.207266,0.085069,-0.284011,-0.079248,-0.030786,-0.072008
7994,-0.42575,-0.883558,-0.472413,0.168339,-0.537367,-0.350203,-0.281727,-0.11934,-0.219834,-0.575987,-0.516704,-0.048975,-0.024912,-0.199058,0.164829,-0.103419,0.968116,-0.166655,-0.279986,-0.673139,-0.674531,-0.390277,-0.342654,-0.192312,-0.405241,-0.255248,-0.280654,-0.529522,-0.488668,-0.275253,-0.04933,-0.482113,-0.19449,-0.265756,-0.005167,-0.237682,-0.140486,-0.15551,-0.202845,-0.165103,-0.362114,-0.294534,-0.079248,-0.030786,-0.072008
10728,0.223227,-0.781545,0.90576,1.042629,0.072756,-0.350203,0.023131,-0.365553,-0.190204,-0.582532,-0.471995,-0.135566,-0.112777,-0.257822,-0.148007,-0.037843,-0.548297,0.051954,0.435688,-0.687147,-0.644719,-0.095446,-0.456817,-0.182289,-0.193862,-0.036535,-0.226358,-0.390047,-0.572333,-0.071847,0.128918,-0.348515,0.014406,-0.488343,-0.092949,-0.233873,-0.140486,-0.15551,-0.189907,-0.129141,0.264315,0.375094,-0.079248,-0.030786,-0.072008


In [2]:
# Vitamin columns used to derive an extra cluster-label feature.
features = [
    "vitamin_a",
    "vitamin_c",
    "vitamin_b12",
    "vitamin_d",
    "vitamin_e_alphatocopherol",
    "thiamin_b1",
    "riboflavin_b2",
    "niacin_b3",
    "vitamin_b6",
    "folate_b9",
    "vitamin_k",
]

clusters_train = X_train_scaled[features]
clusters_validate = X_validate_scaled[features]
clusters_test = X_test_scaled[features]

# Seeded so the cluster labels (and every model trained on them) are
# reproducible across kernel restarts; 123 matches the seed used by the
# other estimators in this notebook. n_init is pinned explicitly because
# its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=8, max_iter=500, n_init=10, random_state=123)

# Fit on the training split only; validate/test are labelled with the
# already-fitted centroids so no information leaks from them.
kmeans.fit(clusters_train)

# Label each split and attach the label as the 'feat_clusters' column.
y_kmeans_train = kmeans.predict(clusters_train)
X_train_scaled["feat_clusters"] = y_kmeans_train

y_kmeans_validate = kmeans.predict(clusters_validate)
X_validate_scaled["feat_clusters"] = y_kmeans_validate

y_kmeans_test = kmeans.predict(clusters_test)
X_test_scaled["feat_clusters"] = y_kmeans_test

In [3]:
# Report the shape of each split. "\n" separates the split label from the
# first shape line (the previous "\X" was an invalid escape sequence that
# printed a literal backslash).
print(f"TRAIN\nX_train_scaled Shape: {X_train_scaled.shape}")
print(f"y_train Shape: {y_train.shape}")
print(f"VALIDATE\nX_validate_scaled Shape: {X_validate_scaled.shape}")
print(f"y_validate Shape: {y_validate.shape}")
print(f"TEST\nX_test_scaled Shape: {X_test_scaled.shape}")
print(f"y_test Shape: {y_test.shape}")

TRAIN\X_train_scaled Shape: (9544, 46)
y_train Shape: (9544,)
VALIDATE\X_validate_scaled Shape: (2386, 46)
y_validate Shape: (2386,)
TEST\X_test_scaled Shape: (2106, 46)
y_test Shape: (2106,)


In [4]:
# Baseline: predict the training-set mean of the target for every row.
act_pred_error = pd.DataFrame({"actual": y_train})
act_pred_error["baseline_prediction"] = y_train.mean()

# RMSE is taken as the square root of the MSE instead of passing
# squared=False, which is deprecated and removed in newer scikit-learn.
baseline_rmse = mean_squared_error(
    act_pred_error["actual"], act_pred_error["baseline_prediction"]
) ** 0.5
baseline_rmse

149.2582021605062

In [5]:
def grid_search(X, y, model, params_dic, random_state=123):
    """Run a randomized hyper-parameter search and return the fitted search.

    Despite the name, this wraps ``RandomizedSearchCV``: it samples
    candidates from ``params_dic`` rather than trying every combination.

    Parameters
    ----------
    X, y : training features and target, passed straight to ``fit``.
    model : the estimator to tune.
    params_dic : mapping of parameter name to candidate values.
    random_state : seed for candidate sampling, so repeated runs pick the
        same candidates. Defaults to 123, the seed used elsewhere in this
        notebook.

    Returns
    -------
    The value returned by ``RandomizedSearchCV.fit`` (the fitted search).
    """
    search = RandomizedSearchCV(
        model, params_dic, n_jobs=-1, random_state=random_state
    )
    return search.fit(X, y)

In [7]:
# Candidate hyper-parameters for the random forest search.
rfr_grid = {
    "n_estimators": [10, 25, 50, 75, 100],
    "max_depth": [None, 10, 25, 50],
    "min_samples_split": [2, 4, 6, 8, 10],
    # Integer sample counts only: a float here is read as a fraction of the
    # training set and must lie strictly between 0 and 1, so the previous
    # value 2.5 made every candidate that drew it fail to fit.
    "min_samples_leaf": [1, 2, 5],
    "bootstrap": [True, False],
    "warm_start": [False, True],
}
rfr = grid_search(
    X_train_scaled,
    y_train,
    RandomForestRegressor(random_state=123, n_jobs=-1),
    rfr_grid,
)

print(f"random forest regressor: {rfr.best_estimator_}")
print(f"random forest regressor: {rfr.best_params_}")

random forest regressor: RandomForestRegressor(max_depth=25, n_estimators=10, n_jobs=-1,
                      random_state=123, warm_start=True)
random forest regressor: {'warm_start': True, 'n_estimators': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 25, 'bootstrap': True}


In [8]:
# Fit the four candidate regressors on the scaled training features.
# The random forest uses the parameters hard-coded below.
lasso = LassoCV(random_state=123).fit(X_train_scaled, y_train)
ridge = RidgeCV().fit(X_train_scaled, y_train)
rfr = RandomForestRegressor(
    n_estimators=10,
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    warm_start=True,
    random_state=123,
    n_jobs=-1,
).fit(X_train_scaled, y_train)
lr = LinearRegression().fit(X_train_scaled, y_train)

In [9]:
# Predictions of each fitted model on the training split.
lass_pred_train = lasso.predict(X_train_scaled)
ridge_pred_train = ridge.predict(X_train_scaled)
rfr_pred_train = rfr.predict(X_train_scaled)
lr_pred_train = lr.predict(X_train_scaled)

# Predictions of each fitted model on the validation split.
lass_pred_val = lasso.predict(X_validate_scaled)
ridge_pred_val = ridge.predict(X_validate_scaled)
rfr_pred_val = rfr.predict(X_validate_scaled)
lr_pred_val = lr.predict(X_validate_scaled)


def _rmse(actual, predicted):
    """Return the root mean squared error of ``predicted`` against ``actual``.

    Computed as the square root of the MSE rather than via ``squared=False``,
    which is deprecated and removed in newer scikit-learn releases.
    """
    return mean_squared_error(actual, predicted) ** 0.5


print(
    "TRAIN\n"
    f"lasso train rmse: {_rmse(y_train, lass_pred_train)}\n"
    f"ridge train rmse: {_rmse(y_train, ridge_pred_train)}\n"
    f"rfr train rmse: {_rmse(y_train, rfr_pred_train)}\n"
    f"lr train rmse: {_rmse(y_train, lr_pred_train)}"
)

print(
    "VALIDATE\n"
    f"lasso validation rmse: {_rmse(y_validate, lass_pred_val)}\n"
    f"ridge validation rmse: {_rmse(y_validate, ridge_pred_val)}\n"
    f"rfr validation rmse: {_rmse(y_validate, rfr_pred_val)}\n"
    f"lr validation rmse: {_rmse(y_validate, lr_pred_val)}"
)

TRAIN
lasso train rmse: 7.407998823002653
ridge train rmse: 7.10763970624999
rfr train rmse: 4.268055366061249
lr train rmse: 7.107435935804069
VALIDATE
lasso validation rmse: 8.745662599770194
ridge validation rmse: 7.961317627625173
rfr validation rmse: 10.686753744206765
lr validation rmse: 7.960004839921713


### Linear Regression is best: it has the lowest validation RMSE (7.96), so it is the model evaluated on the test set

In [11]:
# Final check of the linear regression model on the held-out test split.
lr_pred_test = lr.predict(X_test_scaled)

# RMSE is the square root of the MSE; squared=False is deprecated and
# removed in newer scikit-learn releases.
print(f"lr test rmse: {mean_squared_error(y_test, lr_pred_test) ** 0.5}")

lr test rmse: 11.250274985444475
