In [1]:
%load_ext lab_black

import pandas as pd
import os
from joblib import dump, load

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Lars

import matplotlib

## IMPORT DATA

In [2]:
df_wne = pd.read_csv(os.path.join("Data", "wine_participant.csv"))

In [3]:
df_wne["FAT_CARB"] = df_wne["fat"] / df_wne["carbohydrates"]
df_wne["FAT_PRO"] = df_wne["fat"] / df_wne["protein"]
df_wne["PROTEIN_NEW"] = df_wne["protein_token"] * df_wne["protein"]
df_wne_final = df_wne.dropna()
df_wne_final

Unnamed: 0.1,Unnamed: 0,X,Unnamed..0,name,ingredients,Predicts,calories,protein,fat,sugar,...,style2,wine2,token2,style3,wine3,token3,protein_token,FAT_CARB,FAT_PRO,PROTEIN_NEW
0,38514,38514,57762,spinach feta pastries,frozen spinach feta cheese eggs sun-dried t...,['melon de bourgogne' 'sauvignon blanc' 'riesl...,0.39,0.37,1.49,0.82,...,1,sauvignon blanc,8,1,riesling dry,9,9,2.191176,4.027027,3.33
1,21398,21398,28,sorrel tarragon sauce,mayonnaise sour cream fresh sorrel fresh ta...,['chardonnay' 'gewurztraminer' 'champagne'],8.50,0.34,7.16,9.59,...,2,gewurztraminer,11,1,champagne,40,9,3.527094,21.058824,3.06
2,64706,64706,65351,honey dijon salmon,dijon mustard honey garlic clove soy sauce ...,['riesling dry' 'champagne' 'pinot noir'],4.21,4.33,1.34,7.55,...,1,champagne,40,3,pinot noir,5,5,1.576471,0.309469,21.65
3,66089,66089,7939,savory fried chicken,chicken pieces flour salt pepper sage oni...,['gewurztraminer' 'sauvignon blanc' 'pinot noir'],3.59,2.01,2.54,0.20,...,1,sauvignon blanc,8,3,pinot noir,5,4,2.988235,1.263682,8.04
4,45255,45255,74968,swai filets greek basil garlic parmesan,swai fillets extra virgin olive oil basil g...,['riesling dry' 'chardonnay' 'riesling off-dry'],2.86,1.88,2.84,0.41,...,2,chardonnay,7,3,riesling off-dry,10,9,16.705882,1.510638,16.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50158,55848,55848,61990,chou fleur persill cauliflower with egg and p...,cauliflower water salt & freshly ground blac...,['gewurztraminer' 'chardonnay' 'vermentino'],0.48,0.40,1.87,3.06,...,2,chardonnay,7,1,vermentino,36,9,3.666667,4.675000,3.60
50159,5886,5886,43254,chou glorieux glorious cabbage,head of cabbage unsalted butter vegetable oi...,['pinot noir' 'riesling off-dry' 'gewurztramin...,3.74,0.84,3.28,6.73,...,3,riesling off-dry,10,2,gewurztraminer,11,9,2.143791,3.904762,7.56
50160,46978,46978,67691,chicken quesadilla base,chicken breast flour tortillas lime garlic ...,['chardonnay' 'pinot noir' 'marsanne blend'],4.86,1.81,2.99,1.02,...,3,pinot noir,5,2,marsanne blend,33,4,1.769231,1.651934,7.24
50161,6296,6296,55024,charquican chilean stew,meat corn assorted fresh vegetable lard br...,['chenin blanc off-dry' 'riesling off-dry' 'ge...,5.91,1.17,1.42,5.92,...,3,riesling off-dry,10,2,gewurztraminer,11,9,0.270476,1.213675,10.53


In [4]:
df_wne_holdout = pd.read_csv(os.path.join("Data", "wine_holdout.csv"))

In [5]:
df_wne_holdout["FAT_CARB"] = df_wne_holdout["fat"] / df_wne_holdout["carbohydrates"]
df_wne_holdout["FAT_PRO"] = df_wne_holdout["fat"] / df_wne_holdout["protein"]
df_wne_holdout["PROTEIN_NEW"] = (
    df_wne_holdout["protein_token"] * df_wne_holdout["protein"]
)
df_wne_holdout_final = df_wne_holdout.dropna()

In [6]:
X = df_wne_final[
    [
        "protein",
        "protein_token",
        "calories",
        "fat",
        "sodium",
        "sugar",
        "FAT_PRO",
        "PROTEIN_NEW",
    ]
].values
# X = x.astype(int)

In [7]:
Y = df_wne["style2"].values
y = Y.astype(int)

## TEST TRAIN SPLIT

In [8]:
kf = KFold(n_splits=100, shuffle=True, random_state=1234)
kf.get_n_splits(X)

print(kf)

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.175, random_state=7456
# )

KFold(n_splits=100, random_state=1234, shuffle=True)
TRAIN: [    0     1     2 ... 50159 50160 50161] TEST: [   27   778   806   923   960  1045  1088  1239  1440  2011  2106  2118
  2235  2458  2689  2691  2732  2804  2828  2860  2903  3035  3092  3159
  3328  3378  3521  3545  3671  3725  3795  3859  3944  3975  4066  4114
  4343  4348  4356  4375  4591  4768  4802  4930  4987  5090  5197  5243
  5301  5515  5632  5683  5866  5884  6041  6065  6112  6128  6343  6404
  6492  6534  6580  6859  7097  7157  7493  7569  7635  7754  7914  7937
  7989  8055  8062  8067  8278  8393  8428  8509  8786  8872  8916  9056
  9133  9258  9439  9502  9631  9799  9823  9994 10071 10101 10247 10362
 10403 10567 10595 10719 10927 10942 11087 11130 11208 11354 11577 11776
 11783 11879 11880 11935 12189 12392 12501 12508 12819 12833 12839 12867
 12901 13451 13616 13635 14110 14123 14144 14438 14509 14719 14948 14971
 15084 15089 15175 15442 15521 15664 15769 15815 15871 15885 15928 16037
 16510 16739 167

## Random forest, GaussianNB, Linear Regression, Weighted RF, NN

In [9]:
# clf = MLPClassifier(
#     solver="sgd", alpha=1e-2, max_iter=500, hidden_layer_sizes=(175,), random_state=1
# )
# clf.fit(X_train, y_train)
# y_pred = clf.predict(X_test)
# print(classification_report(y_test, y_pred))

# lrr = Ridge()
# lrr.fit(X_train, y_train)
# y_pred = lrr.predict(X_test)
# print(r2_score(y_test, y_pred))

# lr_wne = LinearRegression()
# lr_wne.fit(X_train, y_train)
# y_pred = lr_wne.predict(X_test)
# print(r2_score(y_test, y_pred))

# lar_wne = Lars()
# lar_wne.fit(X_train, y_train)
# y_pred = lar_wne.predict(X_test)
# print(r2_score(y_test, y_pred))

# nbm_wne = MultinomialNB()
# nbm_wne.fit(X_train, y_train)
# y_pred = nbm_wne.predict(X_test)
# print(classification_report(y_test, y_pred))

# nb_wne = GaussianNB()
# nb_wne.fit(X_train, y_train)
# y_pred = nb_wne.predict(X_test)
# print(classification_report(y_test, y_pred))

wrf_wne = RandomForestClassifier(class_weight="balanced")
wrf_wne.fit(X_train, y_train)
y_pred = wrf_wne.predict(X_test)
print(mean_squared_error(y_test, y_pred))
print(classification_report(y_test, y_pred))
# rf_wne = RandomForestClassifier()
# rf_wne.fit(X_train, y_train)
# y_pred = rf_wne.predict(X_test)
# print(classification_report(y_test, y_pred))

1.8562874251497006
              precision    recall  f1-score   support

           1       0.34      0.28      0.31        85
           2       0.18      0.10      0.12        94
           3       0.36      0.43      0.39       169
           4       0.31      0.39      0.34       115
           5       0.24      0.18      0.21        38

    accuracy                           0.32       501
   macro avg       0.29      0.28      0.28       501
weighted avg       0.30      0.32      0.30       501



In [10]:
# dump(lrr, "lrr.joblib")

# dump(lr_wne, "lr_wne.joblib")

# dump(lar_wne, "lar_wne.joblib")

# dump(nb_wne, "nb_wne.joblib")

# dump(nbm_wne, "nbm_wne.joblib")

dump(wrf_wne, "wrf_wne.joblib")

# dump(rf_wne, "rf_wne.joblib")

['wrf_wne.joblib']

## Run Model

In [11]:
X = df_wne_holdout_final[
    [
        "protein",
        "protein_token",
        "calories",
        "fat",
        "sodium",
        "sugar",
        "FAT_PRO",
        "PROTEIN_NEW",
    ]
].values
Y = df_wne_holdout_final["style1"].values
y = Y.astype(int)

In [12]:
# reg = load("nb_wne.joblib")

# reg = load("nb.joblib")

# reg = load("rf_wne.joblib")

reg = load("wrf_wne.joblib")

In [13]:
prediction = reg.predict(X)

In [14]:
print(prediction)

[3 1 2 ... 5 3 1]


In [16]:
# df_pred = pd.DataFrame(prediction)

# df_pred.head(15)
# df_pred.rename(columns={0: "Predicted Bo_Ft"})
df_wne_holdout_final["WINE_PRED"] = prediction.astype(int)
df_wne_holdout_final[
    [
        "name",
        "ingredients",
        "Predicts",
        "calories",
        "protein",
        "fat",
        "sugar",
        "style1",
        "wine1",
        "style2",
        "wine2",
        "style3",
        "wine3",
        "WINE_PRED",
    ]
].head(50)

df_test = df_wne_holdout_final[(df_wne_holdout_final["WINE_PRED"] == 1)]
df_test[
    [
        "name",
        "ingredients",
        "Predicts",
        "calories",
        "protein",
        "fat",
        "sugar",
        "style1",
        "wine1",
        "style2",
        "wine2",
        "style3",
        "wine3",
        "WINE_PRED",
    ]
].head(50)

Unnamed: 0,name,ingredients,Predicts,calories,protein,fat,sugar,style1,wine1,style2,wine2,style3,wine3,WINE_PRED
1,kanttarellikastike finnish mushrooms,butter onion wild mushrooms vegetable stock...,['chardonnay' 'pinot noir' 'gewurztraminer'],2.14,0.44,3.28,3.47,2,chardonnay,3,pinot noir,2,gewurztraminer,1
29,barbecued bacon and scallop brochettes,fresh scallops olive oil lemon juice dill s...,['champagne' 'sangiovese' 'riesling off-dry'],2.36,1.01,3.13,0.0,1,champagne,4,sangiovese,3,riesling off-dry,1
35,tuna or chicken macaroni supper salad,elbow macaroni mayonnaise italian salad dres...,['sauvignon blanc' 'riesling dry' 'vermentino'],5.42,1.68,2.91,5.92,1,sauvignon blanc,1,riesling dry,1,vermentino,1
41,filipino chicken adobo,chicken pieces garlic white vinegar black p...,['sauvignon blanc' 'riesling off-dry' 'pinot n...,2.23,1.85,2.39,0.2,1,sauvignon blanc,3,riesling off-dry,3,pinot noir,1
43,potatoes and eggs sumac,potatoes oil eggs onion salt pepper sumac,['pinot noir' 'riesling off-dry' 'chenin blanc...,2.23,0.74,1.34,2.86,3,pinot noir,3,riesling off-dry,1,chenin blanc dry,1
45,oh no not broccoli,broccoli floret slivered almonds fat free ho...,['chardonnay' 'chenin blanc dry' 'pinot noir'],0.95,0.91,1.64,1.43,2,chardonnay,1,chenin blanc dry,3,pinot noir,1
52,asparagus pesto pasta,fresh asparagus fresh basil leaves parmesan ...,['riesling dry' 'chardonnay' 'riesling off-dry'],3.58,0.87,2.01,3.27,1,riesling dry,2,chardonnay,3,riesling off-dry,1
53,grilled salmon with zucchini confit,salmon fillets zucchini olive oil fresh ros...,['riesling dry' 'sauvignon blanc' 'albariño'],6.24,2.52,4.85,3.67,1,riesling dry,1,sauvignon blanc,1,albariño,1
59,red bell pepper soup with corn and parsley,red bell peppers butter garlic spring onion...,['chenin blanc off-dry' 'riesling off-dry' 'ge...,0.46,0.17,1.87,4.49,3,chenin blanc off-dry,3,riesling off-dry,2,gewurztraminer,1
69,spicy scampi el sombrero,large shrimp fresh garlic red pepper green ...,['chardonnay' 'sauvignon blanc' 'riesling dry'],1.09,0.77,2.16,2.86,2,chardonnay,1,sauvignon blanc,1,riesling dry,1
