## Importing libraries 

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder


from sklearn.linear_model import LogisticRegression , LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier , DecisionTreeRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix,plot_confusion_matrix,cohen_kappa_score,accuracy_score,recall_score,precision_score,f1_score,classification_report

## Importing dataframes

In [10]:
cookies_train = pd.read_csv("../data/cookies_processed.csv")
cookies_test = pd.read_csv("../data/cookies_test_processed.csv")

In [11]:
# regressor

In [12]:
X_train = cookies_train.drop(columns="quality")

In [13]:
X_test = cookies_test.drop(columns="quality")

In [14]:
y_train = cookies_train["quality"]

In [15]:
y_test = cookies_test["quality"]

In [22]:
scaler = StandardScaler()

In [24]:
scaler.fit(X_train)

StandardScaler()

In [25]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## KNeighbors

In [26]:
# defining our ML model

In [64]:
grid = GridSearchCV(estimator = KNeighborsRegressor(), param_grid= {'n_neighbors':[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40], 'weights':['uniform','distance']})

In [66]:
grid.fit(X_train_scaled, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                         14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                         24, 25, 26, 27, 28, 29, 30, 31, ...],
                         'weights': ['uniform', 'distance']})

In [67]:
grid.best_params_

{'n_neighbors': 14, 'weights': 'distance'}

In [69]:
y_pred_test_kneighbors_cookies = grid.predict(X_test_scaled)

In [72]:
prediction_array_kneigh_cookies = y_pred_test_kneighbors_cookies

In [73]:
np.savetxt('data.csv', prediction_array_kneigh_cookies, delimiter=',')


## Decision Tree

In [51]:
X = cookies_train.drop(columns = 'quality')


In [52]:
X_1 = cookies_test.drop(columns = 'quality')


In [53]:
X = pd.concat([X, X_1], axis=0)


In [54]:
n_features = X.shape[1]
n_samples = X.shape[0]

grid_dt = GridSearchCV(estimator = DecisionTreeRegressor(), param_grid = {"criterion": ["mse", "friedman_mse", "mae", "poisson"],
                                                                       "splitter": ["best", "random"],
                                                                       "max_depth": [None,2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
                                                                       'max_features': [None, 'sqrt', 'auto', 'log2', 0.3,0.5,0.7, n_features//2, n_features//3, ],
                                                                        'min_samples_split': [2,0.3,0.5, n_samples//2, n_samples//3, n_samples//5],
                                                                        'min_samples_leaf':[1, 0.3,0.5, n_samples//2, n_samples//3, n_samples//5] }, cv=5)
grid_dt.fit(X_train_scaled, y_train)


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['mse', 'friedman_mse', 'mae', 'poisson'],
                         'max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20],
                         'max_features': [None, 'sqrt', 'auto', 'log2', 0.3,
                                          0.5, 0.7, 9, 6],
                         'min_samples_leaf': [1, 0.3, 0.5, 2981, 1987, 1192],
                         'min_samples_split': [2, 0.3, 0.5, 2981, 1987, 1192],
                         'splitter': ['best', 'random']})

In [55]:
grid_dt.best_params_


{'criterion': 'friedman_mse',
 'max_depth': 6,
 'max_features': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}

In [59]:
y_pred_test_dt = grid_dt.predict(X_test_scaled)

In [61]:
prediction_array_dt_cookies = y_pred_test_dt

In [76]:
prediction_array_dt_cookies

array([7.55572065, 7.8313253 , 7.8313253 , 8.19364599, 7.55572065,
       7.28767123, 7.8313253 , 7.8313253 , 8.19364599, 7.93946731,
       7.55572065, 7.88888889, 7.28767123, 7.8313253 , 8.5       ,
       7.55572065, 8.64383562, 7.28767123, 8.5       , 8.19364599,
       7.93946731, 7.93946731, 8.19364599, 7.55572065, 7.55572065,
       7.55572065, 7.66666667, 7.8313253 , 7.55572065, 7.55572065,
       8.19364599, 7.8313253 , 7.28767123, 7.38461538, 8.19364599,
       7.93946731, 8.5       , 8.19364599, 6.77142857, 7.93946731,
       7.93946731, 7.8313253 , 7.93946731, 7.28767123, 7.28767123,
       7.28767123, 7.55572065, 8.19364599, 7.55572065, 7.93946731,
       7.28767123, 7.55572065, 7.8313253 , 7.93946731, 7.28767123,
       8.19364599, 7.55572065, 8.28571429, 8.19047619, 8.76767677,
       7.55572065, 7.8313253 , 6.77142857, 8.19364599, 8.19364599,
       7.8313253 , 7.55572065, 7.55572065, 7.55572065, 8.19047619,
       7.93946731, 8.5       , 7.55572065, 7.28767123, 7.93946

In [62]:
np.savetxt('dt_cookies_array.csv', prediction_array_dt_cookies, delimiter=',')