## Importing libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder


from sklearn.linear_model import LogisticRegression , LinearRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier , DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, confusion_matrix,plot_confusion_matrix,cohen_kappa_score,accuracy_score,recall_score,precision_score,f1_score,classification_report

## Importing dataframes

In [2]:
cookies2_train = pd.read_csv("../data/cookies2_processed.csv")
cookies2_test = pd.read_csv("../data/cookies2_test_processed.csv")

In [3]:
# regressor

In [4]:
X_train = cookies2_train.drop(columns="quality")

In [5]:
X_test = cookies2_test.drop(columns="quality")

In [6]:
y_train = cookies2_train["quality"]

In [7]:
y_test = cookies2_test["quality"]

In [8]:
scaler = StandardScaler()

In [9]:
scaler.fit(X_train)

StandardScaler()

In [10]:
#transforming the data without the prediction column with the scaler we trained beforehand

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## KNeighbors

In [30]:
grid = GridSearchCV(estimator = KNeighborsRegressor(), param_grid= {'n_neighbors':[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40], 'weights':['uniform','distance']})

In [31]:
grid.fit(X_train_scaled, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                         14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                         24, 25, 26, 27, 28, 29, 30, 31, ...],
                         'weights': ['uniform', 'distance']})

In [32]:
grid.best_params_

{'n_neighbors': 15, 'weights': 'distance'}

In [33]:
y_pred_test = grid.predict(X_test_scaled)

In [35]:
prediction_array = y_pred_test

In [36]:
np.savetxt('data.csv', prediction_array, delimiter=',')

## KNeighbors 2

In [12]:
grid = GridSearchCV(estimator = KNeighborsRegressor(), param_grid= {'n_neighbors':[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40], 'weights':['uniform','distance'], "algorithm":["auto", "ball_tree", "kd_tree", "brute"]})

In [13]:
grid.fit(X_train_scaled, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                         14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                         24, 25, 26, 27, 28, 29, 30, 31, ...],
                         'weights': ['uniform', 'distance']})

In [14]:
grid.best_params_

{'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}

In [15]:
y_pred_test = grid.predict(X_test_scaled)

In [16]:
prediction_array2 = y_pred_test

In [None]:
#np.savetxt('KN_cookies2_array.csv', prediction_array, delimiter=',')

## Decision Tree

In [10]:
X = cookies2_train.drop(columns = 'quality')


In [11]:
X_1 = cookies2_test.drop(columns = 'quality')


In [12]:
X = pd.concat([X, X_1], axis=0)

In [13]:
n_features = X.shape[1]
n_samples = X.shape[0]

grid_dt = GridSearchCV(estimator = DecisionTreeRegressor(), param_grid = {"criterion": ["mse", "friedman_mse", "mae", "poisson"],
                                                                       "splitter": ["best", "random"],
                                                                       "max_depth": [None,2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
                                                                       'max_features': [None, 'sqrt', 'auto', 'log2', 0.3,0.5,0.7, n_features//2, n_features//3, ],
                                                                        'min_samples_split': [2,0.3,0.5, n_samples//2, n_samples//3, n_samples//5],
                                                                        'min_samples_leaf':[1, 0.3,0.5, n_samples//2, n_samples//3, n_samples//5] }, cv=5)
grid_dt.fit(X_train_scaled, y_train)


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['mse', 'friedman_mse', 'mae', 'poisson'],
                         'max_depth': [None, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20],
                         'max_features': [None, 'sqrt', 'auto', 'log2', 0.3,
                                          0.5, 0.7, 8, 5],
                         'min_samples_leaf': [1, 0.3, 0.5, 2981, 1987, 1192],
                         'min_samples_split': [2, 0.3, 0.5, 2981, 1987, 1192],
                         'splitter': ['best', 'random']})

In [14]:
grid_dt.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 5,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}

In [15]:
y_pred_test_dt = grid_dt.predict(X_test_scaled)

In [16]:
prediction_array_dt_cookies2 = y_pred_test_dt

In [18]:
prediction_array_dt_cookies2

array([7.67292912, 7.67292912, 7.67292912, 8.18552036, 7.67292912,
       7.31222707, 7.67292912, 7.67292912, 8.18552036, 8.04526749,
       7.67292912, 7.70422535, 7.31222707, 7.67292912, 8.61847988,
       7.67292912, 8.04526749, 7.31222707, 8.61847988, 8.18552036,
       8.04526749, 8.04526749, 8.18552036, 7.67292912, 7.67292912,
       7.67292912, 7.66666667, 7.67292912, 7.67292912, 7.67292912,
       8.18552036, 7.67292912, 7.31222707, 7.70422535, 8.18552036,
       8.04526749, 8.61847988, 8.18552036, 6.8625    , 8.04526749,
       8.04526749, 7.67292912, 8.04526749, 7.31222707, 7.31222707,
       7.31222707, 7.67292912, 8.18552036, 7.67292912, 8.04526749,
       7.31222707, 7.67292912, 7.67292912, 8.04526749, 7.31222707,
       8.18552036, 7.67292912, 6.8625    , 8.        , 8.61847988,
       7.67292912, 7.67292912, 6.8625    , 8.18552036, 8.18552036,
       7.67292912, 7.67292912, 7.67292912, 7.67292912, 8.        ,
       8.04526749, 8.61847988, 7.67292912, 7.31222707, 8.04526

In [17]:
np.savetxt('dt_cookies2_array.csv', prediction_array_dt_cookies2, delimiter=',')

## Random Forest

In [11]:
grid_rf = GridSearchCV(estimator = RandomForestRegressor(), param_grid= {"n_estimators":(list(range(175,401,25))), "max_depth":list(range(10,21)), 'criterion': ["mse", "mae"], 'max_features':['auto','sqrt', "log2"]})

In [20]:
grid_rf.fit(X_train_scaled, y_train)


NameError: name 'grid_dt' is not defined

In [None]:
grid_rf.best_params_

In [None]:
y_pred_test_rf = grid_rf.predict(X_test_scaled)

In [None]:
prediction_array_rf_cookies2 = y_pred_test_rf

In [None]:
np.savetxt('rf_cookies2_array.csv', prediction_array_rf_cookies2, delimiter=',')