In [10]:
# Import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import RobustScaler

In [2]:
# Database credentials

import pymysql
from sqlalchemy import create_engine
import os

# Connection settings. Each value can be overridden through an environment
# variable so that credentials do not have to live in the notebook.
# NOTE(review): the fallback values below are still committed in plain text;
# rotate the password and drop the defaults once the environment is set up.
driver = 'mysql+pymysql:'
user = os.environ.get('COOKIES_DB_USER', 'ironhacker_read')
password = os.environ.get('COOKIES_DB_PASSWORD', 'ir0nhack3r')
ip = os.environ.get('COOKIES_DB_HOST', '35.239.232.23')
database = os.environ.get('COOKIES_DB_NAME', 'cookies')

In [17]:
# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

In [37]:
# Create dataframe

# Build the SQLAlchemy engine from the connection settings defined earlier.
connection_string = f'{driver}//{user}:{password}@{ip}/{database}'
engine = create_engine(connection_string)

# List the tables that exist in the database.
query = 'SHOW TABLES'
user_df = pd.read_sql(sql=query, con=engine)

# Load the whole cookies_quality table; later cells work on copies of it.
query2 = """
SELECT * FROM cookies_quality
"""
cookies_df_original = pd.read_sql(sql=query2, con=engine)

In [248]:
# Work on a deep copy so the frame loaded from the database stays untouched.
cookies_df = cookies_df_original.copy(deep=True)

In [90]:
def graph_box_plots(df):
    '''Draw one horizontal box plot per numeric column of df.

    Only int64/float64 columns are plotted. The plots are laid out on a grid
    four panels wide with as many rows as needed; panels left over in the last
    row are hidden. Nothing is returned; if df has no numeric column, no
    figure is created.
    '''

    # Use the frame that was passed in, not a notebook-level global.
    cols = list(df.select_dtypes(include=['int64','float64']).columns)
    if not cols:
        return

    n_cols = 4
    n_rows = (len(cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False keeps `ax` two-dimensional even for a single row.
    f, ax = plt.subplots(n_rows, n_cols, figsize=(12,10), squeeze=False)

    plt.subplots_adjust(wspace=0.4,hspace=0.4)

    panels = ax.flatten()
    for name, panel in zip(cols, panels):
        # Pass the data as `x=` explicitly so the orientation does not depend
        # on how the installed seaborn interprets a positional argument.
        sns.boxplot(x=df[name], ax=panel)

    for spare in panels[len(cols):]:
        spare.set_visible(False)

In [245]:
def clean_df(df):
    '''First cleaning of DataFrame.

    Casts 'crunch factor' to float, drops every row containing a missing
    value and removes the 'diameter', 'aesthetic appeal' and 'id' columns.

    The input frame is left unmodified; a new DataFrame is returned.
    Raises KeyError if one of the referenced columns is missing.
    '''

    # assign() builds a new frame, so the caller's column keeps its dtype.
    converted = df.assign(**{'crunch factor': df['crunch factor'].astype('float')})

    return converted.dropna().drop(columns=['diameter','aesthetic appeal','id'])

In [246]:
def encode(df):
    '''Encodes categorical features.

    Adds one 0/1 indicator column per mix-in ('nuts', 'chocolate', 'oats',
    'peanut butter') telling whether the 'mixins' text contains that word,
    and a 0/1 'butter_type_int' column derived from 'butter type'. The
    'mixins' and 'butter type' columns are removed, as is a 'raisins' column
    if one is present (no raisins indicator is kept).

    The input frame is left unmodified; a new DataFrame is returned.
    Raises ValueError when 'butter type' does not hold exactly two distinct
    values, because a single indicator column cannot represent it.
    '''

    encoded = df.copy()

    flavour_list = ['nuts', 'chocolate', 'oats', 'peanut butter']

    for flavour in flavour_list:
        # Plain substring match; a missing 'mixins' value counts as "absent".
        has_flavour = encoded['mixins'].str.contains(flavour, regex=False, na=False)
        encoded[flavour] = has_flavour.astype(int)

    butter_dummies = pd.get_dummies(encoded['butter type'], drop_first=True)
    if butter_dummies.shape[1] != 1:
        raise ValueError(
            "'butter type' must contain exactly two distinct values to be "
            "encoded as a single indicator column"
        )
    # Take the one remaining dummy column and store it as integers.
    encoded['butter_type_int'] = butter_dummies.iloc[:, 0].astype(int)

    return encoded.drop(columns=['mixins', 'butter type', 'raisins'], errors='ignore')

In [247]:
def cleanOutliers(df, cols=None, factor=3):
    '''Returns a Dataframe without outliers.

    A value is an outlier when it is not strictly inside
    (Q1 - factor * IQR, Q3 + factor * IQR) for its column. Rows holding an
    outlier in any of the checked columns are dropped, and so are rows with a
    missing value in any column of the frame.

    Parameters
    ----------
    df : DataFrame to filter. It is left unmodified.
    cols : columns to check; defaults to the numeric cookie features.
    factor : IQR multiplier for the fences (default 3).

    Returns a new DataFrame. Raises KeyError if a checked column is missing.
    '''

    if cols is None:
        cols = ['sugar to flour ratio', 'sugar index', 'bake temp', 'chill time',
           'calories', 'density', 'pH', 'grams baking soda', 'bake time',
           'quality', 'weight', 'crunch factor']

    # Build one boolean row mask instead of writing NaN into the input frame.
    keep = np.ones(len(df), dtype=bool)

    for col in cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # NaN compares False on both sides, so missing values are dropped too.
        inside = (df[col] > q1 - factor * iqr) & (df[col] < q3 + factor * iqr)
        keep &= inside.to_numpy()

    return df[keep].dropna()

In [259]:
def conversion(x):
    '''Map a numeric quality score to a class: 0 (<= 6), 1 (between), 2 (>= 9).'''

    # Guard clauses for the two extreme classes; everything else is class 1.
    if x <= 6:
        return 0
    if x >= 9:
        return 2
    return 1

In [260]:
def convert_quality(df):
    '''Create new categorical quality target.

    Adds a 'quality_label' column to df by passing every 'quality' value
    through conversion() (0 for <= 6, 2 for >= 9, 1 otherwise).

    The column is added to df in place, and df is also returned so the
    function can be used like the other pipeline helpers.
    '''

    df['quality_label'] = df['quality'].apply(conversion)

    return df

In [116]:
# Import regression models

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [212]:
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [124]:
def prepare_for_scaling(df):
    '''Return a copy of df without the target and the 0/1 indicator columns.

    The removed columns are 'quality', the mix-in indicators and
    'butter_type_int'; what is left is the set of columns the scalers act on.
    Raises KeyError if any of those columns is missing.
    '''
    excluded = ['quality','nuts', 'chocolate', 'oats', 'peanut butter','butter_type_int']
    return df.drop(columns=excluded)

In [135]:
def normalizer(df):
    '''Apply sklearn's Normalizer to the columns kept by prepare_for_scaling.

    Returns whatever Normalizer.transform returns for those columns.
    '''
    features = prepare_for_scaling(df)
    scaler = Normalizer()
    scaler.fit(features)  # fit does nothing.
    return scaler.transform(features)

In [253]:
def robust_scaler(df):
    '''Apply sklearn's RobustScaler to the columns kept by prepare_for_scaling.

    The scaler is fitted on the same data it transforms. Returns whatever
    RobustScaler.transform returns for those columns.
    '''
    features = prepare_for_scaling(df)
    scaler = RobustScaler()
    scaler.fit(features)
    return scaler.transform(features)

In [255]:
def min_max_scaler(df):
    '''Apply sklearn's MinMaxScaler to the columns kept by prepare_for_scaling.

    The scaler is fitted on the same data it transforms. Returns whatever
    MinMaxScaler.transform returns for those columns.
    '''
    features = prepare_for_scaling(df)
    scaler = MinMaxScaler()
    scaler.fit(features)
    return scaler.transform(features)

In [256]:
def standard_scaler(df):
    '''Apply sklearn's StandardScaler to the columns kept by prepare_for_scaling.

    The scaler is fitted on the same data it transforms. Returns whatever
    StandardScaler.transform returns for those columns.
    '''
    features = prepare_for_scaling(df)
    fitted = StandardScaler().fit(features)
    scaled = fitted.transform(features)
    return scaled

In [173]:
def create_definitive_df(array, df):
    '''Re-attach the unscaled columns to an array of scaled features.

    Parameters
    ----------
    array : 2-D array of scaled values, one column per column of df that is
        not in the pass-through list below, in the same order as in df.
    df : the encoded DataFrame the array was computed from.

    Returns a DataFrame with a fresh 0..n-1 index holding the scaled columns
    (labelled with their names from df) followed by 'quality', the mix-in
    indicators and 'butter_type_int' taken unchanged from df.
    '''
    column_list = ['quality', 'nuts', 'chocolate', 'oats', 'peanut butter','butter_type_int']

    # Keep df's own column order: a set would label the array columns in an
    # arbitrary order and attach the wrong names to the scaled features.
    scaled_columns = [name for name in df.columns if name not in column_list]

    normalized_df = pd.DataFrame(array, columns=scaled_columns)

    # Reset the index so both parts line up row by row in the concat.
    untouched_df = df[column_list].reset_index(drop=True)

    return pd.concat([normalized_df, untouched_df], axis=1)

In [97]:
# Fixed seed so the randomised models (decision tree, random forest) give
# the same cross-validation scores every time the notebook is re-run.
SEED = 42

lin_model = LinearRegression()
knn_model = KNeighborsRegressor()
tree_model = DecisionTreeRegressor(random_state=SEED)
forest_model = RandomForestRegressor(random_state=SEED)
svr_model = SVR()

In [178]:
# Order matters: the TRY cells pair these positionally with `models_names`.
models = [
    lin_model,
    knn_model,
    tree_model,
    forest_model,
    svr_model,
]

In [179]:
from sklearn.model_selection import cross_val_score

In [184]:
import warnings

In [185]:
warnings.filterwarnings('ignore')

In [197]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try
parameter_grid = {'max_depth': list(range(1, 6)),
                  'max_features': list(range(1, 5))}

# Stratified 10-fold splitter used for the search
cross_validation = StratifiedKFold(n_splits=10)

# Grid search over the random forest regressor (forest_model) with that splitter
grid_search = GridSearchCV(estimator=forest_model,
                           param_grid=parameter_grid,
                           cv=cross_validation)

# NOTE(review): X and y are only assigned inside the "TRY NUMBER" cells, so one
# of those cells has to be executed before this one.
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

Best score: 0.6507115819768131
Best parameters: {'max_depth': 5, 'max_features': 4}


## TRY NUMBER 1

In [250]:
# With outliers and scaling = normalizer

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> encode -> scale with normalizer() -> re-attach target/indicators
cookies_cleaned = clean_df(cookies_df)
cookies_encoded = encode(cookies_cleaned)
cookies_normalized = normalizer(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized,cookies_encoded)

y = cookies_concat['quality']
X = cookies_concat.drop('quality',axis=1)

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# One (model position, mean 10-fold R2) tuple per entry of `models`
scores_list_try_1 = [
    (position, np.mean(cross_val_score(estimator,X,y,cv=10,scoring='r2')))
    for position, estimator in enumerate(models)
]

for position, mean_r2 in scores_list_try_1:
    print(f'{models_names[position]}: {mean_r2}')

Linear Regression: 0.6335077371507825
KNN: 0.636726043535897
DecisionTree: 0.5369759808036288
RandomForest: 0.7228831493738137
SVR: 0.5783946808893693


## TRY NUMBER 2

In [249]:
# Without outliers and scaling = normalizer

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> drop outliers -> encode -> scale with normalizer() -> re-attach target/indicators
cookies_cleaned = clean_df(cookies_df)
cookies_notOL = cleanOutliers(cookies_cleaned)
cookies_encoded = encode(cookies_notOL)
cookies_normalized = normalizer(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized,cookies_encoded)

y = cookies_concat['quality']
X = cookies_concat.drop('quality',axis=1)

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# One (model position, mean 10-fold R2) tuple per entry of `models`
scores_list_try_2 = [
    (position, np.mean(cross_val_score(estimator,X,y,cv=10,scoring='r2')))
    for position, estimator in enumerate(models)
]

for position, mean_r2 in scores_list_try_2:
    print(f'{models_names[position]}: {mean_r2}')

Linear Regression: 0.6529285617343934
KNN: 0.6197636572312084
DecisionTree: 0.5356228020742833
RandomForest: 0.7194131725221318
SVR: 0.5555830601238809


## TRY NUMBER 3

In [254]:
# Without outliers and scaling = robust

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> drop outliers -> encode -> scale with robust_scaler() -> re-attach target/indicators
cookies_cleaned = clean_df(cookies_df)
cookies_notOL = cleanOutliers(cookies_cleaned)
cookies_encoded = encode(cookies_notOL)
cookies_normalized = robust_scaler(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized,cookies_encoded)

y = cookies_concat['quality']
X = cookies_concat.drop('quality',axis=1)

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# One (model position, mean 10-fold R2) tuple per entry of `models`
scores_list_try_3 = [
    (position, np.mean(cross_val_score(estimator,X,y,cv=10,scoring='r2')))
    for position, estimator in enumerate(models)
]

for position, mean_r2 in scores_list_try_3:
    print(f'{models_names[position]}: {mean_r2}')

Linear Regression: 0.6637252256199793
KNN: 0.677266778866967
DecisionTree: 0.558288539229331
RandomForest: 0.7410952939496223
SVR: 0.7053956141360717


## TRY NUMBER 4

In [258]:
# Without outliers and scaling = minmaxscaler()

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> drop outliers -> encode -> scale with min_max_scaler() -> re-attach target/indicators
cookies_cleaned = clean_df(cookies_df)
cookies_notOL = cleanOutliers(cookies_cleaned)
cookies_encoded = encode(cookies_notOL)
cookies_normalized = min_max_scaler(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized,cookies_encoded)

y = cookies_concat['quality']
X = cookies_concat.drop('quality',axis=1)

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# One (model position, mean 10-fold R2) tuple per entry of `models`
scores_list_try_4 = [
    (position, np.mean(cross_val_score(estimator,X,y,cv=10,scoring='r2')))
    for position, estimator in enumerate(models)
]

for position, mean_r2 in scores_list_try_4:
    print(f'{models_names[position]}: {mean_r2}')

Linear Regression: 0.6637252256199795
KNN: 0.6582225567103663
DecisionTree: 0.5525740749009879
RandomForest: 0.7441172725837208
SVR: 0.6746629267940587


## TRY NUMBER 5

In [271]:
# Without outliers and scaling = standardscaler()

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> drop outliers -> encode -> scale with standard_scaler() -> re-attach target/indicators
cookies_cleaned = clean_df(cookies_df)
cookies_notOL = cleanOutliers(cookies_cleaned)
cookies_encoded = encode(cookies_notOL)
cookies_normalized = standard_scaler(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized,cookies_encoded)

y = cookies_concat['quality']
X = cookies_concat.drop('quality',axis=1)

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# One (model position, mean 10-fold R2) tuple per entry of `models`
scores_list_try_5 = [
    (position, np.mean(cross_val_score(estimator,X,y,cv=10,scoring='r2')))
    for position, estimator in enumerate(models)
]

for position, mean_r2 in scores_list_try_5:
    print(f'{models_names[position]}: {mean_r2}')

Linear Regression: 0.6637252256199793
KNN: 0.6693200768337431
DecisionTree: 0.5585071842092668
RandomForest: 0.717415692935818
SVR: 0.7077009882157587
