# Import libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
import pymysql
from sqlalchemy import create_engine

In [3]:
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import RobustScaler

In [4]:
# Import regression models

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [5]:
# Import scaling libraries

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
import warnings

# Database credentials

In [8]:
# Connection settings for the cookies MySQL database.
# Every value except the driver can be overridden through an environment
# variable, so credentials do not have to be edited (or committed) in the
# notebook. The literals are only fallbacks that keep the notebook runnable
# when no environment variable is set.
# NOTE(review): the fallback password is still stored in plain text here --
# remove it if this account is not meant to be public.
import os

driver = 'mysql+pymysql:'
user = os.environ.get('COOKIES_DB_USER', 'ironhacker_read')
password = os.environ.get('COOKIES_DB_PASSWORD', 'ir0nhack3r')
ip = os.environ.get('COOKIES_DB_HOST', '35.239.232.23')
database = os.environ.get('COOKIES_DB_NAME', 'cookies')

In [9]:
# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

# Create dataframe

In [10]:
# Build the SQLAlchemy connection URL from the credential variables defined
# in the credentials cell and create the engine used by both queries below.
connection_string = f'{driver}//{user}:{password}@{ip}/{database}'
engine = create_engine(connection_string)

# List the tables available in the database.
query = 'SHOW TABLES'

# Despite its name, user_df holds the result of SHOW TABLES (the table list).
user_df = pd.read_sql(query,engine)
# NOTE(review): a bare expression is only rendered when it is the last line of
# a cell, so this line displays nothing as written.
user_df


# Load the whole cookies_quality table. This frame is kept as the untouched
# reference and is copied before each experiment.
query2 = """
SELECT * FROM cookies_quality
"""

cookies_df_original = pd.read_sql(query2, engine)

In [11]:
# Work on a deep copy so the frame loaded from the database stays untouched.
cookies_df = cookies_df_original.copy(deep=True)

# Functions 

In [12]:
def graph_box_plots(df):
    '''
    Input: DataFrame

    # Draw one box plot per numeric (int64/float64) column of df on a grid of
    # subplots with 4 plots per row. Nothing is returned; the figure is drawn
    # through matplotlib.
    '''
    # Use the frame that was passed in, not a notebook-level global.
    cols = list(df.select_dtypes(include=['int64','float64']).columns)
    if not cols:
        # No numeric column: nothing to draw.
        return

    n_cols = 4
    # Ceiling division: enough rows to hold every numeric column.
    n_rows = -(-len(cols) // n_cols)

    # squeeze=False always yields a 2-D array of axes, even for a single row,
    # so the flattening below works for any number of columns.
    f, ax = plt.subplots(n_rows, n_cols, figsize=(12,10), squeeze=False)

    plt.subplots_adjust(wspace=0.4,hspace=0.4)

    axes = ax.ravel()
    for axis, name in zip(axes, cols):
        # Pass the series by keyword: recent seaborn releases no longer accept
        # it as the first positional argument with the same meaning.
        sns.boxplot(x=df[name], ax=axis)

    # Hide the cells of the grid that have no column to show.
    for axis in axes[len(cols):]:
        axis.set_visible(False)

In [13]:
def clean_df(df):
    '''
    Input: DataFrame
    Output: DataFrame

    # Cast 'crunch factor' to float, drop every row containing a NaN and drop
    # the 'diameter', 'aesthetic appeal' and 'id' columns.
    # The input DataFrame is left untouched; a new DataFrame is returned.
    '''

    # assign() builds a new frame, so the caller's DataFrame is not modified
    # (the previous in-place column assignment changed it as a side effect).
    cleaned = df.assign(**{'crunch factor': df['crunch factor'].astype('float')})
    cleaned = cleaned.dropna()
    cleaned = cleaned.drop(['diameter','aesthetic appeal','id'], axis=1)

    return cleaned

In [14]:
def encode(df):
    '''
    Input: DataFrame
    Output: DataFrame

    # One hot encodes the categorical features.
    # - 'mixins': one 0/1 column per flavour, set to 1 when the flavour name
    #   appears in the 'mixins' text. The 'raisins' column is dropped so that
    #   the encoded columns are not redundant.
    # - 'butter type': a single 0/1 column 'butter_type_int' (first category
    #   dropped).
    # The original 'mixins' and 'butter type' columns are removed.
    # The input DataFrame is left untouched; a new DataFrame is returned.
    #
    # Raises ValueError when 'butter type' does not have exactly two
    # categories, because it cannot then be stored in a single column.
    '''

    flavour_list = ['raisins', 'nuts', 'chocolate', 'oats', 'peanut butter']

    # Work on a copy so the caller's frame is not modified.
    encoded = df.copy()

    for flavour in flavour_list:
        # regex=False: flavours are plain substrings.
        # na=False: a missing 'mixins' value counts as "flavour absent"
        # instead of being treated as truthy and encoded as 1.
        present = encoded['mixins'].str.contains(flavour, regex=False, na=False)
        encoded[flavour] = present.astype(int)

    # dtype=int keeps the column numeric whatever the pandas version
    # (get_dummies returns booleans by default in recent releases).
    butter_dummies = pd.get_dummies(encoded['butter type'], drop_first=True, dtype=int)
    if butter_dummies.shape[1] != 1:
        raise ValueError(
            "'butter type' must have exactly two categories to be encoded "
            f"in one column, got {butter_dummies.shape[1] + 1}"
        )
    encoded['butter_type_int'] = butter_dummies.iloc[:, 0]

    encoded = encoded.drop(['mixins','butter type','raisins'],axis=1)

    return encoded

In [15]:
def cleanOutliers(df, cols=None, factor=3):
    '''
    Input: DataFrame, optional list of column names, optional IQR multiplier
    Output: DataFrame

    # Returns a DataFrame without outliers.
    # For every column in cols, a value is kept only when it lies strictly
    # between q1 - factor * iqr and q3 + factor * iqr (q1/q3 are the 25% and
    # 75% quantiles of that column). Rows with at least one rejected value,
    # or with any NaN, are dropped.
    # cols defaults to the numeric columns of the cookies dataset and factor
    # defaults to 3, which reproduces the previous fixed behaviour.
    # The input DataFrame is left untouched; a new DataFrame is returned.
    '''

    if cols is None:
        cols = ['sugar to flour ratio', 'sugar index', 'bake temp', 'chill time',
           'calories', 'density', 'pH', 'grams baking soda', 'bake time',
           'quality', 'weight', 'crunch factor']

    # Work on a copy so the caller's frame is not modified.
    filtered = df.copy()

    for col in cols:
        # Quantiles come from the untouched input column.
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        inside = (df[col] > q1 - factor * iqr) & (df[col] < q3 + factor * iqr)
        # Vectorised replacement of the row-by-row apply: values outside the
        # fences become NaN and are removed by the dropna below.
        filtered[col] = df[col].where(inside)

    filtered = filtered.dropna()

    return filtered

In [16]:
def conversion(x):
    '''
    Input: a single numeric value
    Output: int

    # Maps a quality score to a label:
    #   0 when the score is 6 or lower,
    #   2 when the score is 9 or higher,
    #   1 for everything else.
    '''

    if x <= 6:
        return 0
    if x >= 9:
        return 2
    return 1

In [17]:
def convert_quality(df):
    '''
    Input: DataFrame
    Output: DataFrame

    # Adds a 'quality_label' column obtained by applying the conversion
    # function to every value of the 'quality' column.
    # The column is added to df in place, and the same DataFrame is returned
    # so the result of the call can be assigned by the caller.
    '''

    df['quality_label'] = df['quality'].apply(conversion)

    # The function used to return None, which broke callers that assign its
    # result to a variable.
    return df

In [18]:
def prepare_for_scaling(df):
    '''
    Input: DataFrame
    Output: DataFrame

    # Returns a new DataFrame without the 'quality' column and without the
    # one hot encoded columns, i.e. only the columns that get scaled.
    '''
    excluded = ['quality', 'nuts', 'chocolate', 'oats', 'peanut butter', 'butter_type_int']
    features_only = df.drop(columns=excluded)

    return features_only

In [None]:
def normalizer(df):
    '''
    Input: DataFrame
    Output: Array

    # Removes the target and one hot encoded columns (prepare_for_scaling),
    # then rescales each sample (row) individually to unit norm with
    # sklearn's Normalizer.
    '''
    features = prepare_for_scaling(df)

    return Normalizer().fit(features).transform(features)

In [None]:
def robust_scaler(df):
    '''
    Input: DataFrame
    Output: Array

    # Removes the target and one hot encoded columns (prepare_for_scaling),
    # then scales the remaining features with sklearn's RobustScaler, which
    # relies on statistics that are robust to outliers.
    '''
    features = prepare_for_scaling(df)

    return RobustScaler().fit(features).transform(features)

In [None]:
def min_max_scaler(df):
    '''
    Input: DataFrame
    Output: Array

    # Removes the target and one hot encoded columns (prepare_for_scaling),
    # then rescales each remaining feature to MinMaxScaler's default range.
    '''
    features = prepare_for_scaling(df)

    return MinMaxScaler().fit(features).transform(features)

In [None]:
def standard_scaler(df):
    '''
    Input: DataFrame
    Output: Array

    # Removes the target and one hot encoded columns (prepare_for_scaling),
    # then standardizes the remaining features with sklearn's StandardScaler
    # (mean removed, scaled to unit variance).
    '''
    features = prepare_for_scaling(df)

    return StandardScaler().fit(features).transform(features)

In [None]:
def create_definitive_df(array, df):
    '''
    Input: Array, DataFrame
    Output: DataFrame

    # Converts the normalized/standardized array to a DataFrame and concats the
    # result to the one hot encoded and quality columns of df.
    # array is expected to hold the scaled values of every column of df that
    # is not in column_list, in the same column order as in df.
    # The returned DataFrame has a fresh 0..n-1 index.
    '''

    column_list = ['quality', 'nuts', 'chocolate', 'oats', 'peanut butter','butter_type_int']

    # Keep the original column order. A set difference has no guaranteed
    # order, so it could attach the labels to the wrong array columns, and
    # recent pandas versions refuse a set as `columns`.
    normalized_columns = [col for col in df.columns if col not in column_list]

    normalized_df = pd.DataFrame(array, columns=normalized_columns)
    # reset_index aligns the untouched columns with the fresh RangeIndex of
    # normalized_df so that concat matches rows by position.
    concat_df = pd.concat([normalized_df, df[column_list].reset_index(drop=True)], axis=1,ignore_index=False)

    return concat_df

# Creating models

In [None]:
# Seed shared by the estimators that use randomness, so that the
# cross-validation scores are the same from one run of the notebook to the next.
RANDOM_STATE = 42

# One estimator per algorithm, all with default hyper-parameters.
lin_model = LinearRegression()
knn_model = KNeighborsRegressor()
tree_model = DecisionTreeRegressor(random_state=RANDOM_STATE)
forest_model = RandomForestRegressor(random_state=RANDOM_STATE)
svr_model = SVR()

# List with the different models that will be used

In [None]:
# Order matters: the experiment cells label the score of the model at position
# i with models_names[i], so this list must stay aligned with models_names.
models = [lin_model,knn_model,tree_model,forest_model,svr_model]

In [None]:
# Silence every warning (warnings, not errors) from this point on; the
# original author expected them mostly from the cross-validation calls.
warnings.simplefilter('ignore')

# Trying different models

## TRY NUMBER 1

In [None]:
# Experiment 1: outliers kept, scaling = Normalizer

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> one hot encode -> scale -> re-attach target and dummies.
cookies_cleaned = clean_df(cookies_df)
cookies_encoded = encode(cookies_cleaned)
cookies_normalized = normalizer(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized, cookies_encoded)

# Target is the raw quality score; every other column is a feature.
y = cookies_concat['quality']
X = cookies_concat.drop(columns='quality')

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# Mean 10-fold R2 of each model, stored as (position in models, mean score).
scores_list_try_1 = []
for i, model in enumerate(models):
    cv = cross_val_score(model, X, y, cv=10, scoring='r2')
    scores_list_try_1.append((i, cv.mean()))

for i, mean_r2 in scores_list_try_1:
    print(f'{models_names[i]}: {mean_r2}')

## TRY NUMBER 2

In [None]:
# Experiment 2: outliers removed, scaling = Normalizer

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> drop outliers -> one hot encode -> scale -> re-attach
# target and dummies.
cookies_cleaned = clean_df(cookies_df)
cookies_notOL = cleanOutliers(cookies_cleaned)
cookies_encoded = encode(cookies_notOL)
cookies_normalized = normalizer(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized, cookies_encoded)

# Target is the raw quality score; every other column is a feature.
y = cookies_concat['quality']
X = cookies_concat.drop(columns='quality')

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# Mean 10-fold R2 of each model, stored as (position in models, mean score).
scores_list_try_2 = []
for i, model in enumerate(models):
    cv = cross_val_score(model, X, y, cv=10, scoring='r2')
    scores_list_try_2.append((i, cv.mean()))

for i, mean_r2 in scores_list_try_2:
    print(f'{models_names[i]}: {mean_r2}')

## TRY NUMBER 3

In [None]:
# Experiment 3: outliers removed, scaling = RobustScaler

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> drop outliers -> one hot encode -> scale -> re-attach
# target and dummies.
cookies_cleaned = clean_df(cookies_df)
cookies_notOL = cleanOutliers(cookies_cleaned)
cookies_encoded = encode(cookies_notOL)
# The name is kept for consistency with the other experiments, but the values
# are robust-scaled here, not normalized.
cookies_normalized = robust_scaler(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized, cookies_encoded)

# Target is the raw quality score; every other column is a feature.
y = cookies_concat['quality']
X = cookies_concat.drop(columns='quality')

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# Mean 10-fold R2 of each model, stored as (position in models, mean score).
scores_list_try_3 = []
for i, model in enumerate(models):
    cv = cross_val_score(model, X, y, cv=10, scoring='r2')
    scores_list_try_3.append((i, cv.mean()))

for i, mean_r2 in scores_list_try_3:
    print(f'{models_names[i]}: {mean_r2}')

## TRY NUMBER 4 (BEST TRY)

In [None]:
# Experiment 4: outliers removed, scaling = MinMaxScaler

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> drop outliers -> one hot encode -> scale -> re-attach
# target and dummies.
cookies_cleaned = clean_df(cookies_df)
cookies_notOL = cleanOutliers(cookies_cleaned)
cookies_encoded = encode(cookies_notOL)
# The name is kept for consistency with the other experiments, but the values
# are min-max scaled here, not normalized.
cookies_normalized = min_max_scaler(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized, cookies_encoded)

# Target is the raw quality score; every other column is a feature.
y = cookies_concat['quality']
X = cookies_concat.drop(columns='quality')

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# Mean 10-fold R2 of each model, stored as (position in models, mean score).
scores_list_try_4 = []
for i, model in enumerate(models):
    cv = cross_val_score(model, X, y, cv=10, scoring='r2')
    scores_list_try_4.append((i, cv.mean()))

for i, mean_r2 in scores_list_try_4:
    print(f'{models_names[i]}: {mean_r2}')

## TRY NUMBER 5

In [None]:
# Experiment 5: outliers removed, scaling = StandardScaler

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> drop outliers -> one hot encode -> scale -> re-attach
# target and dummies.
cookies_cleaned = clean_df(cookies_df)
cookies_notOL = cleanOutliers(cookies_cleaned)
cookies_encoded = encode(cookies_notOL)
# The name is kept for consistency with the other experiments, but the values
# are standardized here, not normalized.
cookies_normalized = standard_scaler(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized, cookies_encoded)

# Target is the raw quality score; every other column is a feature.
y = cookies_concat['quality']
X = cookies_concat.drop(columns='quality')

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# Mean 10-fold R2 of each model, stored as (position in models, mean score).
scores_list_try_5 = []
for i, model in enumerate(models):
    cv = cross_val_score(model, X, y, cv=10, scoring='r2')
    scores_list_try_5.append((i, cv.mean()))

for i, mean_r2 in scores_list_try_5:
    print(f'{models_names[i]}: {mean_r2}')

## TRY NUMBER 6

In [None]:
# Experiment 6: outliers kept, scaling = RobustScaler

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> one hot encode -> scale -> re-attach target and dummies.
cookies_cleaned = clean_df(cookies_df)
cookies_encoded = encode(cookies_cleaned)
# The name is kept for consistency with the other experiments, but the values
# are robust-scaled here, not normalized.
cookies_normalized = robust_scaler(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized, cookies_encoded)

# Target is the raw quality score; every other column is a feature.
y = cookies_concat['quality']
X = cookies_concat.drop(columns='quality')

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# Mean 10-fold R2 of each model, stored as (position in models, mean score).
scores_list_try_6 = []
for i, model in enumerate(models):
    cv = cross_val_score(model, X, y, cv=10, scoring='r2')
    scores_list_try_6.append((i, cv.mean()))

for i, mean_r2 in scores_list_try_6:
    print(f'{models_names[i]}: {mean_r2}')

## TRY NUMBER 7

In [None]:
# Experiment 7: outliers kept, scaling = StandardScaler

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> one hot encode -> scale -> re-attach target and dummies.
cookies_cleaned = clean_df(cookies_df)
cookies_encoded = encode(cookies_cleaned)
# The name is kept for consistency with the other experiments, but the values
# are standardized here, not normalized.
cookies_normalized = standard_scaler(cookies_encoded)
cookies_concat = create_definitive_df(cookies_normalized, cookies_encoded)

# Target is the raw quality score; every other column is a feature.
y = cookies_concat['quality']
X = cookies_concat.drop(columns='quality')

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# Mean 10-fold R2 of each model, stored as (position in models, mean score).
scores_list_try_7 = []
for i, model in enumerate(models):
    cv = cross_val_score(model, X, y, cv=10, scoring='r2')
    scores_list_try_7.append((i, cv.mean()))

for i, mean_r2 in scores_list_try_7:
    print(f'{models_names[i]}: {mean_r2}')

## TRY NUMBER 8

In [None]:
# Experiment 8: outliers removed, scaling = MinMaxScaler, and the target is
# the 3-level quality label (0/1/2) instead of the raw quality score.

cookies_df = cookies_df_original.copy()

# Pipeline: clean -> drop outliers -> one hot encode -> scale -> re-attach
# target and dummies.
cookies_cleaned = clean_df(cookies_df)
cookies_notOL = cleanOutliers(cookies_cleaned)
cookies_encoded = encode(cookies_notOL)
# The name is kept for consistency with the other experiments, but the values
# are min-max scaled here, not normalized.
cookies_normalized = min_max_scaler(cookies_encoded)

cookies_concat_pref = create_definitive_df(cookies_normalized,cookies_encoded)
# convert_quality adds the 'quality_label' column to the frame it receives.
# Its return value is deliberately not used: assigning it made cookies_concat
# None and the next statement crashed.
convert_quality(cookies_concat_pref)
cookies_concat = cookies_concat_pref

# Both the raw score and its label are removed from the features so the
# target does not leak into X.
X = cookies_concat.drop(columns=['quality','quality_label'])
y = cookies_concat['quality_label']

models_names = ['Linear Regression','KNN','DecisionTree','RandomForest','SVR']

# Mean 10-fold R2 of each model, stored as (position in models, mean score).
# NOTE(review): the models are regressors scored with R2 on a 3-class label;
# confirm this is intended rather than using classifiers.
scores_list_try_8 = []
for i, model in enumerate(models):
    cv = cross_val_score(model, X, y, cv=10, scoring='r2')
    scores_list_try_8.append((i, cv.mean()))

for i, mean_r2 in scores_list_try_8:
    print(f'{models_names[i]}: {mean_r2}')