In [123]:
import numpy as np
import pandas as pd

In [124]:
# Relative path of the raw wine dataset CSV
DATAPATH = 'data/wine_dataset.csv'
# Load the CSV into a DataFrame and preview its first rows
data = pd.read_csv(DATAPATH)
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,style
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [125]:
# (rows, columns) of the freshly loaded frame
data.shape

(6497, 13)

In [126]:
#as each feature has a different range, min-max normalize it
def normalize(x):
    """Min-max scale ``x`` so that its values lie in [0, 1].

    Parameters
    ----------
    x : pandas.Series or numpy.ndarray
        Numeric values to rescale.

    Returns
    -------
    Same kind of object as ``x``
        ``(x - min) / (max - min)``, computed element-wise. Missing values
        (NaN) are ignored when finding the extrema and stay NaN in the
        result. If every value is identical (zero range) the result is all
        zeros instead of a division by zero.
    """
    # NaN-aware extrema: the builtin min()/max() return order-dependent
    # results as soon as a NaN is present.
    min_x = np.nanmin(x)
    max_x = np.nanmax(x)
    span = max_x - min_x
    if span == 0:
        # Constant input: avoid 0/0 and map every value to 0.0.
        return (x - min_x) * 0.0
    return (x - min_x) / span

In [127]:
# List the column names available in the frame
data.columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'style'],
      dtype='object')

In [128]:
# Numeric feature columns that get a min-max scaled companion column
cols = [
    'fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
    'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]
# Store the scaled version of each feature under 'norm_<feature name>'
for col in cols:
    data['norm_{}'.format(col)] = normalize(data[col])

In [129]:
# Preview the frame now that the norm_* columns have been added
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,...,norm_volatile_acidity,norm_citric_acid,norm_residual_sugar,norm_chlorides,norm_free_sulfur_dioxide,norm_total_sulfur_dioxide,norm_density,norm_pH,norm_sulphates,norm_alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,...,0.533333,0.0,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,...,0.453333,0.024096,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,...,0.133333,0.337349,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899


The goal is to predict the quality of a wine, which in turn can help improve sales.
To prepare the data, we one-hot encode the `style` column and group the `quality` ratings into categories.

In [130]:
#style column one-hot encoding
# dtype is pinned so the dummies are 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return boolean True/False columns).
style_R_W = pd.get_dummies(data['style'], dtype='uint8')
style_R_W.head()

Unnamed: 0,red,white
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [131]:
# Append the one-hot style columns to the frame, side by side (axis=1).
# NOTE: running this cell more than once appends the dummy columns again.
data = pd.concat([data, style_R_W], axis=1)
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,...,norm_residual_sugar,norm_chlorides,norm_free_sulfur_dioxide,norm_total_sulfur_dioxide,norm_density,norm_pH,norm_sulphates,norm_alcohol,red,white
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,...,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,...,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,...,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087,1,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0


Since we care about the overall quality of a wine rather than its exact rating on the 1 to 10 scale, and because
only a limited amount of data is available, I group the quality into 3 categories, i.e. 'good', 'ok', 'bad'.

In [132]:
#lets categorize the quality column
#the labels are turned into integer codes in a later cell (intended: bad = 0, ok = 1, good = 2)
def categorize_quality(x):
    """Convert numeric quality scores into 'bad' / 'ok' / 'good' labels.

    Parameters
    ----------
    x : pandas.Series
        Numeric quality scores.

    Returns
    -------
    pandas.Series
        Object-dtype Series with the same index as ``x``:
        score <= 5 -> 'bad', 6 <= score <= 7 -> 'ok', score >= 8 -> 'good'.
        A value matching none of the rules (e.g. NaN, or a score strictly
        between 5 and 6) is reported with a printed message and kept as is.
    """
    is_bad = x <= 5
    is_ok = (x >= 6) & (x <= 7)
    is_good = x >= 8

    # Work on an object-dtype copy so string labels can be stored without
    # touching the caller's data or upcasting an integer column in place.
    labels = x.astype(object)
    labels[is_bad] = 'bad'
    labels[is_ok] = 'ok'
    labels[is_good] = 'good'

    # Report, by index label, every row that none of the rules covered.
    unmatched = ~(is_bad | is_ok | is_good)
    for idx in x.index[unmatched]:
        print('wrong value at index {}'.format(idx))
    return labels

data['quality'] = categorize_quality(data['quality'])

In [137]:
# Encode the labels with an explicit, fixed order so that bad = 0, ok = 1, good = 2
# no matter which label happens to appear first in the data.
# Any value that is not one of the three labels is encoded as -1.
# NOTE: run this once, on the string-labelled column; re-running it on the
# already encoded integers would turn every value into -1.
QUALITY_LEVELS = ['bad', 'ok', 'good']
data['quality'] = pd.Categorical(data['quality'], categories=QUALITY_LEVELS).codes.astype('int64')

In [138]:
# Preview the frame after the quality column has been encoded
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,...,norm_residual_sugar,norm_chlorides,norm_free_sulfur_dioxide,norm_total_sulfur_dioxide,norm_density,norm_pH,norm_sulphates,norm_alcohol,red,white
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,...,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087,1,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,...,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087,1,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,...,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087,1,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0


In [139]:
#drop the original columns that have been replaced by transformed versions
# (raw features -> norm_* columns, 'style' -> red/white dummy columns)
drop_cols = ['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol','style']

# Rebind the name instead of using inplace=True; drop() returns a new frame
# without the listed columns.
data = data.drop(columns=drop_cols)

In [140]:
# Preview the frame after dropping the replaced columns
data.head()

Unnamed: 0,quality,norm_fixed_acidity,norm_volatile_acidity,norm_citric_acid,norm_residual_sugar,norm_chlorides,norm_free_sulfur_dioxide,norm_total_sulfur_dioxide,norm_density,norm_pH,norm_sulphates,norm_alcohol,red,white
0,0,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0
1,0,0.330579,0.533333,0.0,0.030675,0.147841,0.083333,0.140553,0.186813,0.372093,0.258427,0.26087,1,0
2,0,0.330579,0.453333,0.024096,0.026074,0.137874,0.048611,0.110599,0.190669,0.418605,0.241573,0.26087,1,0
3,1,0.61157,0.133333,0.337349,0.019939,0.109635,0.055556,0.124424,0.209948,0.341085,0.202247,0.26087,1,0
4,0,0.297521,0.413333,0.0,0.019939,0.111296,0.034722,0.064516,0.206092,0.612403,0.191011,0.202899,1,0


In [141]:
# (rows, columns) of the preprocessed frame
data.shape

(6497, 14)

In [122]:
#write the preprocessed frame out as a csv file
OUTPUT_PATH = 'data/basic_preprocessed.csv'
data.to_csv(OUTPUT_PATH)