In [1]:
# Import relevant packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC



In [3]:
# Location of the red-wine quality CSV that this notebook preprocesses.
# Raw string: the backslashes are Windows path separators, not escape
# sequences. The plain literal only worked because "\A", "\d" and "\w" are
# invalid escapes, which newer Python versions warn about.
PATH = r"E:\Apprenticeship\data\winequality-red.csv"
df = pd.read_csv(PATH)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
# Replace the space-separated column names with identifier-friendly ones and
# abbreviate the two sulfur-dioxide columns.
column_aliases = {
    "fixed acidity": "fixed_acidity",
    "volatile acidity": "volatile_acidity",
    "citric acid": "citric_acid",
    "residual sugar": "residual_sugar",
    "free sulfur dioxide": "FSD",
    "total sulfur dioxide": "TSD",
}
df.rename(columns=column_aliases, inplace=True)

Since most features of the data are right-skewed, standardization would not be a good option. 

So, we are left with two choices: Normalization and MinMax Scaling.


Normalization is applied to rows (each sample is rescaled on its own), while standardization is applied to columns (each feature is rescaled across all samples). So, the normalizer assumes that all samples are comparable to one another.

MinMax scaling preserves the structure of the data, only rescaling each column to the [0, 1] range using that column's own minimum and maximum. So, we proceed with MinMaxScaler.

## MIN MAX SCALING

We saw from initial-exploration.ipynb that some features have a range between 0 and a small value, like 1.5. So, we only need to choose features that have a larger range and need to be scaled.

Columns fixed_acidity, residual_sugar, FSD, TSD, pH, alcohol need to be scaled because their values lie in a higher range.

In [13]:
# Threshold on a column's maximum value: columns whose maximum exceeds it are
# min-max scaled, the remaining columns are left as they are.
range_threshold = 2

In [7]:
# Select the feature columns whose maximum value exceeds `range_threshold`.
# 'quality' is the target label, so it is never scaled.
# The maximum is evaluated inline rather than stored in a variable called
# `range`: that name shadowed the builtin `range` for every later cell of the
# notebook, and the value it held was the column maximum, not a range.
cols_to_scale = [
    col
    for col in df.columns
    if col != 'quality' and df[col].max() > range_threshold
]

In [8]:
# Show which columns were selected for scaling.
cols_to_scale

['fixed_acidity', 'residual_sugar', 'FSD', 'TSD', 'pH', 'alcohol']

In [9]:
# Scaler used for the selected columns; (0, 1) is MinMaxScaler's default
# output range, spelled out here for the reader.
mms = MinMaxScaler(feature_range=(0, 1))

In [10]:
# Echo the column names (the same listing the per-column loop used to print).
print(*df.columns, sep="\n")

# Fit the scaler once on every selected column. MinMaxScaler rescales each
# column independently, so the resulting values are the same as scaling the
# columns one at a time. Refitting the single scaler inside a per-column loop
# left `mms` holding only the last column's min/max; fitted this way it keeps
# the parameters of all scaled columns and can be reused later
# (e.g. inverse_transform). Columns not in `cols_to_scale` are left untouched.
if cols_to_scale:  # fit_transform rejects a selection with zero columns
    df[cols_to_scale] = mms.fit_transform(df[cols_to_scale])

fixed_acidity
volatile_acidity
citric_acid
residual_sugar
chlorides
FSD
TSD
density
pH
sulphates
alcohol
quality


In [11]:
# Inspect the frame after scaling.
df

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,FSD,TSD,density,pH,sulphates,alcohol,quality
0,0.247788,0.700,0.00,0.068493,0.076,0.140845,0.098940,0.99780,0.606299,0.56,0.153846,5
1,0.283186,0.880,0.00,0.116438,0.098,0.338028,0.215548,0.99680,0.362205,0.68,0.215385,5
2,0.283186,0.760,0.04,0.095890,0.092,0.197183,0.169611,0.99700,0.409449,0.65,0.215385,5
3,0.584071,0.280,0.56,0.068493,0.075,0.225352,0.190813,0.99800,0.330709,0.58,0.215385,6
4,0.247788,0.700,0.00,0.068493,0.076,0.140845,0.098940,0.99780,0.606299,0.56,0.153846,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,0.141593,0.600,0.08,0.075342,0.090,0.436620,0.134276,0.99490,0.559055,0.58,0.323077,5
1595,0.115044,0.550,0.10,0.089041,0.062,0.535211,0.159011,0.99512,0.614173,0.76,0.430769,6
1596,0.150442,0.510,0.13,0.095890,0.076,0.394366,0.120141,0.99574,0.535433,0.75,0.400000,6
1597,0.115044,0.645,0.12,0.075342,0.075,0.436620,0.134276,0.99547,0.653543,0.71,0.276923,5


## EXPORT PREPROCESSED FILE

In [12]:
# Write the preprocessed frame to disk.
# Raw string: keeps the Windows backslashes literal. The plain literal relied
# on "\A", "\d" and "\p" being invalid escapes, which newer Python versions
# warn about.
# NOTE(review): with pandas' default settings the row index is written as an
# unnamed first column; pass index=False if downstream readers should not see
# it. Confirm with the consumers of this file before changing.
df.to_csv(r'E:\Apprenticeship\data\preprocessed.csv')