# Normalization and Standardization

In [25]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [2]:
# Load the wine data. The CSV has no header row (header=None) and only the
# first three columns are read: the class label and two features.
# `pd.read_csv` is the public entry point; `pd.io.parsers` is an internal
# module path and should not be relied on.
df = pd.read_csv('datasets/wine_data.csv', header=None, usecols=[0, 1, 2])

# Replace the positional integer column labels with descriptive names.
df.columns = ['Class label', 'Alcohol', 'Malic acid']

df.head()

Unnamed: 0,Class label,Alcohol,Malic acid
0,1,14.23,1.71
1,1,13.2,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59


In [3]:
# Standardize the two feature columns of the full frame and report the
# resulting per-column mean and standard deviation.
feature_cols = ['Alcohol', 'Malic acid']
std_scale = preprocessing.StandardScaler()
df_std = std_scale.fit_transform(df[feature_cols])

# Column 0 is Alcohol, column 1 is Malic acid (same order as feature_cols).
alcohol_std = df_std[:, 0]
malic_std = df_std[:, 1]

mean_report = 'Mean after standardization:\nAlcohol={:.2f}, Malic acid={:.2f}'
std_report = '\nStandard deviation after standardization:\nAlcohol={:.2f}, Malic acid={:.2f}'
print(mean_report.format(alcohol_std.mean(), malic_std.mean()))
print(std_report.format(alcohol_std.std(), malic_std.std()))

Mean after standardization:
Alcohol=-0.00, Malic acid=-0.00

Standard deviation after standardization:
Alcohol=1.00, Malic acid=1.00


In [4]:
# Min-max scale the two feature columns of the full frame and report the
# resulting per-column minimum and maximum.
feature_cols = ['Alcohol', 'Malic acid']
minmax_scale = preprocessing.MinMaxScaler()
df_minmax = minmax_scale.fit_transform(df[feature_cols])

# Column 0 is Alcohol, column 1 is Malic acid (same order as feature_cols).
alcohol_minmax = df_minmax[:, 0]
malic_minmax = df_minmax[:, 1]

min_report = 'Min-value after min-max scaling:\nAlcohol={:.2f}, Malic acid={:.2f}'
max_report = '\nMax-value after min-max scaling:\nAlcohol={:.2f}, Malic acid={:.2f}'
print(min_report.format(alcohol_minmax.min(), malic_minmax.min()))
print(max_report.format(alcohol_minmax.max(), malic_minmax.max()))

Min-value after min-max scaling:
Alcohol=0.00, Malic acid=0.00

Max-value after min-max scaling:
Alcohol=1.00, Malic acid=1.00


In [5]:
# Split the frame's underlying array into the label column (index 0) and
# the feature columns (index 1 onward).
wine_values = df.values
y_wine = wine_values[:, 0]
X_wine = wine_values[:, 1:]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_wine, y_wine, test_size=0.30, random_state=12345)

In [6]:
# Fit the standardizer on the training split only, then apply the same
# fitted scaler to the test split.
std_scale = preprocessing.StandardScaler()
X_train_std = std_scale.fit_transform(X_train)
X_test_std = std_scale.transform(X_test)

In [7]:
# Fit the min-max scaler on the training split only, then apply that same
# fitted scaler to both splits so the test data does not influence the
# learned minimum/maximum.
# Fix: the transforms previously called `std_scale.transform`, which made the
# "normalized" arrays identical to the standardized ones and left
# `norm_scale` unused.
norm_scale = preprocessing.MinMaxScaler().fit(X_train)
X_train_norm = norm_scale.transform(X_train)
X_test_norm = norm_scale.transform(X_test)

# Using Gaussian Naive Bayes

In [8]:
# Train one Gaussian Naive Bayes model per feature representation so their
# accuracies can be compared: raw, standardized, and normalized inputs.
# `fit` returns the estimator itself, so construction and fitting are chained.
gnb = GaussianNB().fit(X_train, y_train)                # raw features
gnb_std = GaussianNB().fit(X_train_std, y_train)        # standardized features
gnb_norm = GaussianNB().fit(X_train_norm, y_train)      # normalized features

# Leave the last fitted estimator as the cell's displayed value.
gnb_norm

GaussianNB(priors=None)

In [12]:
# Accuracy of the Naive Bayes model that was fitted on the raw features,
# measured on both the training and the test split.
y_pred_train = gnb.predict(X_train)
y_pred_test = gnb.predict(X_test)

train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred_test)

print('\nPrediction accuracy for the training dataset')
print('{:.2%}'.format(train_accuracy))
print('\nPrediction accuracy for the test dataset')
print('{:.2%}\n'.format(test_accuracy))


Prediction accuracy for the training dataset
80.65%

Prediction accuracy for the test dataset
77.78%



In [13]:
# Accuracy of the Naive Bayes model that was fitted on the standardized
# features, measured on both the training and the test split.
y_pred_train_std = gnb_std.predict(X_train_std)
y_pred_test_std = gnb_std.predict(X_test_std)

train_accuracy_std = metrics.accuracy_score(y_train, y_pred_train_std)
test_accuracy_std = metrics.accuracy_score(y_test, y_pred_test_std)

print('\nPrediction accuracy for the training dataset')
print('{:.2%}'.format(train_accuracy_std))
print('\nPrediction accuracy for the test dataset')
print('{:.2%}\n'.format(test_accuracy_std))


Prediction accuracy for the training dataset
80.65%

Prediction accuracy for the test dataset
77.78%



In [14]:
# Accuracy of the Naive Bayes model that was fitted on X_train_norm,
# measured on both the training and the test split.
y_pred_train_norm = gnb_norm.predict(X_train_norm)
y_pred_test_norm = gnb_norm.predict(X_test_norm)

train_accuracy_norm = metrics.accuracy_score(y_train, y_pred_train_norm)
test_accuracy_norm = metrics.accuracy_score(y_test, y_pred_test_norm)

print('Prediction accuracy for the training dataset')
print('{:.2%}'.format(train_accuracy_norm))
print('Prediction accuracy for the test dataset')
print('{:.2%}'.format(test_accuracy_norm))


Prediction accuracy for the training dataset
80.65%
Prediction accuracy for the test dataset
77.78%


# Using kNN

In [29]:
# on non-standardized data
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# on standardized data
knn_std = KNeighborsClassifier()
knn_std.fit(X_train_std, y_train)

# on normalized data
knn_norm = KNeighborsClassifier()
knn_norm.fit(X_train_norm, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [30]:
# Accuracy of the kNN model that was fitted on the raw features, measured on
# both the training and the test split.
y_pred_train = knn.predict(X_train)
y_pred_test = knn.predict(X_test)

train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred_test)

print('\nPrediction accuracy for the training dataset')
print('{:.2%}'.format(train_accuracy))
print('\nPrediction accuracy for the test dataset')
print('{:.2%}\n'.format(test_accuracy))


Prediction accuracy for the training dataset
85.48%

Prediction accuracy for the test dataset
75.93%



In [28]:
# Display the unscaled training feature matrix (the X_train array returned by
# train_test_split); each row is [Alcohol, Malic acid].
X_train

array([[ 11.46,   3.74],
       [ 12.37,   1.07],
       [ 12.69,   1.53],
       [ 12.47,   1.52],
       [ 13.63,   1.81],
       [ 13.9 ,   1.68],
       [ 12.99,   1.67],
       [ 12.7 ,   3.87],
       [ 14.83,   1.64],
       [ 14.75,   1.73],
       [ 12.29,   3.17],
       [ 11.84,   0.89],
       [ 13.05,   2.05],
       [ 13.73,   4.36],
       [ 12.34,   2.45],
       [ 13.86,   1.35],
       [ 12.16,   1.61],
       [ 12.33,   1.1 ],
       [ 13.16,   3.57],
       [ 13.36,   2.56],
       [ 12.67,   0.98],
       [ 12.52,   2.43],
       [ 14.21,   4.04],
       [ 12.21,   1.19],
       [ 13.86,   1.51],
       [ 12.37,   1.63],
       [ 12.85,   3.27],
       [ 12.37,   1.21],
       [ 12.51,   1.73],
       [ 12.82,   3.37],
       [ 11.62,   1.99],
       [ 12.53,   5.51],
       [ 13.05,   1.65],
       [ 13.94,   1.73],
       [ 13.11,   1.01],
       [ 14.39,   1.87],
       [ 12.87,   4.61],
       [ 14.37,   1.95],
       [ 13.68,   1.83],
       [ 13.73,   1.5 ],
