In [23]:
from sklearn.datasets import load_wine
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

In [2]:
wine = load_wine()
print(type(wine))
wine.keys()

<class 'sklearn.utils._bunch.Bunch'>


dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [3]:
print(wine['feature_names'])
print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [4]:
X = wine.data
y = wine.target

In [5]:
print(wine.target_names)

['class_0' 'class_1' 'class_2']


In [6]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0.98  3.88    2.29  0.63
    Fl

In [7]:
X = wine.data
y = wine.target
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1)

In [8]:
model = KNeighborsClassifier()
model.fit(X_train,y_train)
pred_train = model.predict(X_train)
pred_test = model.predict(X_test)

print("Train Score : ", accuracy_score(y_train,pred_train))
print("Test Score : ", accuracy_score(y_test,pred_test))

Train Score :  0.8270676691729323
Test Score :  0.6888888888888889


In [9]:
# Faeture MaxAbScaler
sc = MaxAbsScaler()
X_train_new = sc.fit_transform(X_train)
X_test_new = sc.transform(X_test)

model = KNeighborsClassifier()
model.fit(X_train_new,y_train)

pred_train = model.predict(X_train_new)
pred_test = model.predict(X_test_new)


print("Train Score : ", accuracy_score(y_train,pred_train))
print("Test Score : ", accuracy_score(y_test,pred_test))

Train Score :  0.9624060150375939
Test Score :  0.9555555555555556


In [10]:
# Faeture MinMaxScaler
sc = MinMaxScaler()
X_train_new = sc.fit_transform(X_train)
X_test_new = sc.transform(X_test)

model = KNeighborsClassifier()
model.fit(X_train_new,y_train)

pred_train = model.predict(X_train_new)
pred_test = model.predict(X_test_new)


print("Train Score : ", accuracy_score(y_train,pred_train))
print("Test Score : ", accuracy_score(y_test,pred_test))

Train Score :  0.9774436090225563
Test Score :  0.9777777777777777


### Feature Selection

**ANOVA** ------ analysis of variance test

In [11]:
from sklearn.feature_selection import f_classif

In [12]:
fvalue,pvalue = f_classif(X,y)

In [13]:
fvalue

array([135.07762424,  36.94342496,  13.3129012 ,  35.77163741,
        12.42958434,  93.73300962, 233.92587268,  27.57541715,
        30.27138317, 120.66401844, 101.31679539, 189.97232058,
       207.9203739 ])

In [14]:
print(wine.feature_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [15]:
X1 = X[:,[0,5,6,9,10,11,12]]
y = wine.target
X1_train,X1_test,y_train,y_test = train_test_split(X1,y,random_state=1)


In [16]:
# Faeture MinMaxScaler
sc = MinMaxScaler()
X1_train_new = sc.fit_transform(X1_train)
X1_test_new = sc.transform(X1_test)

model = KNeighborsClassifier()
model.fit(X1_train_new,y_train)

pred_train = model.predict(X1_train_new)
pred_test = model.predict(X1_test_new)


print("Train Score : ", accuracy_score(y_train,pred_train))
print("Test Score : ", accuracy_score(y_test,pred_test))

Train Score :  0.9924812030075187
Test Score :  0.9777777777777777


In [17]:
X1_train_new.shape

(133, 7)

### Atuomatic Feature Selection

In [18]:
from sklearn.feature_selection import SelectKBest

In [25]:
skb = SelectKBest(score_func=f_classif,k=12)
X_new = skb.fit_transform(X,y)
X_new.shape



y = wine.target
X_new
X_new_train,X_new_test,y_train,y_test = train_test_split(X_new,y,random_state=1)

sc = StandardScaler()
X_new_train_new = sc.fit_transform(X_new_train)
X_new_test_new = sc.transform(X_new_test)

model = KNeighborsClassifier()
model.fit(X_new_train_new,y_train)

pred_train = model.predict(X_new_train_new)
pred_test = model.predict(X_new_test_new)


print("Train Score : ", accuracy_score(y_train,pred_train))
print("Test Score : ", accuracy_score(y_test,pred_test))

Train Score :  0.9774436090225563
Test Score :  0.9777777777777777


In [20]:
skb.scores_  # imporant feature value

array([135.07762424,  36.94342496,  13.3129012 ,  35.77163741,
        12.42958434,  93.73300962, 233.92587268,  27.57541715,
        30.27138317, 120.66401844, 101.31679539, 189.97232058,
       207.9203739 ])

Train Score :  0.9624060150375939
Test Score :  0.9777777777777777
