# <center>Wine Data</center>

# Classification

## Data Understanding

In [1]:
# import libraries
import numpy as np
import pandas as pd
import sklearn.tree as tree
import sklearn.metrics as eval_m
import sklearn.preprocessing as pre_pro
import sklearn.feature_selection as f_selec
from sklearn.model_selection import train_test_split

In [2]:
# column names for the csv, in file order: the class label first, then the 13 measurements
features = [
    "bclass",
    "alcohol",
    "mallic_acid",
    "ash",
    "alcalinity_of_ash",
    "magnesium",
    "total_phenols",
    "flavanoids",
    "non_flavanoid_phenols",
    "proanthocyanins",
    "color_intensity",
    "hue",
    "od",
    "proline",
]

# data loading: the column names are supplied explicitly, so no row is read as a header
data = pd.read_csv("data/wine.csv", header=None, names=features)

In [3]:
# preview the first five rows of the raw frame
data.head(n=5)

Unnamed: 0,bclass,alcohol,mallic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,non_flavanoid_phenols,proanthocyanins,color_intensity,hue,od,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


### [Data Description](https://archive.ics.uci.edu/ml/datasets/wine)
These data are the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars. The analysis determined the quantities of 13 constituents found in each of the three types of wines. 

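I think that the initial data set had around 30 variables, but for some reason I only have the 13-dimensional version. The attributes are: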

1) Alcohol 
2) Malic acid 
3) Ash 
4) Alcalinity of ash 
5) Magnesium 
6) Total phenols 
7) Flavanoids 
8) Nonflavanoid phenols 
9) Proanthocyanins 
10) Color intensity 
11) Hue 
12) OD280/OD315 of diluted wines 
13) Proline 

## Task
`Classification Problem`

## Data Preparation

In [4]:
# summary statistics, transposed so that each feature is a row
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bclass,178.0,1.938202,0.775035,1.0,1.0,2.0,3.0,3.0
alcohol,178.0,13.000618,0.811827,11.03,12.3625,13.05,13.6775,14.83
mallic_acid,178.0,2.336348,1.117146,0.74,1.6025,1.865,3.0825,5.8
ash,178.0,2.366517,0.274344,1.36,2.21,2.36,2.5575,3.23
alcalinity_of_ash,178.0,19.494944,3.339564,10.6,17.2,19.5,21.5,30.0
magnesium,178.0,99.741573,14.282484,70.0,88.0,98.0,107.0,162.0
total_phenols,178.0,2.295112,0.625851,0.98,1.7425,2.355,2.8,3.88
flavanoids,178.0,2.02927,0.998859,0.34,1.205,2.135,2.875,5.08
non_flavanoid_phenols,178.0,0.361854,0.124453,0.13,0.27,0.34,0.4375,0.66
proanthocyanins,178.0,1.590899,0.572359,0.41,1.25,1.555,1.95,3.58


In [5]:
# distinct class labels present in the "bclass" column
data["bclass"].unique()

array([1, 2, 3])

In [6]:
# dependent or target feature
y = data["bclass"]

# independent features: every column except the target
X = data.drop(columns="bclass")
X_cols = X.columns.tolist()

In [7]:
# preview the first five rows of the independent features
X.head(n=5)

Unnamed: 0,alcohol,mallic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,non_flavanoid_phenols,proanthocyanins,color_intensity,hue,od,proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [8]:
# preview the first five values of the target feature
y.head(n=5)

0    1
1    1
2    1
3    1
4    1
Name: bclass, dtype: int64

### Feature Scaling
The purpose of feature scaling is to bring all features into the same range so that no feature dominates simply because of its units. We will use a min-max scaler.<br>

`MinMaxScaler()` : Transform features by scaling each feature to a given range.

In [9]:
# min max scaler : rescale every feature to the interval [0, 3]
# the interval is an arbitrary choice, any range could be used
scaled_range = (0, 3)
min_max_scaler = pre_pro.MinMaxScaler(feature_range=scaled_range)

In [10]:
# learn each column's min/max, then rescale the columns and write them back into X
# NOTE(review): the scaler is fitted on all rows before the train/test split further down,
# so the rows that later form the test set also influence the learned min/max
scaled_values = min_max_scaler.fit_transform(X)
X[X_cols] = scaled_values

In [11]:
# preview the first five rows after scaling
X.head(n=5)

Unnamed: 0,alcohol,mallic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,non_flavanoid_phenols,proanthocyanins,color_intensity,hue,od,proline
0,2.526316,0.575099,1.716578,0.773196,1.858696,1.882759,1.721519,0.849057,1.77918,1.116041,1.365854,2.912088,1.684023
1,1.713158,0.616601,1.251337,0.092784,0.978261,1.727586,1.531646,0.735849,0.823344,0.793515,1.390244,2.340659,1.651926
2,1.681579,0.960474,2.101604,1.237113,1.01087,1.882759,1.835443,0.962264,2.271293,1.12628,1.341463,2.087912,1.940799
3,2.636842,0.717391,1.828877,0.958763,1.402174,2.968966,1.993671,0.622642,1.675079,1.668942,0.926829,2.395604,2.57204
4,1.744737,1.096838,2.42246,1.608247,1.565217,1.882759,1.487342,1.471698,1.334385,0.778157,1.365854,1.824176,0.977889


In [12]:
# summary statistics of the scaled features, one feature per row
X.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
alcohol,178.0,1.555751,0.640916,0.0,1.051974,1.594737,2.090132,3.0
mallic_acid,178.0,0.946452,0.66234,0.0,0.511364,0.666996,1.388834,3.0
ash,178.0,1.614733,0.440124,0.0,1.363636,1.604278,1.921123,3.0
alcalinity_of_ash,178.0,1.375507,0.516427,0.0,1.020619,1.376289,1.685567,3.0
magnesium,178.0,0.969834,0.465733,0.0,0.586957,0.913043,1.206522,3.0
total_phenols,178.0,1.360461,0.647432,0.0,0.788793,1.422414,1.882759,3.0
flavanoids,178.0,1.069158,0.632189,0.0,0.547468,1.136076,1.60443,3.0
non_flavanoid_phenols,178.0,1.312381,0.704453,0.0,0.792453,1.188679,1.740566,3.0
proanthocyanins,178.0,1.11757,0.541665,0.0,0.794953,1.083596,1.457413,3.0
color_intensity,178.0,0.967088,0.593418,0.0,0.496587,0.872867,1.259386,3.0


The table above shows that every feature now has a minimum of 0 and a maximum of 3.

### Splitting Data

In [13]:
# hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Modelling

In [14]:
# model: decision tree splitting on entropy, limited to depth 4
# random_state is fixed because scikit-learn permutes the features at every split,
# so equally good splits could otherwise be chosen differently on each run
model = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=0)

# fit on the training split, then predict labels for the held-out split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

## Evaluation

In [15]:
# confusion matrix of true labels against predicted labels on the test split
eval_m.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[13,  1,  0],
       [ 1, 14,  1],
       [ 0,  0,  6]])

In [16]:
# fraction of test rows whose predicted label equals the true label
eval_m.accuracy_score(y_true=y_test, y_pred=y_pred)

0.9166666666666666

## Feature Selection | Filter Method - Variance

In [17]:
# work on independent copies so the scaled X and y stay untouched
X1, y1 = X.copy(), y.copy()

# per-feature variance of the scaled data, shown from smallest to largest
variance = X.var()
variance.sort_values(ascending=True)

ash                      0.193709
magnesium                0.216907
alcalinity_of_ash        0.266697
proanthocyanins          0.293400
hue                      0.310797
color_intensity          0.352145
flavanoids               0.399663
alcohol                  0.410773
total_phenols            0.419168
mallic_acid              0.438694
proline                  0.454059
non_flavanoid_phenols    0.496254
od                       0.608726
dtype: float64

Selecting the features whose variance is greater than 0.3.

In [18]:
# boolean mask: True for features whose variance exceeds 0.3
variance_3 = variance.gt(0.3)
variance_3

alcohol                   True
mallic_acid               True
ash                      False
alcalinity_of_ash        False
magnesium                False
total_phenols             True
flavanoids                True
non_flavanoid_phenols     True
proanthocyanins          False
color_intensity           True
hue                       True
od                        True
proline                   True
dtype: bool

In [19]:
# keep only the columns flagged by the variance mask
high_variance_cols = X1.columns[variance_3.to_numpy()]
X1 = X1[high_variance_cols]

In [20]:
# preview the first five rows of the retained features
X1.head(n=5)

Unnamed: 0,alcohol,mallic_acid,total_phenols,flavanoids,non_flavanoid_phenols,color_intensity,hue,od,proline
0,2.526316,0.575099,1.882759,1.721519,0.849057,1.116041,1.365854,2.912088,1.684023
1,1.713158,0.616601,1.727586,1.531646,0.735849,0.793515,1.390244,2.340659,1.651926
2,1.681579,0.960474,1.882759,1.835443,0.962264,1.12628,1.341463,2.087912,1.940799
3,2.636842,0.717391,2.968966,1.993671,0.622642,1.668942,0.926829,2.395604,2.57204
4,1.744737,1.096838,1.882759,1.487342,1.471698,0.778157,1.365854,1.824176,0.977889


In [21]:
# data still holds the target column, so subtract one to count only the features
n_features_before = data.shape[1] - 1
n_features_after = X1.shape[1]

print(f"Features before Feature Selection: {n_features_before}")
print(f"Features after Feature Selection: {n_features_after}")
print(f"Features eliminated after Feature Selection: {n_features_before - n_features_after}")

Features before Feature Selection: 13
Features after Feature Selection: 9
Features eliminated after Feature Selection: 4


### Splitting data

In [22]:
# same 80/20 split and seed as the baseline, now on the variance-filtered features
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.2, random_state=0)

### Modelling

In [23]:
# model: decision tree splitting on entropy, limited to depth 4
# random_state is fixed because scikit-learn permutes the features at every split,
# so equally good splits could otherwise be chosen differently on each run
model = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=0)

# fit on the training split, then predict labels for the held-out split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Evaluation

In [24]:
# confusion matrix of true labels against predicted labels on the test split
eval_m.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[13,  1,  0],
       [ 1, 14,  1],
       [ 0,  0,  6]])

In [25]:
# fraction of test rows whose predicted label equals the true label
eval_m.accuracy_score(y_true=y_test, y_pred=y_pred)

0.9166666666666666

### Conclusion

* Proper feature selection can increase the accuracy of the model; in this experiment the accuracy stayed at 0.9167 after dropping the four low-variance features, so the selection reduced dimensionality without hurting accuracy.
* After feature selection we have fewer features to work with, which helps train models faster.
* Variance-based feature selection can be implemented with pandas' `DataFrame.var()` or with `sklearn.feature_selection.VarianceThreshold`.

---

## Feature Selection | Filter Method | Anova F-Score

In [26]:
# independent copies of the scaled features and the target for the ANOVA experiment
X2, y2 = X.copy(), y.copy()

In [27]:
# the copy must have the same shape as the frame it was taken from
(X.shape, X2.shape)

((178, 13), (178, 13))

**The higher a feature's F-score, the more important the feature is.**

In [28]:
# f_classif() returns a pair: the ANOVA F-value and the p-value of each feature
f_score_p_value = f_selec.f_classif(X2, y2)
f_scores, p_values = f_score_p_value

print(f"Anova f-score:\n {f_scores}\n")
print(f"Anova p-value:\n {p_values}")

Anova f-score:
 [135.07762424  36.94342496  13.3129012   35.77163741  12.42958434
  93.73300962 233.92587268  27.57541715  30.27138317 120.66401844
 101.31679539 189.97232058 207.9203739 ]

Anova p-value:
 [3.31950380e-36 4.12722880e-14 4.14996797e-06 9.44447294e-14
 8.96339544e-06 2.13767002e-28 3.59858583e-50 3.88804090e-11
 5.12535874e-12 1.16200802e-33 5.91766222e-30 1.39310496e-44
 5.78316836e-47]


In [29]:
# keep the k = 11 features with the highest ANOVA F-scores
# NOTE(review): the selector is fitted on every row together with the labels, before the
# train/test split below, so the test rows take part in choosing the features
select_k_best = f_selec.SelectKBest(k=11, score_func=f_selec.f_classif)

# score the features first, then reduce X2 to the selected columns
select_k_best.fit(X2, y2)
transformed_features = select_k_best.transform(X2)

In [30]:
# boolean mask over the input columns, followed by the names of the kept columns
support_mask = select_k_best.get_support()
kept_names = select_k_best.get_feature_names_out()

print(support_mask, "\n")
print(kept_names)

[ True  True False  True False  True  True  True  True  True  True  True
  True] 

['alcohol' 'mallic_acid' 'alcalinity_of_ash' 'total_phenols' 'flavanoids'
 'non_flavanoid_phenols' 'proanthocyanins' 'color_intensity' 'hue' 'od'
 'proline']


In [31]:
# restrict X2 to the 11 columns chosen by the selector
top_feature_names = select_k_best.get_feature_names_out()
X2 = X2.loc[:, top_feature_names]

print(X2.shape)
X2.head(n=5)

(178, 11)


Unnamed: 0,alcohol,mallic_acid,alcalinity_of_ash,total_phenols,flavanoids,non_flavanoid_phenols,proanthocyanins,color_intensity,hue,od,proline
0,2.526316,0.575099,0.773196,1.882759,1.721519,0.849057,1.77918,1.116041,1.365854,2.912088,1.684023
1,1.713158,0.616601,0.092784,1.727586,1.531646,0.735849,0.823344,0.793515,1.390244,2.340659,1.651926
2,1.681579,0.960474,1.237113,1.882759,1.835443,0.962264,2.271293,1.12628,1.341463,2.087912,1.940799
3,2.636842,0.717391,0.958763,2.968966,1.993671,0.622642,1.675079,1.668942,0.926829,2.395604,2.57204
4,1.744737,1.096838,1.608247,1.882759,1.487342,1.471698,1.334385,0.778157,1.365854,1.824176,0.977889


### Splitting Data

In [32]:
# hold out 20% of the rows for testing
# NOTE(review): this split uses random_state=11 while the two earlier splits use
# random_state=0, so the test rows differ and the accuracy reported below is not
# directly comparable with the earlier 0.9167 figures
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=11,
)

In [33]:
# row/column counts of each piece of the split
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((142, 11), (36, 11), (142,), (36,))

### Modelling

In [34]:
# model: decision tree splitting on entropy, limited to depth 4
# random_state is fixed because scikit-learn permutes the features at every split,
# so equally good splits could otherwise be chosen differently on each run
model = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=0)

# fit on the training split, then predict labels for the held-out split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Evaluation

In [35]:
# mean accuracy of the fitted tree on the held-out split
model.score(X=X_test, y=y_test)

0.9722222222222222

In [36]:
# confusion matrix of true labels against predicted labels on the test split
eval_m.confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[14,  0,  0],
       [ 0, 14,  1],
       [ 0,  0,  7]])

In [37]:
# fraction of test rows whose predicted label equals the true label
eval_m.accuracy_score(y_true=y_test, y_pred=y_pred)

0.9722222222222222