In [321]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import math
import csv
import sklearn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import tree
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz

In [322]:
#Read CSV file from the url to DataFrame
url = "https://github.com/Brunel-Visualization/Brunel/raw/master/python/examples/data/whiskey.csv"
s = requests.get(url).text
wk = pd.read_csv(StringIO(s))

In [323]:
wk.describe()

Unnamed: 0,Rating,Price,ABV,Age
count,272.0,279.0,270.0,174.0
mean,84.474265,72.483871,44.610444,14.33908
std,11.877887,83.992242,5.883056,6.322267
min,40.0,2.0,35.5,0.0
25%,80.0,30.0,40.0,10.0
50%,88.0,50.0,43.0,14.5
75%,94.0,80.0,46.0,18.0
max,100.0,850.0,68.2,40.0


In [324]:
wk.head()

Unnamed: 0,Name,Rating,Country,Category,Price,ABV,Age,Brand
0,Canadian Hunter Canadian Whisky,40.0,Canada,Blended,9.0,40.0,,Canadian Hunter
1,Canadian LTD Blended Canadian Whiskey,43.0,Canada,Blended,10.0,,,Canadian LTD
2,Kellan Irish Whiskey,47.0,Ireland,Blended,20.0,40.0,,Kellan
3,Rich & Rare Canadian Whisky,47.0,Canada,Blended,10.0,,,Rich & Rare
4,Canadian Mist Blended Canadian Whisky,48.0,Canada,Blended,12.0,40.0,,Canadian Mist


# Fix phase1
* delete name, brand columns
* also delete age because it has missing values around 4x%
* drop columns that have missing values

In [325]:
wk = wk.drop('Name', axis=1)
wk = wk.drop('Age', axis=1)
wk = wk.drop('Brand', axis=1)
wk = wk.dropna()

In [326]:
wk.head()

Unnamed: 0,Rating,Country,Category,Price,ABV
0,40.0,Canada,Blended,9.0,40.0
2,47.0,Ireland,Blended,20.0,40.0
4,48.0,Canada,Blended,12.0,40.0
6,53.0,Canada,Blended,12.0,40.0
8,54.0,USA,Blended,20.0,40.0


In [327]:
data = wk.values

# Convert nominal attributes to numeric

In [328]:
data[:,1] = LabelEncoder().fit_transform(data[:,1])
data[:,2] = LabelEncoder().fit_transform(data[:,2])

# From phase 1

In [329]:
def outlier(x):
    q1, q3 = np.percentile(x,[25, 75])
    iqr = q3 - q1
    cut = 1.5 * iqr  
    return [i for i in range(x.shape[0]) if x[i] < q1-cut or x[i]>q3+cut]

def all_out(x):
    idx = []
    for i in range(x.shape[1]):
        idx = np.concatenate((idx, outlier(x[:,i])))
    return list(set(idx))

In [330]:
idx = all_out(data)
print(len(idx),idx)
data = np.delete(data, idx, axis=0)
print(data.shape)

49 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 132.0, 259.0, 133.0, 141.0, 256.0, 143.0, 257.0, 155.0, 156.0, 158.0, 159.0, 160.0, 162.0, 46.0, 174.0, 176.0, 51.0, 59.0, 60.0, 67.0, 195.0, 197.0, 199.0, 202.0, 208.0, 80.0, 210.0, 213.0, 214.0, 89.0, 220.0, 97.0, 227.0, 100.0, 229.0, 232.0, 254.0, 125.0, 126.0]
(211, 5)


  This is separate from the ipykernel package so we can avoid doing imports until


# Min-Max Normalization

In [331]:
data = MinMaxScaler().fit_transform(data)
print(data)

[[0.         0.         0.         0.14728682 0.234375  ]
 [0.         0.5        0.         0.06976744 0.234375  ]
 [0.02777778 0.5        0.         0.10077519 0.234375  ]
 ...
 [0.97222222 1.         0.07142857 0.36434109 0.9375    ]
 [0.97222222 0.75       0.57142857 0.53488372 0.97395833]
 [1.         0.5        0.85714286 0.47286822 0.546875  ]]




# Convert numeric rating to nominal ("low", "medium", "high")

In [332]:
"""rate_min = data[:,0].min()
rate_max = data[:,0].max()
rate_range = rate_max - rate_min
print(rate_min, rate_max, rate_range)
label = np.array(['low' if d <= rate_min+rate_range/3  else 'medium' if d <= rate_min+2*rate_range/3 else 'high' for d in data[:,0]])"""

"rate_min = data[:,0].min()\nrate_max = data[:,0].max()\nrate_range = rate_max - rate_min\nprint(rate_min, rate_max, rate_range)\nlabel = np.array(['low' if d <= rate_min+rate_range/3  else 'medium' if d <= rate_min+2*rate_range/3 else 'high' for d in data[:,0]])"

In [333]:
label = np.array(['low' if d <= 1/3  else 'medium' if d <= 2/3 else 'high' for d in data[:,0]])

# Split x, y to both x_train, y_train, x_test, y_test

In [334]:
x = data[:,1:]
y = label

x_train, x_test, y_train, y_test = train_test_split(x, y,test_size=0.2, shuffle=True)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(168, 4) (168,)
(43, 4) (43,)


# Cart decision tree
* set max depth = 3 to reduce overfitting

In [335]:
cart = tree.DecisionTreeClassifier(criterion="gini", max_depth=3)
cart.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

# Write tree to cart.txt

In [336]:
export_graphviz(cart, "cart.txt")

# Data which incorrectly predict (no need to report, just only tree picture)

In [337]:
count = 0
for i in range(x_test.shape[0]):
    if cart.predict(x_test[i].reshape(1, -1))[0] == y_test[i]:
        count += 1
    else:
        print(i,cart.predict(x_test[i].reshape(1, -1))[0], y_test[i])
        
print("acc:", count/x_test.shape[0])

0 medium high
4 medium low
7 medium high
8 low high
10 high medium
15 medium high
18 low medium
19 medium high
21 high medium
35 medium low
36 high medium
40 high medium
acc: 0.7209302325581395


# Actual labels of test set [low, med, high]

In [338]:
actual = [0, 0, 0]
for x in y_test:
    if x == 'low':
        actual[0] += 1
    elif x == 'medium':
        actual[1] += 1
    else:
        actual[2] += 1
print("actual:", actual)

actual: [3, 22, 18]


# Predict labels of test set [low, med, high]

In [339]:
count_predict_cart = [0, 0, 0]
for x in x_test:
    if cart.predict(x.reshape(1, -1)) == 'low':
        count_predict_cart[0] += 1
    elif cart.predict(x.reshape(1, -1)) == 'medium':
        count_predict_cart[1] += 1
    else:
        count_predict_cart[2] += 1
print("predict of cart:", count_predict_cart)

predict of cart: [3, 23, 17]


In [340]:
y_predict_cart = []
for x in x_test:
    y_predict_cart.append(cart.predict(x.reshape(1, -1))[0])

In [341]:
labels = ['low', 'medium', 'high']

# Create confusion matrix (Can do it in excel for more understandable)

In [342]:
confusion_mat = pd.DataFrame(confusion_matrix(y_test, y_predict_cart,labels=labels), columns=labels, index=labels)

# Row => Actual labels, Col => Predict labels

In [343]:
confusion_mat

Unnamed: 0,low,medium,high
low,1,2,0
medium,1,17,4
high,1,4,13


# Additional information about Cart classification

In [344]:
text_report_cart = sklearn.metrics.classification_report(y_test, y_predict_cart, labels=labels)
print(text_report_cart)

              precision    recall  f1-score   support

         low       0.33      0.33      0.33         3
      medium       0.74      0.77      0.76        22
        high       0.76      0.72      0.74        18

   micro avg       0.72      0.72      0.72        43
   macro avg       0.61      0.61      0.61        43
weighted avg       0.72      0.72      0.72        43



# Next, is about C4.5 (Same step as Cart)

In [345]:
c45 = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
c45.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [346]:
export_graphviz(c45, "c45.txt")

In [347]:
count = 0
for i in range(x_test.shape[0]):
    if c45.predict(x_test[i].reshape(1, -1))[0] == y_test[i]:
        count += 1
    else:
        print(i,c45.predict(x_test[i].reshape(1, -1))[0], y_test[i])
        
print("acc:", count/x_test.shape[0])

0 medium high
4 high low
5 low medium
7 medium high
8 medium high
10 high medium
15 medium high
18 low medium
19 medium high
21 high medium
35 medium low
36 high medium
40 high medium
acc: 0.6976744186046512


In [348]:
count_predict_c45 = [0, 0, 0]
for x in x_test:
    if c45.predict(x.reshape(1, -1)) == 'low':
        count_predict_c45[0] += 1
    elif c45.predict(x.reshape(1, -1)) == 'medium':
        count_predict_c45[1] += 1
    else:
        count_predict_c45[2] += 1
print("predict of c4.5:", count_predict_c45)

predict of c4.5: [3, 22, 18]


In [349]:
y_predict_c45 = []
for x in x_test:
    y_predict_c45.append(c45.predict(x.reshape(1, -1))[0])

In [350]:
c45_confusion_mat = pd.DataFrame(confusion_matrix(y_test, y_predict_c45,labels=labels), columns=labels, index=labels)

In [351]:
c45_confusion_mat

Unnamed: 0,low,medium,high
low,1,1,1
medium,2,16,4
high,0,5,13


In [352]:
text_report_c45 = sklearn.metrics.classification_report(y_test, y_predict_c45, labels=labels)
print(text_report_c45)

              precision    recall  f1-score   support

         low       0.33      0.33      0.33         3
      medium       0.73      0.73      0.73        22
        high       0.72      0.72      0.72        18

   micro avg       0.70      0.70      0.70        43
   macro avg       0.59      0.59      0.59        43
weighted avg       0.70      0.70      0.70        43

