In [71]:
import pandas as pd
import numpy as np
import requests
from io import StringIO
import math
import csv
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import tree

In [111]:
#Read CSV file from the url to DataFrame
url = "https://github.com/Brunel-Visualization/Brunel/raw/master/python/examples/data/whiskey.csv"
s = requests.get(url).text
wk = pd.read_csv(StringIO(s))

In [112]:
wk.describe()

Unnamed: 0,Rating,Price,ABV,Age
count,272.0,279.0,270.0,174.0
mean,84.474265,72.483871,44.610444,14.33908
std,11.877887,83.992242,5.883056,6.322267
min,40.0,2.0,35.5,0.0
25%,80.0,30.0,40.0,10.0
50%,88.0,50.0,43.0,14.5
75%,94.0,80.0,46.0,18.0
max,100.0,850.0,68.2,40.0


In [113]:
wk.head()

Unnamed: 0,Name,Rating,Country,Category,Price,ABV,Age,Brand
0,Canadian Hunter Canadian Whisky,40.0,Canada,Blended,9.0,40.0,,Canadian Hunter
1,Canadian LTD Blended Canadian Whiskey,43.0,Canada,Blended,10.0,,,Canadian LTD
2,Kellan Irish Whiskey,47.0,Ireland,Blended,20.0,40.0,,Kellan
3,Rich & Rare Canadian Whisky,47.0,Canada,Blended,10.0,,,Rich & Rare
4,Canadian Mist Blended Canadian Whisky,48.0,Canada,Blended,12.0,40.0,,Canadian Mist


In [114]:
wk = wk.drop('Name', axis=1)
wk = wk.drop('Age', axis=1)
wk = wk.drop('Brand', axis=1)
wk = wk.dropna()

In [125]:
wk.head()

Unnamed: 0,Rating,Country,Category,Price,ABV
0,40.0,Canada,Blended,9.0,40.0
2,47.0,Ireland,Blended,20.0,40.0
4,48.0,Canada,Blended,12.0,40.0
6,53.0,Canada,Blended,12.0,40.0
8,54.0,USA,Blended,20.0,40.0


In [126]:
data = wk.values

In [127]:
def outlier(x):
    rating_q1 = np.percentile(x,25)
    rating_q3 = np.percentile(x,75)
    IQR = rating_q3 - rating_q1
    lower_outlier = rating_q1 - (1.5*IQR)
    higher_outlier = rating_q3 + (1.5*IQR)
    return lower_outlier, higher_outlier

In [137]:
med0 = np.median(data[:,0])
med3 = np.median(data[:,3])
med4 = np.median(data[:,4])
print(med0, med3, med4)

l0, u0 = outlier(data[:,0])
l3, u3 = outlier(data[:,3])
l4, u4 = outlier(data[:,4])
print(l0, u0)
print(l3, u3)
print(l4, u4)

idx0 = [i for i in range(data.shape[0]) if data[i,0] < l0 or data[i,0] > u0]
idx3 = [i for i in range(data.shape[0]) if data[i,3] < l3 or data[i,3] > u3]
idx4 = [i for i in range(data.shape[0]) if data[i,4] < l4 or data[i,4] > u4]
idx = list(set(np.concatenate((idx0, idx3, idx4))))
data = np.delete(data,idx, axis=0)
print(data.shape)

0.7999999999999999 0.04875148632580261 0.22935779816513757
0.3583333333333336 1.2249999999999999
-0.0538049940546968 0.1673602853745541
-0.1376146788990822 0.5963302752293573
(211, 5)


In [138]:
data[:,1] = LabelEncoder().fit_transform(data[:,1])
data[:,2] = LabelEncoder().fit_transform(data[:,2])

In [139]:
data = MinMaxScaler().fit_transform(data)
print(data)

[[0.         0.         0.         0.14728682 0.234375  ]
 [0.         0.5        0.         0.06976744 0.234375  ]
 [0.02777778 0.5        0.         0.10077519 0.234375  ]
 ...
 [0.97222222 1.         0.07142857 0.36434109 0.9375    ]
 [0.97222222 0.75       0.57142857 0.53488372 0.97395833]
 [1.         0.5        0.85714286 0.47286822 0.546875  ]]


In [140]:
label = np.array(["low" if d <= 0.33 else "medium" if d <= 0.66 else "high" for d in data[:,0]])

In [141]:
x = data[:,1:]
y = label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=True)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(168, 4) (168,)
(43, 4) (43,)


In [142]:
clf = tree.DecisionTreeClassifier()
clf.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [143]:
count = 0
for i in range(x_test.shape[0]):
    if clf.predict(x_test[0].reshape(1, -1)) == y_test[i]:
        count += 1
    else:
        print(clf.predict(x_test[i].reshape(1, -1)), y_test[i])
        
print("acc:", count/x_test.shape[0])

['high'] medium
['medium'] medium
['low'] medium
['medium'] medium
['low'] low
['high'] medium
['high'] medium
['medium'] low
['medium'] low
['high'] medium
['high'] medium
['medium'] medium
['medium'] low
['high'] medium
['high'] low
['medium'] medium
['high'] medium
['high'] medium
['medium'] medium
acc: 0.5581395348837209
