In [1]:
#Import Library for analyzing
import pandas as pd
import numpy as np
import requests
from io import StringIO
import math

In [2]:
#Read the whiskey CSV file from its url
#Keep it in whiskey (pandas DataFrame)
url = "https://github.com/Brunel-Visualization/Brunel/raw/master/python/examples/data/whiskey.csv"
#A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url, timeout=30)
#Fail loudly on an HTTP error instead of parsing an error page as CSV
response.raise_for_status()
s = response.text
whiskey = pd.read_csv(StringIO(s))

In [3]:
#Show the number of records as (rows, columns)
whiskey.shape

(283, 8)

In [4]:
#Show the data type of each attribute
whiskey.dtypes

Name         object
Rating      float64
Country      object
Category     object
Price       float64
ABV         float64
Age         float64
Brand        object
dtype: object

In [5]:
#Describe basic statistics for each numeric attribute
#(a blank-line separator is printed after every summary)
for column in ('Rating','Price','ABV','Age'):
    summary = whiskey[column].describe()
    print(summary)
    print('\n')

count    272.000000
mean      84.474265
std       11.877887
min       40.000000
25%       80.000000
50%       88.000000
75%       94.000000
max      100.000000
Name: Rating, dtype: float64


count    279.000000
mean      72.483871
std       83.992242
min        2.000000
25%       30.000000
50%       50.000000
75%       80.000000
max      850.000000
Name: Price, dtype: float64


count    270.000000
mean      44.610444
std        5.883056
min       35.500000
25%       40.000000
50%       43.000000
75%       46.000000
max       68.200000
Name: ABV, dtype: float64


count    174.000000
mean      14.339080
std        6.322267
min        0.000000
25%       10.000000
50%       14.500000
75%       18.000000
max       40.000000
Name: Age, dtype: float64




In [6]:
#Describe basic statistics for each nominal attribute
#(a blank-line separator is printed after every summary)
for column in ('Name','Country','Category','Brand'):
    summary = whiskey[column].describe()
    print(summary)
    print('\n')

count                                283
unique                               283
top       Wild Turkey Rare Breed Bourbon
freq                                   1
Name: Name, dtype: object


count          283
unique           9
top       Scotland
freq           108
Name: Country, dtype: object


count         283
unique         15
top       Blended
freq           73
Name: Category, dtype: object


count         283
unique        153
top       Kavalan
freq            7
Name: Brand, dtype: object




In [8]:
#Count the missing values of each attribute
whiskey.isnull().sum()

Name          0
Rating       11
Country       0
Category      0
Price         4
ABV          13
Age         109
Brand         0
dtype: int64

In [9]:
#Convert the pandas DataFrame into a NumPy array
x = whiskey.values

In [10]:
#Cut attribute Name because it is unique for every record
#Built from whiskey by column name (not from x) so that re-running this
#cell cannot strip a second column and shift the positions used later
x = whiskey.drop(columns=['Name']).values

In [11]:
#Show column Rating (position 0 of x once Name has been cut)
x[:,0]

array([40.0, 43.0, 47.0, 47.0, 48.0, 50.0, 53.0, 54.0, 54.0, 54.0, 56.0,
       56.0, 57.0, 57.0, 58.0, 60.0, 60.0, 64.0, 64.0, 65.0, 65.0, 67.0,
       67.0, 67.0, 67.0, 67.0, 68.0, 68.0, 69.0, 69.0, 69.0, 69.0, 69.0,
       70.0, 70.0, 71.0, 71.0, 72.0, 73.0, 73.0, 74.0, 74.0, 74.0, 74.0,
       75.0, 76.0, 76.0, 76.0, 76.0, 76.0, 76.0, 77.0, 77.0, 77.0, 77.0,
       77.0, 78.0, 78.0, 78.0, 78.0, 79.0, 79.0, 79.0, 79.0, 80.0, 80.0,
       80.0, 80.0, 80.0, 80.0, 81.0, 81.0, 81.0, 81.0, 81.0, 81.0, 81.0,
       81.0, 81.0, 81.0, 81.0, 81.0, 81.0, 81.0, 81.0, 81.0, 81.0, 81.0,
       82.0, 82.0, 82.0, 82.0, 82.0, 82.0, 82.0, 82.0, 83.0, 83.0, 83.0,
       83.0, 83.0, 83.0, 83.0, 83.0, 84.0, 84.0, 84.0, 84.0, 84.0, 84.0,
       85.0, 85.0, 85.0, 85.0, 85.0, 85.0, 85.0, 85.0, 86.0, 86.0, 86.0,
       86.0, 86.0, 86.0, 86.0, 86.0, 86.0, 87.0, 87.0, 87.0, 87.0, 87.0,
       87.0, 87.0, 87.0, 88.0, 88.0, 88.0, 88.0, 88.0, 88.0, 88.0, 88.0,
       88.0, 88.0, 88.0, 88.0, 88.0, 88.0, 88.0, 88

In [12]:
#Find median of each numeric attribute
#Columns are selected by name instead of by position in x, so the result
#does not depend on how many columns have been cut from x.
#Series.median() skips missing values (NaN) by default.
#(med_avb holds the ABV median; the name is kept because later cells use it)
med_rating = whiskey['Rating'].median()
med_price = whiskey['Price'].median()
med_avb = whiskey['ABV'].median()
med_age = whiskey['Age'].median()

In [13]:
#Create new attributes where each missing value is replaced with the
#median of its attribute
#Columns are selected by name instead of by position in x, so the result
#does not depend on how many columns have been cut from x.
#.tolist() keeps the results as plain Python lists, as before.
rating = whiskey['Rating'].fillna(med_rating).tolist()
price = whiskey['Price'].fillna(med_price).tolist()
avb = whiskey['ABV'].fillna(med_avb).tolist()
age = whiskey['Age'].fillna(med_age).tolist()

In [14]:
#Detecting Outlier
def outlier_ei(x, factor=1.5):
    """Return the lower and upper outlier fences of x (IQR rule).

    The fences are Q1 - factor*IQR and Q3 + factor*IQR, where Q1 and Q3
    are the 25th and 75th percentiles of x and IQR = Q3 - Q1.

    Parameters
    ----------
    x : sequence or array of numbers
        Values to analyse; passed straight to np.percentile.
    factor : float, optional
        Multiplier applied to the IQR. The default 1.5 reproduces the
        previous hard-coded behaviour.

    Returns
    -------
    tuple
        (lower_outlier, higher_outlier)
    """
    #Both quartiles come from a single percentile call
    q1, q3 = np.percentile(x, [25, 75])
    margin = factor * (q3 - q1)
    return q1 - margin, q3 + margin

In [15]:
#Removing Outlier
def del_outlier(x):
    """Return a list of the values of x that lie within the outlier fences.

    The fences come from outlier_ei(x). A value is kept when it is
    greater than or equal to the lower fence and less than or equal to
    the upper fence; the original order is preserved.
    """
    low, high = outlier_ei(x)
    kept = []
    for value in x:
        if low <= value <= high:
            kept.append(value)
    return kept

In [16]:
#Create new attributes keeping only values with lower_outlier <= x <= higher_outlier
#(each attribute is filtered on its own, so the lists may differ in length)
new_rating, new_price, new_avb, new_age = [
    del_outlier(values) for values in (rating, price, avb, age)
]