In [1]:
import warnings
# Silence every warning for the rest of the notebook.
# filterwarnings() inserts each rule at the front of the filter list, so a
# single catch-all 'ignore' rule is sufficient; any rule registered before it
# (such as 'always') would never be consulted.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the glass dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data/glass.csv")
# Show (n_rows, n_columns) as the cell output.
df.shape

(214, 10)

___
## Method 1: drop rows with more than two IQR outliers

In [None]:
# Detect observations that are IQR outliers in more than `max_outliers` features
from collections import Counter
def outlier_hunt(df, max_outliers=2, iqr_factor=1.5):
    """
    Return the index labels of rows that are outliers in too many features.

    A value counts as an outlier for its column when it lies below
    Q1 - iqr_factor * IQR or above Q3 + iqr_factor * IQR, where Q1/Q3 are
    the column's 25th/75th percentiles and IQR = Q3 - Q1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric features.
    max_outliers : int, default 2
        A row is returned when strictly more than this many of its
        features are outliers.
    iqr_factor : float, default 1.5
        Multiplier applied to the IQR to obtain the outlier step.

    Returns
    -------
    list
        Index labels of the offending rows, ordered by the first time each
        label is seen while scanning the columns left to right.
    """
    outlier_counts = Counter()

    # iterate over features (columns)
    for col in df.columns:
        values = df[col]

        # nanpercentile ignores missing values; with plain percentile a single
        # NaN makes both quartiles NaN and no outlier is ever detected.
        q1, q3 = np.nanpercentile(values, [25, 75])

        # outlier step derived from the interquartile range
        outlier_step = iqr_factor * (q3 - q1)

        # NaN entries compare False on both sides, so they are never flagged
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)

        # one count per (row, column) pair that is an outlier
        outlier_counts.update(df.index[is_outlier.values])

    # keep only the rows that exceed the allowed number of outlying features
    return [label for label, count in outlier_counts.items() if count > max_outliers]

In [None]:
# Use every column except the last one as a feature name
# (presumably the last column is the "Type" label -- confirm against the CSV).
features = df.columns[:-1].tolist()
# Report how many rows outlier_hunt flags; the "2" in the message is hard-coded
# and must match the threshold outlier_hunt applies.
print('The dataset contains %d observations with more than 2 outliers' %(len(outlier_hunt(df[features]))))

In [None]:
# Recompute the flagged row labels, drop those rows and renumber the index from 0.
# NOTE(review): this rebinds `df`, so re-running the cell operates on the
# already-filtered frame rather than on the data as loaded.
outlier_indices = outlier_hunt(df[features])
df = df.drop(outlier_indices).reset_index(drop=True)
print(df.shape)

In [None]:
# Feature matrix: every column except the "Type" label.
X = df.drop("Type", axis = 1)
# Target vector: the "Type" column.
y = df["Type"]

In [None]:
# Display the (rows, columns) shape of the feature matrix.
X.shape

In [None]:
# Display the shape of the target vector; its length should equal the row count of X.
y.shape

In [None]:
# Preview the first rows of the feature matrix.
X.head()

____
## Method 2: drop rows with more than one value outside the per-column IQR fences

In [9]:
# Split into a feature frame and the "Type" label series.
# NOTE(review): `features` was a list of column names in the Method 1 cells and is
# rebound to a DataFrame here; `df` is whatever the kernel currently holds, so it
# may already be filtered if the Method 1 drop cell was run -- confirm run order.
features = df.drop("Type", axis = 1)
target = df["Type"]

In [10]:
def find_outlier_fences_IQR(df_in, col_name, iqr_factor=1.5):
    """
    Compute the lower and upper IQR outlier fences for one column.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame containing the column.
    col_name : str
        Name of the numeric column to analyse.
    iqr_factor : float, default 1.5
        Multiplier applied to the interquartile range to place the fences.

    Returns
    -------
    list
        [fence_low, fence_high], i.e.
        [Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR].
    """
    column = df_in[col_name]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    fence_offset = iqr_factor * (q3 - q1)  # q3 - q1 is the interquartile range
    return [q1 - fence_offset, q3 + fence_offset]

# [fence_low, fence_high] for every feature column.
fences = {
    column: find_outlier_fences_IQR(features, column)
    for column in features.columns.values
}
print(fences)

# Find rows in which more than one feature lies outside its fences.
# The comparison is vectorised over the whole frame instead of looping over
# rows; NaN values compare False on both sides and are never counted.
# NOTE: a column whose quartiles coincide gets identical low/high fences, so
# every value different from that quartile is counted as an outlier.
fence_table = pd.DataFrame(fences, index=["low", "high"])
outside_fences = (
    features.lt(fence_table.loc["low"], axis="columns")
    | features.gt(fence_table.loc["high"], axis="columns")
)
outliers_per_row = outside_fences.sum(axis="columns")
outliers_index = outliers_per_row[outliers_per_row > 1].index.tolist()

print("\n There are %d rows found with more than 1 outlier" %(len(outliers_index)))

{'RI': [1.5125700000000006, 1.5231099999999995], 'Na': [11.53125, 15.201250000000002], 'Mg': [-0.11250000000000071, 5.827500000000001], 'Al': [0.53, 2.29], 'Si': [71.06875, 74.29875000000001], 'K': [-0.6087499999999999, 1.34125], 'Ca': [6.841250000000001, 10.57125], 'Ba': [0.0, 0.0], 'Fe': [-0.15000000000000002, 0.25]}

 There are 35 rows found with more than 1 outlier


In [11]:
# Remove the flagged rows from both the features and the target using the same
# labels so the two stay aligned; the index is not reset, so gaps remain.
outliers_removed_featureset = features.drop(outliers_index)
outliers_removed_targetset = target.drop(outliers_index)