In [3]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#### Common ways of detecting outliers:
##### i.) Using a scatter plot
##### ii.) Using a box plot
##### iii.) Using the Z-score
##### iv.) Using the interquartile range (IQR)

In [2]:
# Example data: mostly values between 10 and 19, plus three much larger
# values (102, 107, 108). Note that `df` is a plain Python list here.
df = [
    11, 10, 12, 14, 12, 15, 14, 13, 15, 102,
    12, 14, 17, 19, 107, 10, 13, 12, 14, 12,
    108, 12, 11, 14, 13, 15, 10, 15, 12, 10,
    14, 13, 15, 10,
]

### Detecting Outliers using Z-score
#### $z = (x - \mu) / \sigma$, where $\mu$ is the mean and $\sigma$ is the standard deviation of the data

In [1]:
def detect_outliers(data, threshold=3):
    """Return the values in ``data`` whose absolute z-score exceeds ``threshold``.

    The z-score of a value ``x`` is ``(x - mean) / sd``, where ``mean`` and
    ``sd`` are the mean and the population standard deviation (NumPy's
    default, ``ddof=0``) of ``data``.

    Parameters
    ----------
    data : iterable of numbers
        The sample to scan. It is not modified.
    threshold : float, optional
        A value is flagged when ``abs(z) > threshold``. Defaults to 3.

    Returns
    -------
    list
        The flagged values, in their original order. A new list is built on
        every call, so repeated calls never accumulate earlier results.
        Empty input, or input with zero spread, yields an empty list.
    """
    items = list(data)
    if not items:
        # Nothing to score; also avoids NumPy's "mean of empty slice" warning.
        return []

    values = np.asarray(items, dtype=float)
    mean = values.mean()
    sd = values.std()
    if sd == 0:
        # All values are identical: z-scores are undefined (0 / 0), so
        # nothing can be an outlier.
        return []

    z_scores = np.abs((values - mean) / sd)
    # Return the original elements (not the float copies) to keep their types.
    return [item for item, z in zip(items, z_scores) if z > threshold]

In [4]:
# Run the z-score detector on the sample data; the bare name on the last
# line makes the notebook display the flagged values.
outliers = detect_outliers(df)
outliers

[102, 107, 108]

In [5]:
# Display the sample list; detect_outliers only reads it, so it is unchanged.
df

[11,
 10,
 12,
 14,
 12,
 15,
 14,
 13,
 15,
 102,
 12,
 14,
 17,
 19,
 107,
 10,
 13,
 12,
 14,
 12,
 108,
 12,
 11,
 14,
 13,
 15,
 10,
 15,
 12,
 10,
 14,
 13,
 15,
 10]

### Detecting Outliers using Interquartile Range (IQR)
#### IQR = Q3 - Q1, i.e. the range between the 75th and 25th percentiles of the dataset
#### Steps:
##### 1.) Arrange the data in increasing order
##### 2.) Calculate the first (q1) and third (q3) quartiles
##### 3.) Find the interquartile range: IQR = q3 - q1
##### 4.) Find the lower bound = q1 - 1.5 * IQR
##### 5.) Find the upper bound = q3 + 1.5 * IQR

#### Anything that lies below the lower bound or above the upper bound is an outlier.

In [8]:
# Step 1: sort ascending. sorted() returns a new list and `df` is rebound to
# it, so the original ordering is no longer available after this cell.
df = sorted(df)

In [9]:
# Display the sorted list; the three largest values now sit at the end.
df

[10,
 10,
 10,
 10,
 10,
 11,
 11,
 12,
 12,
 12,
 12,
 12,
 12,
 12,
 13,
 13,
 13,
 13,
 14,
 14,
 14,
 14,
 14,
 14,
 15,
 15,
 15,
 15,
 15,
 17,
 19,
 102,
 107,
 108]

In [10]:
# Step 2: first and third quartiles, i.e. the 25th and 75th percentiles.
q1, q3 = np.percentile(df, q=(25, 75))

In [11]:
# Print both quartiles on one line.
print(q1, q3)

12.0 15.0


In [12]:
# Step 3: interquartile range, the distance between the third and first
# quartiles. The bare name on the last line displays the value.
iqr = q3-q1
iqr

3.0

In [16]:
# Steps 4 and 5: outlier fences placed 1.5 * IQR below q1 and above q3.
lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

In [17]:
# Display the lower fence (q1 - 1.5 * iqr).
lower_bound

7.5

In [18]:
# Display the upper fence (q3 + 1.5 * iqr).
upper_bound

19.5