In [1]:
import pandas as pd

# Read the student records from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='outliers_data.csv')
df.head(n=5)

Unnamed: 0,Student ID,Age
0,49,17
1,268,15
2,120,14
3,478,13
4,199,17


In [3]:
# Summary statistics for the numeric columns; the min/max of Age give a first
# hint of values that fall outside a plausible range.
df.describe()

Unnamed: 0,Student ID,Age
count,527.0,527.0
mean,263.0,16.00759
std,152.276065,9.156432
min,0.0,3.0
25%,131.5,13.0
50%,263.0,14.0
75%,394.5,16.0
max,526.0,75.0


In [4]:
# Order the ages ascending so the extreme values show up at both ends.
df['Age'].sort_values()

524     3
340     4
124     4
164     6
245    10
       ..
153    64
152    69
162    71
389    71
360    75
Name: Age, Length: 527, dtype: int64

## Identifying outliers when we know the range

In [5]:
# Rows whose age is strictly below 11, the lower limit used for the known range.
df[df['Age'].lt(11)]

Unnamed: 0,Student ID,Age
124,449,4
164,269,6
245,322,10
340,435,4
524,12,3


In [6]:
# Rows whose age is strictly above 18, the upper limit used for the known range.
df[df['Age'].gt(18)]

Unnamed: 0,Student ID,Age
8,385,52
99,171,63
108,259,38
117,44,48
152,414,69
153,427,64
162,128,71
189,4,60
197,221,52
210,353,40


## Identify outliers using standard deviation

In [7]:
# Keep only the two rows of the describe() table needed for the cutoffs:
# the per-column mean and standard deviation.
summaries = (
    df.describe()
      .loc[['mean', 'std'], :]
)
summaries

Unnamed: 0,Student ID,Age
mean,263.0,16.00759
std,152.276065,9.156432


In [8]:
# Cutoffs for the standard-deviation rule: an age counts as an outlier when it
# lies more than `n_std` standard deviations away from the mean.
# n_std = 1 reproduces the band used throughout this notebook; raise it
# (2 or 3 are common rules of thumb) for a more conservative filter.
n_std = 1

# Single .loc[row, column] lookups instead of chained indexing.
age_mean = summaries.loc['mean', 'Age']
age_std = summaries.loc['std', 'Age']

upper_bound = age_mean + n_std * age_std
lower_bound = age_mean - n_std * age_std

In [9]:
# Display the upper cutoff (mean of Age plus its standard deviation).
upper_bound

25.1640221051006

In [10]:
# Display the lower cutoff (mean of Age minus its standard deviation).
lower_bound

6.85115816055405

In [11]:
# Select every row whose age falls strictly outside [lower_bound, upper_bound].
violating_rows = df.loc[
    df['Age'].gt(upper_bound) | df['Age'].lt(lower_bound)
]
violating_rows

Unnamed: 0,Student ID,Age
8,385,52
99,171,63
108,259,38
117,44,48
124,449,4
152,414,69
153,427,64
162,128,71
164,269,6
189,4,60


## Remove outliers entirely from the dataset

In [12]:
# Row labels of the flagged rows; these labels are what gets passed to
# DataFrame.drop(index=...) in the following cells.
outliers_index = violating_rows.index
outliers_index

Int64Index([  8,  99, 108, 117, 124, 152, 153, 162, 164, 189, 197, 210, 232,
            277, 318, 340, 360, 366, 383, 388, 389, 411, 420, 444, 445, 504,
            510, 517, 523, 524],
           dtype='int64')

In [13]:
# drop() without inplace returns a filtered copy and leaves `df` untouched.
newdf = df.drop(labels=outliers_index, axis='index')
newdf.describe()

Unnamed: 0,Student ID,Age
count,497.0,497.0
mean,262.917505,14.158954
std,152.291698,1.939843
min,0.0,10.0
25%,131.0,12.0
50%,263.0,14.0
75%,394.0,16.0
max,526.0,17.0


In [14]:
# Re-check the original frame: the non-inplace drop above returned a new
# object, so `df` still reports all of its original rows.
df.describe()

Unnamed: 0,Student ID,Age
count,527.0,527.0
mean,263.0,16.00759
std,152.276065,9.156432
min,0.0,3.0
25%,131.5,13.0
50%,263.0,14.0
75%,394.5,16.0
max,526.0,75.0


## Drop method with `inplace = True`

In [15]:
# Remove the outlier rows from `df` itself; with inplace=True nothing is
# returned and the original frame is modified.
# errors='ignore' keeps the cell safe to re-run: once the labels are gone, a
# second drop would otherwise raise KeyError.
df.drop(index=outliers_index, inplace=True, errors='ignore')
df

Unnamed: 0,Student ID,Age
0,49,17
1,268,15
2,120,14
3,478,13
4,199,17
...,...,...
520,139,17
521,215,13
522,488,15
525,235,11


In [16]:
# After the in-place drop, `df` itself now reflects the filtered data.
df.describe()

Unnamed: 0,Student ID,Age
count,497.0,497.0
mean,262.917505,14.158954
std,152.291698,1.939843
min,0.0,10.0
25%,131.0,12.0
50%,263.0,14.0
75%,394.0,16.0
max,526.0,17.0
