# Detecting and Filtering Outliers

In [3]:
import pandas as pd
import numpy as np

In [4]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.head()

Unnamed: 0,0,1,2,3
0,-1.208924,0.515113,0.28162,-1.6167
1,-1.131881,0.827526,-0.314094,-0.182442
2,-0.819016,0.35849,0.202973,-2.164555
3,-0.161667,-0.448738,-1.654088,-0.077182
4,-0.16378,0.01106,1.713771,-0.352316


In [7]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.012822,-0.016613,0.011746,-0.013623
std,0.977568,0.992944,0.981228,1.011326
min,-2.864226,-3.042841,-3.56114,-3.100476
25%,-0.656826,-0.696695,-0.65021,-0.688026
50%,-0.016012,-0.018627,0.022571,0.003212
75%,0.677806,0.659649,0.704894,0.647192
max,3.425651,2.963175,4.070292,2.901707


Suppose you wanted to find values in one of the columns exceeding 3 in absolute
value:

In [10]:
col = data[2]
col.head()

0    0.797799
1   -1.083026
2    0.217140
3    0.906877
4   -0.064395
Name: 2, dtype: float64

In [11]:
col[np.abs(col) > 3]

171   -3.124339
613    4.070292
760    3.001952
770   -3.561140
Name: 2, dtype: float64

To select all rows having a value exceeding 3 or –3, you can use the 'any' method on a
boolean DataFrame:

In [12]:
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
8,3.261082,0.43112,0.134601,0.448197
80,-1.479233,-3.042841,1.60943,0.842992
171,0.451209,-0.272517,-3.124339,0.937215
459,-1.17568,0.51526,-0.757142,-3.100476
514,0.27637,0.92715,-0.068245,-3.093571
613,1.417604,-1.251585,4.070292,0.094839
760,-1.379709,-1.21117,3.001952,-1.265852
766,3.0533,1.293647,-1.45427,0.582991
770,1.231616,0.190205,-3.56114,0.355144
974,3.425651,-0.894065,0.669318,-2.782337


Values can be set based on these criteria. Here is code to cap values outside the interval –3 to 3:

In [13]:
data[np.abs(data) > 3] = np.sign(data) * 3

In [15]:
data.describe()
data

Unnamed: 0,0,1,2,3
0,-0.581561,-0.313535,0.797799,0.228165
1,0.333266,0.515854,-1.083026,1.191030
2,1.194900,-1.568695,0.217140,0.500528
3,2.145409,1.117278,0.906877,-0.061420
4,0.775462,-0.595675,-0.064395,-0.396552
...,...,...,...,...
995,-1.088103,-0.856632,-0.108632,0.430124
996,0.388906,-0.998899,1.552504,-1.398795
997,0.044133,-0.145509,-1.678940,1.650109
998,-0.829694,0.403079,1.201093,-0.587390


The statement np.sign(data) produces 1 and –1 values based on whether the values
in data are positive or negative:


In [26]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,1.0,1.0
1,1.0,1.0,-1.0,1.0
2,1.0,-1.0,1.0,1.0
3,1.0,1.0,1.0,-1.0
4,1.0,-1.0,-1.0,-1.0
