# Outliers

###### Detect
 - Maximum & Minimum values
 - Percentile
 - Mean & standard deviation
 - Interquartile range (IQR)

###### Eliminate
 - Trimming
 - Capping

## 1. Using maximum and minimum values

In [None]:
# Load the 'outliner' sheet of dataset.xlsx into a DataFrame
import pandas as pd
df = pd.read_excel('dataset.xlsx', sheet_name='outliner')

In [None]:
# Display the loaded DataFrame
df

In [None]:
# Summary statistics of the raw data
df.describe()

In [None]:
import seaborn as sns
# Box plot of the raw salaries. The column is passed as x= by keyword:
# newer seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df['salary'])

In [None]:
# Distribution of the raw salaries. histplot with a KDE overlay replaces the
# deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df['salary'], kde=True, stat='density', kde_kws={'cut': 3})

### Find the outlier boundaries

In [None]:
import numpy as np
# Sort the salaries once, then take the second-smallest and second-largest
# values as the limits; anything strictly outside them is treated as an outlier.
sorted_salary = np.sort(df['salary'].values)
lower_limit = sorted_salary[1]
upper_limit = sorted_salary[-2]

In [None]:
# 0 1 2 3 ... 199 200 201 202

In [None]:
# Show the lower limit (second-smallest salary)
lower_limit

In [None]:
# Show the upper limit (second-largest salary)
upper_limit

### Trimming the outliers

In [None]:
# Keep only the rows whose salary is at or above the lower limit.
df_trim = df[df['salary'] >= lower_limit]

In [None]:
# Summary statistics of the untrimmed frame, for comparison
df.describe()

In [None]:
# Summary statistics after dropping rows below lower_limit
df_trim.describe()

In [None]:
# Additionally drop the rows whose salary exceeds the upper limit.
df_trim = df_trim[df_trim['salary'] <= upper_limit]

In [None]:
# Summary statistics after also dropping rows above upper_limit
df_trim.describe()

In [None]:
# Alternative: apply both bounds in a single statement
# df_trim = df[(df['salary'] <= upper_limit) & (df['salary'] >= lower_limit)]

In [None]:
# Box plot of the original salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df['salary'])

In [None]:
# Box plot of the trimmed salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df_trim['salary'])

In [None]:
# Distribution of the trimmed salaries. histplot with a KDE overlay replaces
# the deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df_trim['salary'], kde=True, stat='density', kde_kws={'cut': 3})

### Clipping (capping) the outliers

In [None]:
# Work on an independent copy so the original frame stays untouched.
df_clip = df.copy(deep=True)

In [None]:
# Cap the low end: every occurrence of the smallest salary becomes lower_limit.
df_clip['salary'] = df['salary'].replace(to_replace=df['salary'].min(), value=lower_limit)

In [None]:
# Summary statistics after capping the minimum
df_clip.describe()

In [None]:
# Cap the high end: every occurrence of the largest salary becomes upper_limit.
df_clip['salary'] = df_clip['salary'].replace(to_replace=df_clip['salary'].max(), value=upper_limit)

In [None]:
# Summary statistics after capping the maximum as well
df_clip.describe()

In [None]:
# Box plot of the capped salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df_clip['salary'])

In [None]:
# Distribution of the capped salaries. histplot with a KDE overlay replaces
# the deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df_clip['salary'], kde=True, stat='density', kde_kws={'cut': 3})

## 2. Using percentile

In [None]:
# Re-read the 'outliner' sheet so this section starts from the raw data
import pandas as pd
df = pd.read_excel('dataset.xlsx', sheet_name='outliner')

In [None]:
# Display the reloaded DataFrame
df

In [None]:
# Summary statistics of the raw data
df.describe()

In [None]:
import seaborn as sns
# Box plot of the raw salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df['salary'])

In [None]:
# Distribution of the raw salaries. histplot with a KDE overlay replaces the
# deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df['salary'], kde=True, stat='density', kde_kws={'cut': 3})

### Find the outlier boundaries

In [None]:
# Boundaries at the 1st and 99th percentiles of salary.
lower_limit = df['salary'].quantile(q=0.01)
upper_limit = df['salary'].quantile(q=0.99)

In [None]:
# Show the upper limit (99th percentile)
upper_limit

In [None]:
# Show the lower limit (1st percentile)
lower_limit

### Trimming the outliers

In [None]:
# Keep the rows whose salary lies within [lower_limit, upper_limit], inclusive.
df_trim = df[df['salary'].between(lower_limit, upper_limit)]

In [None]:
# Summary statistics of the trimmed frame
df_trim.describe()

In [None]:
# Box plot of the trimmed salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df_trim['salary'])

In [None]:
# Distribution of the trimmed salaries. histplot with a KDE overlay replaces
# the deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df_trim['salary'], kde=True, stat='density', kde_kws={'cut': 3})

### Clipping (capping) the outliers

In [None]:
# Work on an independent copy so the original frame stays untouched.
df_clip = df.copy(deep=True)

In [None]:
# Raise every salary below lower_limit up to lower_limit. `clip` is used
# instead of `replace`: a Series passed as `to_replace` is handled as a
# dict-like, which Series.replace rejects when combined with a scalar value
# on recent pandas versions.
df_clip['salary'] = df['salary'].clip(lower=lower_limit)

In [None]:
# Summary statistics after capping the low end
df_clip.describe()

In [None]:
# Lower every salary above upper_limit down to upper_limit. `clip` is used
# instead of `replace`: a Series passed as `to_replace` is handled as a
# dict-like, which Series.replace rejects when combined with a scalar value
# on recent pandas versions.
df_clip['salary'] = df_clip['salary'].clip(upper=upper_limit)

In [None]:
# Summary statistics after capping the high end as well
df_clip.describe()

In [None]:
# Box plot of the original salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df['salary'])

In [None]:
# Box plot of the capped salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df_clip['salary'])

In [None]:
# Distribution of the capped salaries. histplot with a KDE overlay replaces
# the deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df_clip['salary'], kde=True, stat='density', kde_kws={'cut': 3})

### Change the boundaries to the 10th and 90th percentiles

In [None]:
# Tighter boundaries at the 10th and 90th percentiles of salary.
lower_limit = df['salary'].quantile(q=0.10)
upper_limit = df['salary'].quantile(q=0.90)

In [None]:
# Show the upper limit (90th percentile)
upper_limit

In [None]:
# Show the lower limit (10th percentile)
lower_limit

In [None]:
# Keep the rows whose salary lies within [lower_limit, upper_limit], inclusive.
df_trim = df[df['salary'].between(lower_limit, upper_limit)]

In [None]:
# Summary statistics of the frame trimmed at the 10th/90th percentiles
df_trim.describe()

In [None]:
# Box plot of the trimmed salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df_trim['salary'])

In [None]:
# Distribution of the trimmed salaries. histplot with a KDE overlay replaces
# the deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df_trim['salary'], kde=True, stat='density', kde_kws={'cut': 3})

## 3. Using the Interquartile Range (IQR)

In [None]:
# Re-read the 'outliner' sheet so this section starts from the raw data
import pandas as pd
df = pd.read_excel('dataset.xlsx', sheet_name='outliner')

In [None]:
# Summary statistics of the raw data
df.describe()

In [None]:
import seaborn as sns
# Box plot of the raw salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df['salary'])

In [None]:
# Distribution of the raw salaries. histplot with a KDE overlay replaces the
# deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df['salary'], kde=True, stat='density', kde_kws={'cut': 3})

### Find the outlier boundaries

In [None]:
# First and third quartiles of salary; their difference is the IQR.
q3 = df['salary'].quantile(q=0.75)
q1 = df['salary'].quantile(q=0.25)
iqr = q3 - q1

In [None]:
# Boundaries sit 1.5 * IQR below the first quartile and above the third.
lower_limit, upper_limit = q1 - (1.5 * iqr), q3 + (1.5 * iqr)

In [None]:
# Show the upper limit (q3 + 1.5 * IQR)
upper_limit

In [None]:
# Show the lower limit (q1 - 1.5 * IQR)
lower_limit

### Trimming the outliers

In [None]:
# Keep the rows whose salary lies within [lower_limit, upper_limit], inclusive.
df_trim = df[df['salary'].between(lower_limit, upper_limit)]

In [None]:
# Summary statistics of the IQR-trimmed frame
df_trim.describe()

In [None]:
# Box plot of the trimmed salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df_trim['salary'])

In [None]:
# Distribution of the trimmed salaries. histplot with a KDE overlay replaces
# the deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df_trim['salary'], kde=True, stat='density', kde_kws={'cut': 3})

### Clipping (capping) the outliers

In [None]:
# Work on an independent copy so the original frame stays untouched.
df_clip = df.copy(deep=True)

In [None]:
# Raise every salary below lower_limit up to lower_limit. `clip` is used
# instead of `replace`: a Series passed as `to_replace` is handled as a
# dict-like, which Series.replace rejects when combined with a scalar value
# on recent pandas versions.
df_clip['salary'] = df['salary'].clip(lower=lower_limit)

In [None]:
# Summary statistics after capping the low end
df_clip.describe()

In [None]:
# Lower every salary above upper_limit down to upper_limit. `clip` is used
# instead of `replace`: a Series passed as `to_replace` is handled as a
# dict-like, which Series.replace rejects when combined with a scalar value
# on recent pandas versions.
df_clip['salary'] = df_clip['salary'].clip(upper=upper_limit)

In [None]:
# Summary statistics after capping the high end as well
df_clip.describe()

In [None]:
# Box plot of the capped salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df_clip['salary'])

In [None]:
# Distribution of the capped salaries. histplot with a KDE overlay replaces
# the deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df_clip['salary'], kde=True, stat='density', kde_kws={'cut': 3})

### Alternative: clipping the outliers with `np.where`

In [None]:
import numpy as np
# Cap both ends in one assignment: salaries at or above upper_limit become
# upper_limit, those at or below lower_limit become lower_limit, and all
# others keep their original value. The upper check takes precedence.
at_or_above_upper = df['salary'] >= upper_limit
at_or_below_lower = df['salary'] <= lower_limit
capped_low = np.where(at_or_below_lower, lower_limit, df['salary'])
df_clip['salary'] = np.where(at_or_above_upper, upper_limit, capped_low)

In [None]:
# Summary statistics of the frame clipped with np.where
df_clip.describe()

In [None]:
# Box plot of the capped salaries. x= is passed by keyword because newer
# seaborn releases no longer accept the data vector positionally as x.
sns.boxplot(x=df_clip['salary'])

In [None]:
# Distribution of the capped salaries. histplot with a KDE overlay replaces
# the deprecated distplot; stat='density' keeps the y-axis on a density scale.
sns.histplot(x=df_clip['salary'], kde=True, stat='density', kde_kws={'cut': 3})