<a href="https://colab.research.google.com/github/suchanbo/programming/blob/master/Isolation_Forest_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Isolation Forest Example**

In [None]:
# import the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import warnings
# Silence every warning, in every category, until the filter is changed.
# NOTE(review): this blanket filter also hides pandas / scikit-learn warnings that may
# point at real problems in later cells - consider narrowing it to a specific category.
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset.
# The path is relative, so the CSV has to be in the notebook's working directory.
df = pd.read_csv('Employee_Salary_Dataset.csv')
# Keep only the columns used in this walkthrough:
# ID, Experience_Years, Age, Gender, Salary
df = df[['ID', 'Experience_Years', 'Age', 'Gender', 'Salary']]
# Preview the first five records. A cell renders only its final expression,
# so this is the single inspection statement in the cell.
df.head(5)

Unnamed: 0,ID,Experience_Years,Age,Gender,Salary
0,1,5,28,Female,250000
1,2,1,21,Male,50000
2,3,3,23,Female,170000
3,4,2,22,Male,25000
4,5,1,17,Male,10000


In [None]:
# Select only the salary column for the isolation forest model test.
# .copy() makes df_salary an independent frame, so columns added to it later
# are ordinary assignments rather than writes into a slice of df
# (which pandas reports with SettingWithCopyWarning).
df_salary = df[['Salary']].copy()
df_salary.head(5)


Unnamed: 0,Salary
0,250000
1,50000
2,170000
3,25000
4,10000


In [None]:
# Build the isolation forest, then train it on the single Salary feature.
# Settings: 1000 trees, 'auto' sub-sample size, contamination of 0.3,
# max_features of 1.0, and a fixed random_state (0) so repeated runs match.
model = IsolationForest(
    n_estimators=1000,
    max_samples='auto',
    contamination=0.3,
    max_features=1.0,
    random_state=0,
)
salary_features = df_salary[['Salary']]
model.fit(salary_features)

In [None]:
# Get the anomaly score and the inlier/outlier flag for every salary.
# Features are read from df_salary itself (the frame being annotated), and
# .assign() builds a new frame instead of writing columns into a slice of df,
# so the cell can be re-run safely and does not depend on suppressed pandas warnings.
salary_features = df_salary[['Salary']]
df_salary = df_salary.assign(
    scores=model.decision_function(salary_features),  # negative score => flagged as anomaly
    anomaly=model.predict(salary_features),           # 1 = inlier, -1 = anomaly
)
# view the data, ordered by salary
df_salary.sort_values(by=['Salary'])


Unnamed: 0,Salary,scores,anomaly
26,3000,0.146115,1
23,6000,0.181805,1
29,6100,0.18261,1
14,7500,0.185777,1
24,8900,0.190545,1
7,9000,0.190582,1
4,10000,0.18416,1
18,15000,0.171313,1
25,20000,0.16748,1
3,25000,0.16621,1


In [None]:
# Collect every row the model flagged as anomalous (anomaly == -1).
is_anomalous = df_salary['anomaly'].eq(-1)
anomaly = df_salary[is_anomalous]
# Row labels of the flagged records, kept as a plain list.
anomaly_index = anomaly.index.tolist()
# Display: take the first 40 flagged rows (in index order), then order them by salary.
flagged_preview = anomaly.head(40)
flagged_preview.sort_values(by=['Salary'])

Unnamed: 0,Salary,scores,anomaly
11,1400000,-0.037059,-1
32,1540000,-0.056923,-1
28,5000000,-0.037755,-1
5,5001000,-0.037755,-1
12,6000050,-0.044214,-1
20,6570000,-0.002572,-1
22,6845000,-0.005663,-1
34,7600000,-0.024863,-1
17,7900000,-0.040408,-1
33,9300000,-0.114575,-1


In [None]:
# Attach the model outputs (scores, anomaly) to the full employee records.
# df_salary was derived from df, so both frames share the same row index; joining
# on that index yields exactly one output row per employee. Merging on "Salary"
# instead pairs every row with every other row that has the same salary, which
# duplicates employees whose salaries are not unique.
df_merged = df.join(df_salary.drop(columns=['Salary']))
# Guard against accidental row duplication or loss.
assert len(df_merged) == len(df), "join changed the number of rows"
df_merged.sort_values(by=['Salary'])


Unnamed: 0,ID,Experience_Years,Age,Gender,Salary,scores,anomaly
30,27,1,18,Male,3000,0.146115,1
27,24,1,21,Female,6000,0.181805,1
33,30,2,21,Female,6100,0.18261,1
19,15,2,23,Male,7500,0.185777,1
28,25,4,23,Female,8900,0.190545,1
13,8,2,21,Female,9000,0.190582,1
10,5,1,17,Male,10000,0.18416,1
23,19,2,21,Male,15000,0.171313,1
29,26,3,22,Female,20000,0.16748,1
9,22,4,26,Male,25000,0.16621,1


**Let's pass a salary amount to the model and check whether it is flagged as an anomaly**

In [None]:
# Test your model
# Model returns  1 ==> The amount is within the expected salary range
# Model returns -1 ==> The amount deviates from the expected salary range
# The model was fitted on a DataFrame with a 'Salary' column, so the sample is
# passed the same way; a bare nested list carries no feature names and makes
# scikit-learn emit a feature-name mismatch warning.
sample = pd.DataFrame({'Salary': [10000000]})
model.predict(sample)

array([-1])