In [92]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from pyod.models.iforest import IForest

In [93]:
data = pd.read_csv(r"D:\IForest\airbnb_melbourne.csv")
data.head()

Unnamed: 0,room_type,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365
0,Private room,60,1,4,1,365
1,Entire home/apt,95,3,42,10,0
2,Private room,1000,1,2,1,365
3,Entire home/apt,99,1,163,1,341
4,Private room,40,7,159,2,0


In [94]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18016 entries, 0 to 18015
Data columns (total 6 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   room_type                       18016 non-null  object
 1   price                           18016 non-null  int64 
 2   minimum_nights                  18016 non-null  int64 
 3   number_of_reviews               18016 non-null  int64 
 4   calculated_host_listings_count  18016 non-null  int64 
 5   availability_365                18016 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 844.6+ KB


In [95]:
df = pd.get_dummies(data, drop_first = True)
df.head()

Unnamed: 0,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,60,1,4,1,365,0,1,0
1,95,3,42,10,0,0,0,0
2,1000,1,2,1,365,0,1,0
3,99,1,163,1,341,0,0,0
4,40,7,159,2,0,0,1,0


In [96]:
df.describe()

Unnamed: 0,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,room_type_Hotel room,room_type_Private room,room_type_Shared room
count,18016.0,18016.0,18016.0,18016.0,18016.0,18016.0,18016.0,18016.0
mean,173.141596,6.80484,25.825599,8.392096,136.493339,0.007937,0.339032,0.017262
std,382.689938,34.164457,51.853321,20.463457,144.361245,0.08874,0.473394,0.130251
min,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,70.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,110.0,2.0,5.0,1.0,87.0,0.0,0.0,0.0
75%,180.0,3.0,26.0,4.0,302.0,0.0,1.0,0.0
max,15000.0,1125.0,643.0,147.0,365.0,1.0,1.0,1.0


In [82]:
import sweetviz as sv

In [83]:
#my_report = sv.analyze(df)

In [84]:
#my_report.show_notebook()

## Hyperparamter Tuning

In [97]:
def outlier_classifier(model, data):
    
    # Getting labels
    labels = model.fit_predict(data)
    
    # Returning inliers
    return data[labels == 0]

In [98]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [99]:
def evaluate_regressor(inliners):
    
    X = inliners.drop('price', axis = 1)
    y = inliners[['price']]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 10)
    
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    
    preds = lr.predict(X_test)
    rmse = mean_squared_error(y_test, preds, squared = False)
    
    return round(rmse, 3)

In [100]:
estimators = [100, 300, 600, 1000, 1500]
max_samples = [0.6, 0.8, 1]
contaminations = [0.005, 0.01, 0.03, 0.05, 0.1, 0.2, 0.3]
max_features = [0.7, 0.8, 0.9, 1]
scores = dict()

In [101]:
from itertools import product

#list(product(contaminations, estimators, max_samples, max_features))

In [102]:
for a, b, c, d in product(estimators, max_samples, contaminations, max_features):
    
    # Instantiating an IForest
    iforest = IForest(n_estimators = a, max_samples = b, contamination = c, max_features = d, n_jobs = -1)
    
    # Getting the inliers with the current IForest
    inliers = outlier_classifier(iforest, df)
    
    # Calculating and storing RMSE into scores
    scores[(a, b, c, d)] = evaluate_regressor(inliers)



















In [113]:
# Find the minimum value of the values (which are integers)
min_value = min(scores.values())
print(min_value)

# Get the key for the minimum value
min_key = min(scores, key = scores.get) # 'c'

print(min_key)

83.282
(100, 0.8, 0.3, 0.7)


In [112]:
min_key = min(scores, key = scores.get) # 'c'

print(min_key)

(100, 0.8, 0.3, 0.7)


## Trying optimizing more by taking contamination values beyond 30% ----> 34, 38, 43, 48

In [114]:
estimators_2 = [100, 200]
max_samples_2 = [0.8, 1]
contaminations_2 = [0.2, 0.3, 0.34, 0.38, 0.43, 0.48]
max_features_2 = [0.7, 1]
scores_2 = dict()

In [115]:
for a, b, c, d in product(estimators_2, max_samples_2, contaminations_2, max_features_2):
    
    # Instantiating an IForest
    iforest = IForest(n_estimators = a, max_samples = b, contamination = c, max_features = d, n_jobs = -1)
    
    # Getting the inliers with the current IForest
    inliers = outlier_classifier(iforest, df)
    
    # Calculating and storing RMSE into scores
    scores_2[(a, b, c, d)] = evaluate_regressor(inliers)



In [116]:
# Find the minimum value of the values (which are integers)
min_value_2 = min(scores_2.values())
print(min_value_2)

# Get the key for the minimum value
min_key_2 = min(scores_2, key = scores_2.get) # 'c'

print(min_key_2)

76.787
(100, 0.8, 0.34, 0.7)
