# Import the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime, date
from scipy import stats
import warnings
warnings.filterwarnings("ignore")

# Load the dataset

In [2]:
# Read the preprocessed, feature-selected Ames housing table from the working directory.
df = pd.read_csv(filepath_or_buffer='ames_housing_preprocessed_feat_sel.csv')
# Cell output: (number of rows, number of columns).
df.shape

(1460, 57)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,1stFlrSF,3SsnPorch,BldgType,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFullBath,BsmtHalfBath,BsmtQual,...,SaleCondition,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Sale Price
0,856,0,0,3,706,0,2,1,0,2,...,4,0,1,8,856,0,2003,2003,2,12.247699
1,1262,0,0,1,978,0,0,0,1,2,...,4,0,1,6,1262,298,1976,1976,1,12.109016
2,920,0,0,2,486,0,2,1,0,2,...,4,0,1,6,920,0,2001,2002,2,12.317171
3,961,0,0,3,216,0,0,1,0,4,...,0,0,1,7,756,0,1915,1970,0,11.849405
4,1145,0,0,0,655,0,2,1,0,2,...,4,0,1,9,1145,192,2000,2000,2,12.42922


# Normalize the dataset

In [5]:
# Standardize every column of df (StandardScaler: zero mean, unit variance).
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# fit_transform returns a bare ndarray, so rebuild the frame with df's own
# column labels AND row index in a single step. Carrying the index over keeps
# df_norm row-aligned with df even if df does not have a default 0..n-1 index.
df_norm = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)
df_norm.head()

Unnamed: 0,1stFlrSF,3SsnPorch,BldgType,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFullBath,BsmtHalfBath,BsmtQual,...,SaleCondition,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Sale Price
0,-0.793434,-0.116339,-0.411691,0.591905,0.575425,-0.288653,-0.510942,1.10781,-0.241061,-0.566939,...,0.208502,-0.270208,0.064238,0.91221,-0.459303,-0.752176,1.050994,0.878668,0.138777,0.560067
1,0.25714,-0.116339,-0.411691,-1.124444,1.171992,-0.288653,-1.399536,-0.819964,3.948809,-0.566939,...,0.208502,-0.270208,0.064238,-0.318683,0.466465,1.626195,0.156734,-0.429577,-0.614439,0.212763
2,-0.627826,-0.116339,-0.411691,-0.266269,0.092907,-0.288653,-0.510942,1.10781,-0.241061,-0.566939,...,0.208502,-0.270208,0.064238,-0.318683,-0.313369,-0.752176,0.984752,0.830215,0.138777,0.734046
3,-0.521734,-0.116339,-0.411691,0.591905,-0.499274,-0.288653,-1.399536,1.10781,-0.241061,0.99777,...,-3.426284,-0.270208,0.064238,0.296763,-0.687324,-0.752176,-1.863632,-0.720298,-1.367655,-0.437383
4,-0.045611,-0.116339,-0.411691,-1.982619,0.463568,-0.288653,-0.510942,1.10781,-0.241061,-0.566939,...,0.208502,-0.270208,0.064238,1.527656,0.19968,0.780197,0.951632,0.733308,0.138777,1.014651


# Use PyOD to detect anomalies

In [6]:
from pyod.models.iforest import IForest

# Use an integer seed rather than a shared np.random.RandomState instance.
# A RandomState object is advanced by every fit that consumes it, so re-running
# the fitting cell without re-running this one would flag different rows each
# time. With an integer seed each fit starts from the same RNG state.
random_state = 42

# Expected share of outliers in the data; handed to the detector as `contamination`.
outliers_fraction = 0.05

# Detector name -> configured (unfitted) PyOD model.
classifiers = {
    'Isolation Forest': IForest(contamination=outliers_fraction, random_state=random_state),
}

In [9]:


# Fit each configured detector on the standardized data and label every row.
# y_pred and dfx are left in the notebook namespace on purpose: later cells
# read them (they hold the results of the last detector in `classifiers`).
for clf_name, clf in classifiers.items():
    clf.fit(df_norm)

    # Raw anomaly scores with the sign flipped. PyOD's decision_function gives
    # larger values to more anomalous rows, so after negation lower means more
    # anomalous. Not used further below; kept available for inspection.
    scores_pred = clf.decision_function(df_norm) * -1

    # Binary label per row: 1 = outlier, 0 = inlier.
    y_pred = clf.predict(df_norm)
    n_outliers = np.count_nonzero(y_pred == 1)
    n_inliers = len(y_pred) - n_outliers

    # Standardized features plus the outlier flag, on an independent copy so
    # df_norm itself is left untouched.
    dfx = df_norm.copy(deep=True)
    dfx['outlier'] = y_pred.tolist()

    print('OUTLIERS : ',n_outliers,'INLIERS : ',n_inliers, clf_name)

OUTLIERS :  73 INLIERS :  1387 Isolation Forest


<Figure size 720x720 with 0 Axes>

In [13]:
# Attach the outlier flag to the original (unscaled) data for inspection.
# Take an explicit copy: pd.DataFrame(df) does not copy by default, and
# depending on the pandas version the new frame can share state with df, so
# adding a column here could leak an 'outlier' column into df and into any
# re-run of the scaling / fitting cells.
dfoo = df.copy()
dfoo['outlier'] = y_pred.tolist()
dfoo.head()

Unnamed: 0,1stFlrSF,3SsnPorch,BldgType,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFullBath,BsmtHalfBath,BsmtQual,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Sale Price,outlier
0,856,0,0,3,706,0,2,1,0,2,...,0,1,8,856,0,2003,2003,2,12.247699,0
1,1262,0,0,1,978,0,0,0,1,2,...,0,1,6,1262,298,1976,1976,1,12.109016,0
2,920,0,0,2,486,0,2,1,0,2,...,0,1,6,920,0,2001,2002,2,12.317171,0
3,961,0,0,3,216,0,0,1,0,4,...,0,1,7,756,0,1915,1970,0,11.849405,0
4,1145,0,0,0,655,0,2,1,0,2,...,0,1,9,1145,192,2000,2000,2,12.42922,0


In [15]:
# Show at most the first 25 rows whose 'outlier' flag equals 1.
is_outlier = dfoo['outlier'] == 1
dfoo.loc[is_outlier].head(25)

Unnamed: 0,1stFlrSF,3SsnPorch,BldgType,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFullBath,BsmtHalfBath,BsmtQual,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Sale Price,outlier
39,1152,0,2,4,0,0,4,0,0,3,...,0,1,6,0,0,1955,1955,2,11.314487,1
53,1842,0,0,1,1810,0,2,2,0,0,...,0,1,5,1842,857,1981,1987,0,12.861001,1
58,1426,0,0,1,0,0,6,0,0,0,...,0,1,10,1410,192,2006,2006,0,12.991756,1
88,1013,0,0,3,0,0,6,0,0,4,...,0,1,6,1013,0,1915,1982,3,11.350418,1
108,997,0,0,3,0,0,6,0,0,4,...,0,1,7,793,0,1919,2005,1,11.652696,1
178,2234,0,0,3,1904,0,2,1,0,0,...,0,1,9,2216,0,2008,2009,3,13.126033,1
185,1518,0,0,2,0,0,6,0,0,4,...,410,1,12,1107,0,1892,1993,0,13.071072,1
197,1360,0,0,3,1036,184,2,1,1,4,...,0,1,8,1360,0,1918,1990,0,12.367345,1
249,1444,0,0,1,697,0,5,0,1,2,...,0,1,7,1444,0,1958,2006,1,12.531776,1
250,1306,0,0,3,0,0,6,0,0,4,...,0,1,6,728,263,1940,1966,4,11.245059,1


In [17]:
# Preview the first five rows of the standardized frame with its outlier flag.
dfx.head(n=5)

Unnamed: 0,1stFlrSF,3SsnPorch,BldgType,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFullBath,BsmtHalfBath,BsmtQual,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold,Sale Price,outlier
0,-0.793434,-0.116339,-0.411691,0.591905,0.575425,-0.288653,-0.510942,1.10781,-0.241061,-0.566939,...,-0.270208,0.064238,0.91221,-0.459303,-0.752176,1.050994,0.878668,0.138777,0.560067,0
1,0.25714,-0.116339,-0.411691,-1.124444,1.171992,-0.288653,-1.399536,-0.819964,3.948809,-0.566939,...,-0.270208,0.064238,-0.318683,0.466465,1.626195,0.156734,-0.429577,-0.614439,0.212763,0
2,-0.627826,-0.116339,-0.411691,-0.266269,0.092907,-0.288653,-0.510942,1.10781,-0.241061,-0.566939,...,-0.270208,0.064238,-0.318683,-0.313369,-0.752176,0.984752,0.830215,0.138777,0.734046,0
3,-0.521734,-0.116339,-0.411691,0.591905,-0.499274,-0.288653,-1.399536,1.10781,-0.241061,0.99777,...,-0.270208,0.064238,0.296763,-0.687324,-0.752176,-1.863632,-0.720298,-1.367655,-0.437383,0
4,-0.045611,-0.116339,-0.411691,-1.982619,0.463568,-0.288653,-0.510942,1.10781,-0.241061,-0.566939,...,-0.270208,0.064238,1.527656,0.19968,0.780197,0.951632,0.733308,0.138777,1.014651,0


In [18]:
# Persist the standardized data together with the outlier flag; the row index is not written.
dfx.to_csv(
    path_or_buf='ames_data_with_feat_sel_outlier_status.csv',
    index=False,
)