## Evaluation for Scaler Class

- Evaluates the effect of preprocessy's `Scaler().execute()` function on model accuracy, compared to the equivalent sklearn scalers
- Evaluates on 1 dataset
    * Melbourne Housing Snapshot
- Considering a standard test size of 0.3 for all 3 cases i.e.
    * MinMaxScaling
    * StandardScaling
    * BinaryScaling
- Using RandomForestRegressor() model
- Using r2_score of sklearn.metrics
- Comparisons between sklearn and preprocessy based on accuracy and time have been indicated at the end

In [1]:
# Put the parent directory on the import path so the preprocessy package can
# be imported from this notebook. Required in .ipynb files.
import os
import sys

module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
import matplotlib

from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')

from preprocessy.scaling import Scaler
from preprocessy.data_splitting import Split
import time

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error,classification_report, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

from preprocessy.handlenullvalues import NullValuesHandler

np.random.seed(101)

In [3]:
# Load the Melbourne housing data used by every experiment below.
melb_data = pd.read_csv('../datasets/handling_null_values/melb_data.csv')

# Take a real copy: a plain assignment only aliases `melb_data`, so an in-place
# change made through either name would silently show up in the other.
melb_data_copy2 = melb_data.copy()

# One result table per scaler; rows are filled in later via
# .loc['sklearn'] / .loc['Preprocessy'].
dtf_1 = pd.DataFrame(columns=['Accuracy', 'Time'])
dtf_2 = pd.DataFrame(columns=['Accuracy', 'Time'])
dtf_3 = pd.DataFrame(columns=['Accuracy', 'Time'])

melb_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
# Consider Price as Target property and others as Predictors 
melb_target = melb_data.Price

melb_predictors = melb_data.drop(['Price'], axis=1)
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])
col_names = list(melb_numeric_predictors.columns)
col_names

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Propertycount']

In [5]:
# The four columns with the most missing values.
melb_data_copy2.isna().sum().sort_values(ascending=False).head(4)

BuildingArea    6450
YearBuilt       5375
CouncilArea     1369
Car               62
dtype: int64

In [6]:
# Number of rows in the dataset.
melb_data_copy2.shape[0]

13580

## Fill Null Values and split into train and test dataframes

In [7]:
# Keep the non-object columns and replace each missing value with its column mean.
numeric_df = melb_data_copy2.select_dtypes(exclude=['object'])
imputed_df = numeric_df.fillna(numeric_df.mean())

# 70/30 split with a fixed seed; these frames are the ones handed to preprocessy.
train, test = train_test_split(imputed_df, test_size=0.3, random_state=69)

In [8]:
# Confirm that no missing values remain after imputation.
imputed_df.isna().sum().sort_values(ascending=False).head(4)

Rooms       0
Price       0
Distance    0
Postcode    0
dtype: int64

In [9]:
# First two rows of the imputed frame.
imputed_df.head(2)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,2,1480000.0,2.5,3067.0,2.0,1.0,1.0,202.0,151.96765,1964.684217,-37.7996,144.9984,4019.0
1,2,1035000.0,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.8079,144.9934,4019.0


In [10]:
# First two rows of the training split.
train.head(2)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
10872,4,478000.0,27.2,3024.0,4.0,2.0,2.0,400.0,151.54,2008.0,-37.87047,144.59864,5262.0
6603,2,555000.0,5.1,3181.0,2.0,1.0,1.0,0.0,151.96765,1990.0,-37.8585,145.0004,4380.0


In [11]:
# First two rows of the test split.
test.head(2)

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
10059,3,518000.0,10.5,3020.0,3.0,1.0,1.0,322.0,151.96765,1964.684217,-37.79209,144.82024,6763.0
6844,3,1110000.0,1.8,3053.0,3.0,2.0,1.0,0.0,151.96765,1964.684217,-37.7946,144.9716,6786.0


## MinMaxScaler
- Rescales every feature to a fixed range (`[0, 1]` by default), which results in smaller standard deviations

### sklearn

In [12]:
mm_scaler = preprocessing.MinMaxScaler()

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Split first, with the same test size and seed used for the train/test frames
# that are handed to preprocessy.
X_train, X_test, y_train, y_test = train_test_split(
    imputed_df.drop(columns=['Price']),
    imputed_df['Price'],
    test_size=0.3,
    random_state=69,
)

# Fit the scaler on the training rows only and reuse it on the test rows, so
# that no test-set statistics leak into the features the model is trained on.
X_train = pd.DataFrame(
    mm_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    mm_scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Fit RandomForestRegressor model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

end = time.perf_counter()

sklearn_preds = model.predict(X_test)

# Elapsed time covers split + scaling + model fit; accuracy is R^2 on the test rows.
sklearn_time = np.round(end - start, 4)
sklearn_accuracy = np.round(r2_score(y_test, sklearn_preds), 4)

# Print the error metrics (MSE is computed once and reused for the RMSE).
mse = mean_squared_error(y_test, sklearn_preds)
print('Mean Absolute Error:', mean_absolute_error(y_test, sklearn_preds))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Record the sklearn row of the MinMaxScaler result table.
dtf_1.loc['sklearn'] = [sklearn_accuracy, sklearn_time]

Mean Absolute Error: 174131.8521815462
Mean Squared Error: 89052272815.75325
Root Mean Squared Error: 298416.2743815311


### preprocessy

In [13]:
def preprocessy_score_dataset(params):
    """Scale with preprocessy's ``Scaler``, fit a random forest and score it.

    Parameters
    ----------
    params : dict
        Must contain ``"target_label"`` (name of the target column) and the
        ``"train_df"`` / ``"test_df"`` DataFrames. A copy of the dict is
        forwarded to ``Scaler().execute``; any other keys are interpreted by
        ``Scaler``, not by this function.

    Returns
    -------
    tuple
        ``(r2, seconds)``, both rounded to 4 decimals. ``seconds`` covers the
        ``Scaler().execute`` call, the feature/target split and the model fit;
        prediction is not timed.

    Notes
    -----
    Prints the mean absolute error, mean squared error and root mean squared
    error of the predictions on the test frame.
    """
    target_col = params["target_label"]

    # Work on private copies so the caller's frames, which are reused for every
    # scaler type, cannot be altered by execute() from one run to the next.
    scaler_params = dict(params)
    scaler_params["train_df"] = params["train_df"].copy()
    scaler_params["test_df"] = params["test_df"].copy()

    # perf_counter() is the monotonic clock intended for interval timing.
    start = time.perf_counter()

    # NOTE(review): this assumes execute() leaves the scaled frames under
    # "train_df" / "test_df" -- confirm against the Scaler implementation. The
    # recorded outputs are identical for all three scaler types, which hints
    # that the model may not be seeing scaled features.
    Scaler().execute(scaler_params)

    train_df = scaler_params["train_df"]
    test_df = scaler_params["test_df"]

    # Train dataset. The target is selected as a 1-D Series: passing a
    # single-column DataFrame makes sklearn emit a DataConversionWarning.
    X_train = train_df.drop(columns=target_col)
    y_train = train_df[target_col]

    # Test dataset
    X_test = test_df.drop(columns=target_col)
    y_test = test_df[target_col]

    # Fit RandomForestRegressor model
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)

    end = time.perf_counter()

    preprocessy_preds = model.predict(X_test)

    # Get time and accuracy
    preprocessy_time = np.round(end - start, 4)
    preprocessy_accuracy = np.round(r2_score(y_test, preprocessy_preds), 4)

    # Print the error metrics (MSE is computed once and reused for the RMSE).
    mse = mean_squared_error(y_test, preprocessy_preds)
    print('Mean Absolute Error:', mean_absolute_error(y_test, preprocessy_preds))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))

    return preprocessy_accuracy, preprocessy_time

In [14]:
# Configuration passed to preprocessy for the MinMaxScaler run.
params = {
    "train_df": train,
    "test_df": test,
    "target_label": "Price",
    "test_size": 0.3,
    "type": "MinMaxScaler",
}

In [15]:
# Score the MinMaxScaler configuration and record it beside the sklearn row.
acc, t = preprocessy_score_dataset(params)

dtf_1.loc['Preprocessy'] = [acc, t]

Mean Absolute Error: 174010.92741858942
Mean Squared Error: 88951856442.0704
Root Mean Squared Error: 298247.9781022336


## StandardScaler

### sklearn

In [16]:
s_scaler = preprocessing.StandardScaler()

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Split first, with the same test size and seed used for the train/test frames
# that are handed to preprocessy.
X_train, X_test, y_train, y_test = train_test_split(
    imputed_df.drop(columns=['Price']),
    imputed_df['Price'],
    test_size=0.3,
    random_state=69,
)

# Fit the scaler on the training rows only and reuse it on the test rows, so
# that no test-set statistics leak into the features the model is trained on.
X_train = pd.DataFrame(
    s_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    s_scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Fit RandomForestRegressor model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

end = time.perf_counter()

sklearn_preds = model.predict(X_test)

# Elapsed time covers split + scaling + model fit; accuracy is R^2 on the test rows.
sklearn_time = np.round(end - start, 4)
sklearn_accuracy = np.round(r2_score(y_test, sklearn_preds), 4)

# Print the error metrics (MSE is computed once and reused for the RMSE).
mse = mean_squared_error(y_test, sklearn_preds)
print('Mean Absolute Error:', mean_absolute_error(y_test, sklearn_preds))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Record the sklearn row of the StandardScaler result table.
dtf_2.loc['sklearn'] = [sklearn_accuracy, sklearn_time]

Mean Absolute Error: 174062.09716600043
Mean Squared Error: 88994429193.78366
Root Mean Squared Error: 298319.3409649861


### preprocessy

In [17]:
# Configuration passed to preprocessy for the StandardScaler run.
params = {
    "train_df": train,
    "test_df": test,
    "target_label": "Price",
    "test_size": 0.3,
    "type": "StandardScaler",
}

# Score it and record the result beside the sklearn row.
acc, t = preprocessy_score_dataset(params)
dtf_2.loc['Preprocessy'] = [acc, t]

Mean Absolute Error: 174010.92741858942
Mean Squared Error: 88951856442.0704
Root Mean Squared Error: 298247.9781022336


## BinaryScaler

### sklearn

In [18]:
b_scaler = preprocessing.Binarizer()

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

# Split first, with the same test size and seed used for the train/test frames
# that are handed to preprocessy.
X_train, X_test, y_train, y_test = train_test_split(
    imputed_df.drop(columns=['Price']),
    imputed_df['Price'],
    test_size=0.3,
    random_state=69,
)

# Binarize the features. Same fit-on-train / transform-test structure as the
# other sklearn benchmark cells.
X_train = pd.DataFrame(
    b_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    b_scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Fit RandomForestRegressor model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

end = time.perf_counter()

sklearn_preds = model.predict(X_test)

# Elapsed time covers split + scaling + model fit; accuracy is R^2 on the test rows.
sklearn_time = np.round(end - start, 4)
sklearn_accuracy = np.round(r2_score(y_test, sklearn_preds), 4)

# Print the error metrics (MSE is computed once and reused for the RMSE).
mse = mean_squared_error(y_test, sklearn_preds)
print('Mean Absolute Error:', mean_absolute_error(y_test, sklearn_preds))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Record the sklearn row of the BinaryScaler result table.
dtf_3.loc['sklearn'] = [sklearn_accuracy, sklearn_time]

Mean Absolute Error: 442584.36487377214
Mean Squared Error: 405718702255.6247
Root Mean Squared Error: 636960.5186003484


### preprocessy

In [19]:
# Configuration passed to preprocessy for the BinaryScaler run.
params = {
    "train_df": train,
    "test_df": test,
    "target_label": "Price",
    "test_size": 0.3,
    "type": "BinaryScaler",
}

In [20]:
# Score the BinaryScaler configuration and record it beside the sklearn row.
acc, t = preprocessy_score_dataset(params=params)

dtf_3.loc['Preprocessy'] = [acc, t]

Mean Absolute Error: 174010.92741858942
Mean Squared Error: 88951856442.0704
Root Mean Squared Error: 298247.9781022336


#### MinMaxScaler

In [21]:
# R^2 score ("Accuracy") and elapsed seconds ("Time") recorded for the MinMaxScaler runs.
dtf_1

Unnamed: 0,Accuracy,Time
sklearn,0.7946,4.1067
Preprocessy,0.7948,4.0594


#### StandardScaler

In [22]:
# R^2 score ("Accuracy") and elapsed seconds ("Time") recorded for the StandardScaler runs.
dtf_2

Unnamed: 0,Accuracy,Time
sklearn,0.7947,4.0521
Preprocessy,0.7948,4.0472


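#### BinaryScaler

**Note:** the Preprocessy row is identical in all three tables (accuracy 0.7948, with the same error values to every digit), even here, where sklearn's `Binarizer` reduces the accuracy to 0.0643. This suggests that the preprocessy runs trained the model on the same feature values each time, so the Preprocessy results should be verified before drawing conclusions from this comparison.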

In [23]:
# R^2 score ("Accuracy") and elapsed seconds ("Time") recorded for the BinaryScaler runs.
dtf_3

Unnamed: 0,Accuracy,Time
sklearn,0.0643,0.2084
Preprocessy,0.7948,3.9504
