## Evaluation for Scaler Class

- Evaluates the effect of preprocessy execute function on model accuracy compared to sklearn
- Evaluates on 1 dataset
    * Melbourne Housing Snapshot
- Considering a standard test size of 0.3 for all 3 cases i.e.
    * MinMaxScaling
    * StandardScaling
    * BinaryScaling
- Using RandomForestRegressor() model
- Using r2_score of sklearn.metrics
- Comparisons between sklearn and preprocessy based on accuracy and time have been indicated at the end

In [4]:
# To access preprocessy module. Required in .ipynb files
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [49]:
import pandas as pd
import numpy as np
import matplotlib

import mlxtend
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
import seaborn as sns
from sklearn import preprocessing

import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')

import warnings
warnings.filterwarnings('ignore')

from preprocessy.scaling import Scaler
from preprocessy.resampling import Split
import time

from sklearn.datasets import load_iris, load_boston, load_breast_cancer, load_diabetes
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error,classification_report, r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

from preprocessy.handlenullvalues import NullValuesHandler
from preprocessy.resampling import Split

np.random.seed(101)

In [50]:
melb_data = pd.read_csv('../datasets/handling_null_values/melb_data.csv')
melb_data_copy1 = melb_data
melb_data_copy2 = melb_data

dtf_1 = pd.DataFrame(columns = ['Accuracy', 'Time'])
dtf_2 = pd.DataFrame(columns = ['Accuracy', 'Time'])
dtf_3 = pd.DataFrame(columns = ['Accuracy', 'Time'])

melb_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [51]:
# Consider Price as Target property and others as Predictors 
melb_target = melb_data.Price

melb_predictors = melb_data.drop(['Price'], axis=1)
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])
col_names = list(melb_numeric_predictors.columns)
col_names

['Rooms',
 'Distance',
 'Postcode',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'YearBuilt',
 'Lattitude',
 'Longtitude',
 'Propertycount']

## MinMaxScaler
-  smaller standard deviations through the process

### sklearn

In [92]:
imputed_df = melb_data_copy2.select_dtypes(exclude=['object']).fillna(melb_data_copy2.select_dtypes(exclude=['object']).mean())

mm_scaler = preprocessing.MinMaxScaler()
df_mm = mm_scaler.fit_transform(imputed_df.drop(['Price'], axis =1))
df_mm = pd.DataFrame(df_mm, columns=col_names)
X_train, X_test, y_train, y_test = train_test_split(df_mm, imputed_df['Price'], test_size=0.3, random_state=69)

print(X_train[:2])
print(y_train[:2])
print()
print(X_test[:2])
print(y_test[:2])
print()

model = RandomForestRegressor()
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
print(max(scores))
print(estimators[scores.index(max(scores))])

          Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  \
10872  0.333333  0.565489  0.024565       0.2     0.250  0.2  0.000924   
6603   0.111111  0.106029  0.185261       0.1     0.125  0.1  0.000000   

       BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
10872      0.003404   0.987835   0.403194    0.152420       0.234241  
6603       0.003414   0.965937   0.418658    0.519479       0.193028  
10872    478000.0
6603     555000.0
Name: Price, dtype: float64

          Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  \
10059  0.222222  0.218295  0.020471      0.15     0.125  0.1  0.000744   
6844   0.222222  0.037422  0.054248      0.15     0.250  0.1  0.000000   

       BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
10059      0.003414   0.935139   0.504457    0.354880       0.304378  
6844       0.003414   0.935139   0.501214    0.493166       0.305453  
10059     518000.0
6844     1110000.0
Name: Price, dtype: float

In [93]:
mm_scaler = preprocessing.MinMaxScaler()
start = time.time()
imputed_df = melb_data_copy2.select_dtypes(exclude=['object']).fillna(melb_data_copy2.select_dtypes(exclude=['object']).mean())
df_mm = mm_scaler.fit_transform(imputed_df.drop(['Price'], axis =1))
df_mm = pd.DataFrame(df_mm, columns=col_names)
X_train, X_test, y_train, y_test = train_test_split(df_mm, imputed_df['Price'], test_size=0.3, random_state=69)
model = RandomForestRegressor(n_estimators = 120)
model.fit(X_train, y_train)
sklearn_preds = model.predict(X_test)
end=time.time()
sklearn_time = np.round(end - start,4)
sklearn_accuracy = np.round(r2_score(y_test, sklearn_preds),4)
print(sklearn_preds[:5])
print('Mean Absolute Error:', mean_absolute_error(y_test, sklearn_preds))
print('Mean Squared Error:', mean_squared_error(y_test, sklearn_preds))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, sklearn_preds)))
dtf_1.loc['sklearn'] = [sklearn_accuracy, sklearn_time]

[ 592510.         1223198.61111111 1653633.33333333  976995.83333333
  579470.83333333]
Mean Absolute Error: 174187.00462407758
Mean Squared Error: 89520017255.5589
Root Mean Squared Error: 299198.9593156348


### preprocessy

In [75]:
melb_data_copy1_x = melb_data_copy1.sample(frac=1).reset_index(drop=True)
params = {"df": melb_data_copy1_x.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "MinMaxScaler",  "fill_missing":  "mean"}
target_col = "Price"

NullValuesHandler().execute(params)
params["X"] = params.pop("df")
Split().train_test_split(params)
# this params has X, test_size, train_df, test_df, type
# To get: target_col, columns
params["train_df"] = params.pop('train')
params["test_df"] = params.pop('test')
params["columns"] = list(params["train_df"].columns)
params["target_col"] = target_col
Scaler().execute(params)

# print(params.keys())
y_train = params["train_df"][["Price"]]
X_train = params["train_df"].drop(target_col,axis =1)
X_test = params["test_df"].drop(target_col,axis =1)
y_test = params["test_df"][["Price"]]

print(X_train[:2])
print(y_train[:2])
print()
print(X_test[:2])
print(y_test[:2])
print()
model = RandomForestRegressor()
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
print(max(scores))
print(estimators[scores.index(max(scores))])

         Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  \
4074  0.222222  0.289030  0.192426      0.15  0.285714  0.3  0.003238   
4075  0.222222  0.172996  0.012282      0.20  0.142857  0.2  0.001238   

      BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
4074      0.022378   0.935139   0.320211    0.537139       0.238230  
4075      0.016640   0.886861   0.497080    0.421050       0.219604  
          Price
4074  2200000.0
4075  1151500.0

      Rooms  Distance  Postcode  Bedroom2  Bathroom    Car  Landsize  \
0  0.285714  0.374220  0.097236  0.333333     0.250  0.250  0.014876   
1  0.285714  0.135135  0.072671  0.222222     0.125  0.125  0.007348   

   BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
0      0.003414   0.686732   0.651479    0.622156       0.097986  
1      0.002157   0.419162   0.579061    0.480176       0.402832  
      Price
0  881017.0
1  995000.0

-0.617559884841071
80


In [80]:
def preprocessy_score_dataset(params, target_col, no_estimators):

    # params = {"df": melb_data_copy1.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "MinMaxScaler",  "fill_missing":  "mean"}

    start = time.time()
    NullValuesHandler().execute(params)
    params["X"] = params.pop("df")
    Split().train_test_split(params)
    # this params has X, test_size, train_df, test_df, type
    # To get: target_col
    params["train_df"] = params.pop('train')
    params["test_df"] = params.pop('test')
    params["columns"] = list(params["train_df"].columns)
    params["target_col"] = target_col
    Scaler().execute(params)

    # print(params.keys())
    y_train = params["train_df"][["Price"]]
    X_train = params["train_df"].drop(target_col,axis =1)
    X_test = params["test_df"].drop(target_col,axis =1)
    y_test = params["test_df"][["Price"]]
    print(y_test.head(5))
    model = RandomForestRegressor(n_estimators = no_estimators)
    model.fit(X_train, y_train)
    preprocessy_preds = model.predict(X_test)

    end=time.time()
    preprocessy_time = np.round(end - start,4)
    
    preprocessy_accuracy = np.round(r2_score(y_test, preprocessy_preds),4)
    print(preprocessy_preds[:5])
    print('Mean Absolute Error:', mean_absolute_error(y_test, preprocessy_preds))
    print('Mean Squared Error:', mean_squared_error(y_test, preprocessy_preds))
    print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, preprocessy_preds)))
    return preprocessy_accuracy, preprocessy_time

In [84]:
acc, t = preprocessy_score_dataset(params = {"df": melb_data_copy1_x.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "MinMaxScaler",  "fill_missing":  "mean"}, target_col = "Price", no_estimators = 80)

dtf_1.loc['Preprocessy'] = [acc, t]

       Price
0   881017.0
1   995000.0
2   902000.0
3  1075000.0
4  1450000.0
[1000362.5  1693773.6  2269425.   3300418.75 3493950.  ]
Mean Absolute Error: 676305.0681280245
Mean Squared Error: 816054743493.7233
Root Mean Squared Error: 903357.4837757881


## StandardScaler

### sklearn

In [98]:
imputed_df = melb_data_copy2.select_dtypes(exclude=['object']).fillna(melb_data_copy2.select_dtypes(exclude=['object']).mean())
s_scaler = preprocessing.StandardScaler()
df_s = s_scaler.fit_transform(imputed_df.drop(['Price'], axis =1))
df_s = pd.DataFrame(df_s, columns=col_names)
X_train, X_test, y_train, y_test = train_test_split(df_s, imputed_df['Price'], test_size=0.3, random_state=69)
model = RandomForestRegressor()
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
print(max(scores))
print(estimators[scores.index(max(scores))])

0.7998959670016117
50


In [110]:
s_scaler = preprocessing.StandardScaler()
start = time.time()
imputed_df = melb_data_copy2.select_dtypes(exclude=['object']).fillna(melb_data_copy2.select_dtypes(exclude=['object']).mean())
df_s = s_scaler.fit_transform(imputed_df.drop(['Price'], axis =1))
df_s = pd.DataFrame(df_s, columns=col_names)
X_train, X_test, y_train, y_test = train_test_split(df_s, imputed_df['Price'], test_size=0.3, random_state=69)
model = RandomForestRegressor(n_estimators = 50)
model.fit(X_train, y_train)
sklearn_preds = model.predict(X_test)
end=time.time()
sklearn_time = np.round(end - start,4)
sklearn_accuracy = np.round(r2_score(y_test, sklearn_preds),4)
print(sklearn_preds[:5])
print('Mean Absolute Error:', mean_absolute_error(y_test, sklearn_preds))
print('Mean Squared Error:', mean_squared_error(y_test, sklearn_preds))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, sklearn_preds)))
dtf_2.loc['sklearn'] = [sklearn_accuracy, sklearn_time]

[ 629990.         1232493.33333333 1663110.          993050.
  610126.        ]
Mean Absolute Error: 174914.23406597003
Mean Squared Error: 89985125605.21092
Root Mean Squared Error: 299975.208317639


### preprocessy

In [86]:
params = {"df": melb_data_copy1_x.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "StandardScaler",  "fill_missing":  "mean"}
target_col = "Price"

NullValuesHandler().execute(params)
params["X"] = params.pop("df")
Split().train_test_split(params)
# this params has X, test_size, train_df, test_df, type
# To get: target_col, columns
params["train_df"] = params.pop('train')
params["test_df"] = params.pop('test')
params["columns"] = list(params["train_df"].columns)
params["target_col"] = target_col
Scaler().execute(params)

# print(params.keys())
y_train = params["train_df"][["Price"]]
X_train = params["train_df"].drop(target_col,axis =1)
X_test = params["test_df"].drop(target_col,axis =1)
y_test = params["test_df"][["Price"]]

print(X_train[:2])
print(y_train[:2])
print()
print(X_test[:2])
print(y_test[:2])
print()
model = RandomForestRegressor()
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
print(max(scores))
print(estimators[scores.index(max(scores))])

         Rooms  Distance  Postcode  Bedroom2  Bathroom       Car  Landsize  \
4074  0.061472  0.601463  0.914144  0.086004  0.684322  1.431324  0.176030   
4075  0.061472 -0.335258 -1.027479  1.118274 -0.771624  0.399141 -0.008076   

      BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
4074      0.025288  -0.000787  -1.576042    0.015216      -0.457171  
4075     -0.372561  -1.354584   0.139655   -1.158390      -0.547581  
          Price
4074  2200000.0
4075  1151500.0

     Rooms  Distance  Postcode  Bedroom2  Bathroom       Car  Landsize  \
0  0.07283  1.353385 -0.117757  0.093638  0.648448  0.422536  0.115897   
1  0.07283 -0.608321 -0.382191 -0.948662 -0.774157 -0.640563 -0.161600   

   BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
0     -0.008257   0.001912   1.341104    1.317252      -1.166377  
1     -0.088221  -1.584666   0.671820   -0.105729       0.324580  
      Price
0  881017.0
1  995000.0

0.609247029468338
80


In [87]:
acc, t = preprocessy_score_dataset(params = {"df": melb_data_copy1_x.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "StandardScaler",  "fill_missing":  "mean"}, target_col = "Price", no_estimators = 80)

dtf_2.loc['Preprocessy'] = [acc, t]

       Price
0   881017.0
1   995000.0
2   902000.0
3  1075000.0
4  1450000.0
[ 931225.    800743.75  724366.25 1138093.75 1658400.  ]
Mean Absolute Error: 255442.90709734496
Mean Squared Error: 164138703312.8332
Root Mean Squared Error: 405140.3501415691


## BinaryScaler

### sklearn

In [112]:
imputed_df = melb_data_copy2.select_dtypes(exclude=['object']).fillna(melb_data_copy2.select_dtypes(exclude=['object']).mean())
b_scaler = preprocessing.Binarizer()
df_b = b_scaler.fit_transform(imputed_df.drop(['Price'], axis =1))
df_b = pd.DataFrame(df_b, columns=col_names)
X_train, X_test, y_train, y_test = train_test_split(df_b, imputed_df['Price'], test_size=0.3, random_state=69)
model = RandomForestRegressor()
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
print(max(scores))
print(estimators[scores.index(max(scores))])

0.0646790724493077
10


In [113]:
b_scaler = preprocessing.Binarizer()
start = time.time()
imputed_df = melb_data_copy2.select_dtypes(exclude=['object']).fillna(melb_data_copy2.select_dtypes(exclude=['object']).mean())
df_b = b_scaler.fit_transform(imputed_df.drop(['Price'], axis =1))
df_b = pd.DataFrame(df_b, columns=col_names)
X_train, X_test, y_train, y_test = train_test_split(df_b, imputed_df['Price'], test_size=0.3, random_state=69)
model = RandomForestRegressor(n_estimators = 10)
model.fit(X_train, y_train)
sklearn_preds = model.predict(X_test)
end=time.time()
sklearn_time = np.round(end - start,4)
sklearn_accuracy = np.round(r2_score(y_test, sklearn_preds),4)
print(sklearn_preds[:5])
print('Mean Absolute Error:', mean_absolute_error(y_test, sklearn_preds))
print('Mean Squared Error:', mean_squared_error(y_test, sklearn_preds))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, sklearn_preds)))
dtf_3.loc['sklearn'] = [sklearn_accuracy, sklearn_time]

[1143008.94698582  640846.90979939 1143008.94698582 1143008.94698582
 1143008.94698582]
Mean Absolute Error: 442692.58795341593
Mean Squared Error: 405721239003.7401
Root Mean Squared Error: 636962.5098887219


### preprocessy

In [115]:
params = {"df": melb_data_copy1_x.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "BinaryScaler",  "fill_missing":  "mean"}
target_col = "Price"

NullValuesHandler().execute(params)
params["X"] = params.pop("df")
Split().train_test_split(params)
# this params has X, test_size, train_df, test_df, type
# To get: target_col, columns
params["train_df"] = params.pop('train')
params["test_df"] = params.pop('test')
params["columns"] = list(params["train_df"].columns)
params["target_col"] = target_col
Scaler().execute(params)

# print(params.keys())
y_train = params["train_df"][["Price"]]
X_train = params["train_df"].drop(target_col,axis =1)
X_test = params["test_df"].drop(target_col,axis =1)
y_test = params["test_df"][["Price"]]

print(X_train[:2])
print(y_train[:2])
print()
print(X_test[:2])
print(y_test[:2])
print()
model = RandomForestRegressor()
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    model.set_params(n_estimators=n)
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
print(max(scores))
print(estimators[scores.index(max(scores))])

      Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  \
4074      1         1         1         1         1    1         1   
4075      1         1         1         1         1    1         1   

      BuildingArea  YearBuilt  Lattitude  Longtitude  Propertycount  
4074             1          1          0           1              1  
4075             1          1          0           1              1  
          Price
4074  2200000.0
4075  1151500.0

   Rooms  Distance  Postcode  Bedroom2  Bathroom  Car  Landsize  BuildingArea  \
0      1         1         1         1         1    1         1             1   
1      1         1         1         1         1    1         1             1   

   YearBuilt  Lattitude  Longtitude  Propertycount  
0          1          0           1              1  
1          1          0           1              1  
      Price
0  881017.0
1  995000.0

0.06584096190103228
30


In [90]:
acc, t = preprocessy_score_dataset(params = {"df": melb_data_copy1_x.select_dtypes(exclude=['object']), "test_size": 0.3, "type": "BinaryScaler",  "fill_missing":  "mean"}, target_col = "Price", no_estimators = 30)

dtf_3.loc['Preprocessy'] = [acc, t]

       Price
0   881017.0
1   995000.0
2   902000.0
3  1075000.0
4  1450000.0
[1144457.12680848 1144457.12680848 1144457.12680848 1144457.12680848
 1144457.12680848]
Mean Absolute Error: 434299.1286214688
Mean Squared Error: 386468948618.0512
Root Mean Squared Error: 621666.2678785549


#### MinMaxScaler

In [101]:
dtf_1

Unnamed: 0,Accuracy,Time
Preprocessy,-0.9742,3.3462
sklearn,0.7935,5.6237


#### StandardScaler

In [111]:
dtf_2

Unnamed: 0,Accuracy,Time
Preprocessy,0.6029,4.222
sklearn,0.7925,2.1454


#### BinaryScaler

In [114]:
dtf_3

Unnamed: 0,Accuracy,Time
Preprocessy,0.0651,0.1361
sklearn,0.0643,0.0728
