## Evaluation for NullValuesHandler Class

- Evaluates the effect of preprocessy execute function on model accuracy compared to sklearn
- Evaluates on 1 dataset
    * Melbourne Housing Snapshot
- Considering a standard test size of 0.3 for all 3 cases i.e.
    * Dropping columns with null values
    * Filling null values with mean
    * Filling null values with mean considering values which were originally missing
- Using RandomForestRegressor() model
- Using r2_score of sklearn.metrics
- Comparisons between sklearn and preprocessy based on accuracy and time have been indicated at the end

In [57]:
# Put the parent directory on the import path so the local `preprocessy`
# package can be imported from inside this notebook.
import os
import sys

module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

In [58]:
import numpy as np
import pandas as pd
import time

from sklearn.datasets import load_iris, load_boston, load_breast_cancer, load_diabetes
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error,classification_report, r2_score
from sklearn.model_selection import train_test_split

from preprocessy.handlenullvalues import NullValuesHandler
from preprocessy.resampling import Split

np.random.seed(101)

## Melbourne Housing Snapshot

In [59]:
# Load the raw Melbourne Housing Snapshot once.
melb_data = pd.read_csv('../datasets/handling_null_values/melb_data.csv')

# Each experiment gets its own independent copy. Plain assignment would only
# create another name for the same DataFrame, so any change made to one
# "copy" (or to `melb_data` itself) would silently show up in all of them.
melb_data_copy1 = melb_data.copy()
melb_data_copy2 = melb_data.copy()

# One result table per strategy; a row is added per library as it is scored.
dtf_1 = pd.DataFrame(columns=['Accuracy', 'Time'])
dtf_2 = pd.DataFrame(columns=['Accuracy', 'Time'])
dtf_3 = pd.DataFrame(columns=['Accuracy', 'Time'])

melb_data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


### Number of columns with null values in the original dataset

In [60]:
# Count the columns that contain at least one null value.
melb_data.isnull().any().sum()

4

In [61]:
# Null count per column, largest first. Keep every column that actually has
# nulls instead of slicing a hard-coded number of rows off the top.
null_counts = melb_data.isnull().sum().sort_values(ascending=False)
missing_val_cnt_per_column = null_counts[null_counts > 0]

# Total number of cells in the frame (rows * columns). `np.prod` replaces
# `np.product`, which is deprecated and no longer available in NumPy 2.0.
total_val = np.prod(melb_data.shape)

# Percentage of all cells that are null.
perc = 100*(missing_val_cnt_per_column.sum())/total_val

missing_val_cnt_per_column

BuildingArea    6450
YearBuilt       5375
CouncilArea     1369
Car               62
dtype: int64

In [62]:
# Report the null-cell total, the overall cell count and the null percentage,
# one per line.
print(
    f"missing_val_cnt: {missing_val_cnt_per_column.sum()} ",
    f"total_val      : {total_val}",
    f"perc           : {perc}",
    sep="\n",
)

missing_val_cnt: 13256 
total_val      : 285180
perc           : 4.648292306613367


In [63]:
# `Price` is the regression target; every other column is a candidate predictor.
melb_target = melb_data['Price']
melb_predictors = melb_data.drop(columns=['Price'])

# Only the non-object columns are used as model inputs.
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])

## Using sklearn

In [64]:
def sklearn_score_dataset(sklearn_X_train, sklearn_X_test, sklearn_y_train, sklearn_y_test):
    """Fit a default ``RandomForestRegressor`` and score it on the test split.

    The model is trained on ``sklearn_X_train`` / ``sklearn_y_train`` and its
    predictions for ``sklearn_X_test`` are compared with ``sklearn_y_test``.

    Returns:
        The ``r2_score`` of the predictions, rounded to 4 decimal places.
    """
    forest = RandomForestRegressor()
    forest.fit(sklearn_X_train, sklearn_y_train)
    predictions = forest.predict(sklearn_X_test)
    return np.round(r2_score(sklearn_y_test, predictions), 4)

### Dropping columns with null values

In [65]:
start = time.time()

# 70/30 split with a fixed seed; the resulting partitions are reused by the
# sklearn imputation cells further down.
X_train, X_test, y_train, y_test = train_test_split(
    melb_numeric_predictors,
    melb_target,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

# Columns that contain at least one null in the training partition.
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

# Remove those columns from both partitions before fitting.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_test = X_test.drop(columns=cols_with_missing)

acc = sklearn_score_dataset(reduced_X_train, reduced_X_test, y_train, y_test)

# Elapsed time covers the split, the column drop, fitting and scoring.
end = time.time()
sklearn_time = np.round(end - start, 4)

dtf_1.loc['sklearn'] = [acc, sklearn_time]

### Imputation by default fills in the missing value with the mean value

In [66]:
start = time.time()

# Learn the fill values from the training partition only, then apply the same
# fitted imputer to both partitions.
my_imputer = SimpleImputer().fit(X_train)
imputed_X_train = my_imputer.transform(X_train)
imputed_X_test = my_imputer.transform(X_test)

acc = sklearn_score_dataset(imputed_X_train, imputed_X_test, y_train, y_test)

# Elapsed time covers imputation, fitting and scoring.
end = time.time()
sklearn_time = np.round(end - start, 4)

dtf_2.loc['sklearn'] = [acc, sklearn_time]

### Model can make better predictions by considering which values were originally missing.

In [67]:
# Start from copies so the shared train/test partitions stay untouched.
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()

# A real list (not a one-shot generator), so the name still holds the column
# names after the loop and matches how `cols_with_missing` is built elsewhere
# in this notebook.
cols_with_missing = [col for col in X_train.columns if X_train[col].isnull().any()]

# Add one boolean indicator column per affected column. The flags are read
# from the untouched source partitions, which hold the same values as the
# copies at this point.
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = X_train[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = X_test[col].isnull()

# Imputation (timed from here; building the indicator columns is not timed)
start = time.time()

my_imputer = SimpleImputer()

# Fit on the training partition, then reuse the fitted imputer on the test one.
imputed_X_train_plus = my_imputer.fit_transform(imputed_X_train_plus)
imputed_X_test_plus = my_imputer.transform(imputed_X_test_plus)

acc = sklearn_score_dataset(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test)

end = time.time()
sklearn_time = np.round(end - start, 4)

dtf_3.loc['sklearn'] = [acc, sklearn_time]

## Using preprocessy

In [68]:
# Names of the columns holding at least one null, in the frame's column order.
cols_with_missing = melb_data_copy2.columns[melb_data_copy2.isnull().any()].tolist()
cols_with_missing

['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']

In [69]:
def preprocessy_score_dataset(params):
    """Apply preprocessy null handling to ``params`` and score a random forest.

    ``NullValuesHandler().execute`` is run on ``params`` first. The frame found
    under ``params["df"]`` afterwards is separated into the ``Price`` target
    and the non-object predictor columns, partitioned with preprocessy's
    ``Split`` using a ``test_size`` of 0.3, and used to fit and evaluate a
    default ``RandomForestRegressor``.

    Args:
        params: Dict handed to ``NullValuesHandler().execute``; it must hold
            the DataFrame under ``"df"`` and that frame must have a ``Price``
            column.

    Returns:
        tuple: ``(accuracy, seconds)`` where ``accuracy`` is the ``r2_score``
        on the test partition and ``seconds`` is the elapsed time from just
        before null handling until prediction has finished, both rounded to
        4 decimal places.
    """
    started = time.time()

    NullValuesHandler().execute(params)

    frame = params["df"]
    target = frame.Price
    numeric_predictors = frame.drop(['Price'], axis=1).select_dtypes(exclude=['object'])

    # The partitions are read back from this same dict after the split call.
    split_params = {"X": numeric_predictors, "y": target, "test_size": 0.3}
    Split().train_test_split(split_params)

    forest = RandomForestRegressor()
    forest.fit(split_params["X_train"], split_params["y_train"])
    predictions = forest.predict(split_params["X_test"])

    # The clock stops before scoring, so the r2 computation is not timed.
    finished = time.time()
    elapsed = np.round(finished - started, 4)

    score = np.round(r2_score(split_params["y_test"], predictions), 4)
    return score, elapsed

## Using preprocessy
- Dropping columns with null values

In [70]:
# Every column of the frame that holds at least one null value.
cols_with_missing = melb_data_copy2.columns[melb_data_copy2.isnull().any()].tolist()

# Ask preprocessy to drop exactly those columns, then fit and score.
params = dict(df=melb_data_copy2, drop=True, column_list=cols_with_missing)
acc, t = preprocessy_score_dataset(params)

dtf_1.loc['Preprocessy'] = [acc, t]

## Using preprocessy for imputation
- filling missing values with mean

In [71]:
# Mean-fill the missing values with preprocessy, then fit and score.
params = dict(df=melb_data_copy1, fill_missing="mean")
acc, t = preprocessy_score_dataset(params)

dtf_2.loc['Preprocessy'] = [acc, t]

## Using preprocessy for imputation
- filling missing values with mean and improving the model by considering values which were originally missing

In [72]:
# Work on a private copy so the indicator columns are never written into the
# raw `melb_data` frame; this keeps the cell safe to re-run and leaves the raw
# data unchanged for any other cell that reads it.
melb_data_tracked = melb_data.copy()

# Build the column list up front (a real list, not a lazy generator) so the
# frame is not being scanned while new columns are added to it.
cols_with_missing = [
    col for col in melb_data_tracked.columns if melb_data_tracked[col].isnull().any()
]

# One boolean indicator column per column that has nulls.
# NOTE(review): this scans every column of the frame, whereas the sklearn
# variant of this experiment only flags the numeric predictors - confirm the
# two feature sets are meant to differ.
for col in cols_with_missing:
    melb_data_tracked[col + '_was_missing'] = melb_data_tracked[col].isnull()

params = {"df": melb_data_tracked, "fill_missing": "mean"}

acc, t = preprocessy_score_dataset(params)

dtf_3.loc['Preprocessy'] = [acc, t]

## Comparison for case where we drop columns with null values

In [73]:
# Accuracy and time recorded for the drop-columns strategy, one row per library.
dtf_1

Unnamed: 0,Accuracy,Time
sklearn,0.741,4.2711
Preprocessy,0.7768,3.5912


## Comparison when Imputation without tracking is done

In [74]:
# Accuracy and time recorded for mean imputation without tracking, one row per library.
dtf_2

Unnamed: 0,Accuracy,Time
sklearn,0.7658,5.2105
Preprocessy,0.7892,5.0117


## Comparison when Imputation is done while tracking

In [75]:
# Accuracy and time recorded for mean imputation with `_was_missing` indicator columns, one row per library.
dtf_3

Unnamed: 0,Accuracy,Time
sklearn,0.7661,4.7758
Preprocessy,0.7914,5.1065
