## Evaluation for Split Class

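- Evaluates the effect of preprocessy's `Split().train_test_split` on model performance, compared to sklearn's `train_test_split`
- Evaluates on 4 datasets: iris and breast cancer (classification accuracy), diabetes and boston (regression $R^2$, printed under the label `accuracy`)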

In [32]:
# To access preprocessy module. Required in .ipynb files
import os
import sys
# Absolute path of the parent of the current working directory; appended to
# the import path (once) so the preprocessy package can be imported from here.
module_path = os.path.abspath("..")
if not (module_path in sys.path):
    sys.path.append(module_path)

In [33]:
import numpy as np
import pandas as pd
import time
from sklearn.datasets import load_iris, load_boston, load_breast_cancer, load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, r2_score
from sklearn.model_selection import train_test_split
from preprocessy.resampling import Split

In [34]:
# Values passed as `test_size` to both splitters. `None` means "no explicit
# size": the evaluation helpers then report each library's default instead.
splits = [None, 0.2, 0.3]

In [35]:
def preprocessy_eval(X, y, split, model):
    """Split ``(X, y)`` with preprocessy, fit ``model`` and predict on the test part.

    Parameters
    ----------
    X : feature table; must expose ``.columns`` (read when ``split`` is falsy).
    y : targets, split alongside ``X``.
    split : value forwarded as ``test_size`` to ``Split().train_test_split``.
    model : object with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    tuple
        ``(reported_test_size, predictions, y_test)``. ``reported_test_size``
        is ``split`` when that is truthy, otherwise ``1 / sqrt(n_columns)``.
    """
    features_train, features_test, target_train, target_test = Split().train_test_split(
        X, y, test_size=split
    )
    # NOTE(review): the fallback is presumably preprocessy's own default test
    # size; it is recomputed here rather than read back from the splitter, so
    # confirm it against the library.
    reported_size = split if split else 1 / np.sqrt(len(X.columns))
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    return reported_size, predictions, target_test

In [36]:
def sklearn_eval(X, y, split, model):
    """Split ``(X, y)`` with sklearn, fit ``model`` and predict on the test part.

    Parameters
    ----------
    X : features to split.
    y : targets, split alongside ``X``.
    split : value forwarded as ``test_size`` to ``train_test_split``.
    model : object with ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    tuple
        ``(reported_test_size, predictions, y_test)``. ``reported_test_size``
        is ``split`` when that is truthy, otherwise ``0.25``.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, test_size=split, random_state=69
    )
    # 0.25 is the default test size given in the sklearn docs.
    reported_size = split if split else 0.25
    model.fit(features_train, target_train)
    predictions = model.predict(features_test)
    return reported_size, predictions, target_test

In [37]:
def eval(X, y, dataset, model):
    """Compare both splitters on a classification dataset and print the results.

    For every entry of the module-level ``splits`` list, runs
    ``preprocessy_eval`` and ``sklearn_eval`` on ``(X, y)`` with the same
    ``model`` instance, then prints the reported test size, the accuracy taken
    from ``classification_report`` and the elapsed time of each run.

    Parameters
    ----------
    X : features passed through to both evaluation helpers.
    y : class labels passed through to both evaluation helpers.
    dataset : name printed in the ``Dataset - ...`` header.
    model : classifier with ``fit``/``predict``; refitted by each helper call.

    Notes
    -----
    The elapsed time covers the whole helper call (split, fit and predict),
    not the split alone.
    """
    # NOTE(review): this name shadows the builtin ``eval``; it is kept because
    # other cells call it by this name.
    print(f"Dataset - {dataset}\n")
    for split in splits:
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which follows the (adjustable) wall clock.
        start = time.perf_counter()
        preprocessy_test_size, preprocessy_preds, preprocessy_y_test = preprocessy_eval(
            X, y, split, model
        )
        preprocessy_time = time.perf_counter() - start

        start = time.perf_counter()
        sklearn_test_size, sklearn_preds, sklearn_y_test = sklearn_eval(
            X, y, split, model
        )
        sklearn_time = time.perf_counter() - start

        preprocessy_accuracy = classification_report(
            preprocessy_y_test, preprocessy_preds, output_dict=True
        )["accuracy"]
        sklearn_accuracy = classification_report(
            sklearn_y_test, sklearn_preds, output_dict=True
        )["accuracy"]

        print(
            f"Preprocessy\n-----------\n\ntest_size - {preprocessy_test_size}\naccuracy - {preprocessy_accuracy:.4f}\ntime - {preprocessy_time:.4f}\n"
        )
        print(f"Sklearn\n-------\n\ntest_size - {sklearn_test_size}\naccuracy - {sklearn_accuracy:.4f}\ntime - {sklearn_time:.4f}\n")


In [38]:
def evaluate_on_iris():
    """Run the classification comparison (``eval``) on the iris dataset.

    Loads iris as pandas objects and evaluates a ``RandomForestClassifier``
    under both splitters.
    """
    # Seed the forest (same seed sklearn_eval uses for its split) so that
    # differences between runs are not caused by the model's own randomness.
    model = RandomForestClassifier(random_state=69)
    X, y = load_iris(return_X_y=True, as_frame=True)
    eval(X, y, "iris", model)

In [39]:
def evaluate_on_breast_cancer():
    """Run the classification comparison (``eval``) on the breast cancer dataset.

    Loads the dataset as pandas objects and evaluates a
    ``RandomForestClassifier`` under both splitters.
    """
    # Seed the forest (same seed sklearn_eval uses for its split) so that
    # differences between runs are not caused by the model's own randomness.
    model = RandomForestClassifier(random_state=69)
    X, y = load_breast_cancer(return_X_y=True, as_frame=True)
    eval(X, y, "breast cancer", model)

In [40]:
def evaluate_on_diabetes():
    """Compare both splitters on the diabetes regression dataset and print results.

    For every entry of the module-level ``splits`` list, runs
    ``preprocessy_eval`` and ``sklearn_eval`` with one shared
    ``LinearRegression`` instance and prints the reported test size, the score
    and the elapsed time of each run.

    Notes
    -----
    The value printed under the label ``accuracy`` is the R^2 score
    (``r2_score``), not a classification accuracy. The elapsed time covers the
    whole helper call (split, fit and predict).
    """
    print("Dataset - diabetes")
    # `normalize=True` is deprecated in scikit-learn 1.0 and removed in 1.2, so
    # it is not passed. For ordinary least squares the rescaling should not
    # change predictions — NOTE(review): confirm scores match the saved output.
    model = LinearRegression(fit_intercept=True, copy_X=True)
    X, y = load_diabetes(return_X_y=True, as_frame=True)
    for split in splits:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        preprocessy_test_size, preprocessy_preds, preprocessy_y_test = preprocessy_eval(
            X, y, split, model
        )
        preprocessy_time = time.perf_counter() - start

        start = time.perf_counter()
        sklearn_test_size, sklearn_preds, sklearn_y_test = sklearn_eval(
            X, y, split, model
        )
        sklearn_time = time.perf_counter() - start

        preprocessy_accuracy = r2_score(preprocessy_y_test, preprocessy_preds)
        sklearn_accuracy = r2_score(sklearn_y_test, sklearn_preds)
        print(
            f"Preprocessy\n-----------\n\ntest_size - {preprocessy_test_size}\naccuracy - {preprocessy_accuracy:.4f}\ntime - {preprocessy_time:.4f}\n"
        )
        print(f"Sklearn\n-------\n\ntest_size - {sklearn_test_size}\naccuracy - {sklearn_accuracy:.4f}\ntime - {sklearn_time:.4f}\n")

In [41]:
def evaluate_on_boston():
    """Compare both splitters on the boston regression dataset and print results.

    Builds a DataFrame/Series pair from ``load_boston()``, then for every entry
    of the module-level ``splits`` list runs ``preprocessy_eval`` and
    ``sklearn_eval`` with one shared ``LinearRegression`` instance and prints
    the reported test size, the score and the elapsed time of each run.

    Notes
    -----
    The value printed under the label ``accuracy`` is the R^2 score
    (``r2_score``), not a classification accuracy. The elapsed time covers the
    whole helper call (split, fit and predict).
    ``load_boston`` was removed in scikit-learn 1.2, so this function needs an
    older release.
    """
    print("Dataset - boston")
    # `normalize=True` is deprecated in scikit-learn 1.0 (removed in 1.2), so it
    # is not passed. For ordinary least squares the rescaling should not
    # change predictions — NOTE(review): confirm scores match the saved output.
    model = LinearRegression(fit_intercept=True, copy_X=True)
    dataset = load_boston()
    X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    y = pd.Series(dataset.target, name="Target")
    for split in splits:
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        preprocessy_test_size, preprocessy_preds, preprocessy_y_test = preprocessy_eval(
            X, y, split, model
        )
        preprocessy_time = time.perf_counter() - start

        start = time.perf_counter()
        sklearn_test_size, sklearn_preds, sklearn_y_test = sklearn_eval(
            X, y, split, model
        )
        sklearn_time = time.perf_counter() - start

        preprocessy_accuracy = r2_score(preprocessy_y_test, preprocessy_preds)
        sklearn_accuracy = r2_score(sklearn_y_test, sklearn_preds)

        print(
            f"Preprocessy\n-----------\n\ntest_size - {preprocessy_test_size}\naccuracy - {preprocessy_accuracy:.4f}\ntime - {preprocessy_time:.4f}\n"
        )
        print(f"Sklearn\n-------\n\ntest_size - {sklearn_test_size}\naccuracy - {sklearn_accuracy:.4f}\ntime - {sklearn_time:.4f}\n")

In [42]:
# Classification comparison on iris; one Preprocessy/Sklearn report per entry of `splits`.
evaluate_on_iris()

Dataset - iris

Preprocessy
-----------

test_size - 0.5
accuracy - 0.9600
time - 0.1451

Sklearn
-------

test_size - 0.25
accuracy - 0.9737
time - 0.1307

Preprocessy
-----------

test_size - 0.2
accuracy - 0.9667
time - 0.1249

Sklearn
-------

test_size - 0.2
accuracy - 0.9667
time - 0.1285

Preprocessy
-----------

test_size - 0.3
accuracy - 0.9778
time - 0.1223

Sklearn
-------

test_size - 0.3
accuracy - 0.9778
time - 0.1292



In [43]:
# Classification comparison on breast cancer; one Preprocessy/Sklearn report per entry of `splits`.
evaluate_on_breast_cancer()

Dataset - breast cancer

Preprocessy
-----------

test_size - 0.18257418583505536
accuracy - 0.9515
time - 0.1899

Sklearn
-------

test_size - 0.25
accuracy - 0.9650
time - 0.1732

Preprocessy
-----------

test_size - 0.2
accuracy - 0.9469
time - 0.1711

Sklearn
-------

test_size - 0.2
accuracy - 0.9474
time - 0.2014

Preprocessy
-----------

test_size - 0.3
accuracy - 0.9706
time - 0.2178

Sklearn
-------

test_size - 0.3
accuracy - 0.9474
time - 0.2177



In [44]:
# Regression comparison on diabetes; the printed "accuracy" is the R^2 score.
evaluate_on_diabetes()

Dataset - diabetes
Preprocessy
-----------

test_size - 0.31622776601683794
accuracy - 0.4350
time - 0.0049

Sklearn
-------

test_size - 0.25
accuracy - 0.4405
time - 0.0034

Preprocessy
-----------

test_size - 0.2
accuracy - 0.5065
time - 0.0070

Sklearn
-------

test_size - 0.2
accuracy - 0.5145
time - 0.0072

Preprocessy
-----------

test_size - 0.3
accuracy - 0.4482
time - 0.0070

Sklearn
-------

test_size - 0.3
accuracy - 0.4509
time - 0.0039



In [45]:
# Regression comparison on boston; the printed "accuracy" is the R^2 score.
evaluate_on_boston()

Dataset - boston
Preprocessy
-----------

test_size - 0.2773500981126146
accuracy - 0.6891
time - 0.0153

Sklearn
-------

test_size - 0.25
accuracy - 0.6722
time - 0.0066

Preprocessy
-----------

test_size - 0.2
accuracy - 0.6752
time - 0.0074

Sklearn
-------

test_size - 0.2
accuracy - 0.6747
time - 0.0035

Preprocessy
-----------

test_size - 0.3
accuracy - 0.6755
time - 0.0054

Sklearn
-------

test_size - 0.3
accuracy - 0.6927
time - 0.0053

