## Evaluation for Split Class

- Evaluates the effect of preprocessy train test split function on model accuracy compared to sklearn train test split
- Evaluates on 4 datasets
    * iris
    * boston
    * breast_cancer
    * diabetes
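- Using RandomForestClassifier() model on the 2 classification datasets (iris, breast_cancer)
- Using LinearRegression() model on the 2 regression datasets (boston, diabetes)
- Using accuracy (from classification_report of sklearn.metrics) for the classification datasets and r2_score of sklearn.metrics for the regression datasets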
- Comparisons between sklearn and preprocessy based on accuracy and time for different test sizes have been indicated at the end

In [1]:
# To access preprocessy module. Required in .ipynb files
import os
import sys
# Resolve the parent directory to an absolute path and register it on the
# import path only when it is missing, so re-running this cell adds no duplicates.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd
import time

from sklearn.datasets import load_iris, load_boston, load_breast_cancer, load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, r2_score
from sklearn.model_selection import train_test_split

from preprocessy.resampling import Split

In [22]:
# Test-size settings compared for every dataset. Each value is forwarded as
# `test_size`; for None the eval helpers report a fallback size instead
# (1/sqrt(n_columns) for preprocessy, 0.25 for sklearn).
splits = [None, 0.2, 0.3]

In [23]:
def preprocessy_eval(X, y, split, model):
    """Split the data with preprocessy's ``Split``, fit ``model`` and predict.

    Parameters
    ----------
    X : object exposing ``.columns``
        Features, passed to ``Split().train_test_split``.
    y : object
        Targets, passed to the same call.
    split : float or None
        Forwarded as ``test_size``.
    model : object with ``fit`` and ``predict``
        Fitted on the train part and used to predict on the test part.

    Returns
    -------
    tuple
        ``(test_size, predictions, y_test)`` where ``test_size`` is ``split``
        when it is truthy and ``1 / sqrt(number of columns in X)`` otherwise.
    """
    train_X, test_X, train_y, test_y = Split().train_test_split(
        X, y, test_size=split
    )

    # For a falsy split the reported size is 1 / sqrt(n_columns).
    # NOTE(review): presumably this mirrors Split's own default test size - confirm.
    preprocessy_test_size = split if split else 1 / np.sqrt(len(X.columns))

    model.fit(train_X, train_y)
    return preprocessy_test_size, model.predict(test_X), test_y

In [24]:
def sklearn_eval(X, y, split, model):
    """Split the data with sklearn's ``train_test_split``, fit ``model`` and predict.

    Parameters
    ----------
    X, y : objects
        Features and targets, passed to ``train_test_split``.
    split : float or None
        Forwarded as ``test_size``; the split always uses ``random_state=69``.
    model : object with ``fit`` and ``predict``
        Fitted on the train part and used to predict on the test part.

    Returns
    -------
    tuple
        ``(test_size, predictions, y_test)`` where ``test_size`` is ``split``
        when it is truthy and ``0.25`` otherwise.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X, y, test_size=split, random_state=69
    )

    # 0.25 is the default test size quoted from the sklearn docs.
    sklearn_test_size = split or 0.25

    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    return sklearn_test_size, predictions, test_y

In [25]:
def eval(X, y, dataset, model):
    """Print a preprocessy-vs-sklearn comparison for a classification dataset.

    For every entry of the module-level ``splits`` list this runs
    ``preprocessy_eval`` and ``sklearn_eval`` with the same ``model``, times
    each call, and prints a two-row table (rows ``Preprocessy`` / ``sklearn``)
    with the columns ``Test_size``, ``Accuracy`` and ``Time``.

    Parameters
    ----------
    X, y : objects
        Features and targets, forwarded unchanged to both helpers.
    dataset : str
        Label printed in the ``Dataset - ...`` header line.
    model : object
        Estimator forwarded to both helpers.

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    The name shadows the builtin ``eval``; it is kept because the
    ``evaluate_on_*`` functions call it under this name.
    """
    print(f"Dataset - {dataset}\n")
    for split in splits:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short intervals; time.time() follows the wall clock, which
        # can be adjusted mid-run and may tick too coarsely for sub-second runs.
        # The timed span is the whole helper call (split, fit and predict).
        start = time.perf_counter()
        preprocessy_test_size, preprocessy_preds, preprocessy_y_test = preprocessy_eval(
            X, y, split, model
        )
        preprocessy_time = np.round(time.perf_counter() - start, 4)

        start = time.perf_counter()
        sklearn_test_size, sklearn_preds, sklearn_y_test = sklearn_eval(
            X, y, split, model
        )
        sklearn_time = np.round(time.perf_counter() - start, 4)

        # Accuracy is read from the dict form of classification_report.
        preprocessy_accuracy = np.round(
            classification_report(
                preprocessy_y_test, preprocessy_preds, output_dict=True
            )["accuracy"],
            4,
        )
        sklearn_accuracy = np.round(
            classification_report(
                sklearn_y_test, sklearn_preds, output_dict=True
            )["accuracy"],
            4,
        )

        dt = {
            'Test_size': [preprocessy_test_size, sklearn_test_size],
            'Accuracy': [preprocessy_accuracy, sklearn_accuracy],
            'Time': [preprocessy_time, sklearn_time],
        }
        print(pd.DataFrame(dt, index=['Preprocessy', 'sklearn']))
        print()


In [40]:
def eval2(X, y, dataset, model):
    """Print a preprocessy-vs-sklearn comparison for a regression dataset.

    For every entry of the module-level ``splits`` list this runs
    ``preprocessy_eval`` and ``sklearn_eval`` with the same ``model``, times
    each call, and prints a two-row table (rows ``Preprocessy`` / ``sklearn``)
    with the columns ``Test_size``, ``Accuracy`` and ``Time``. The value in the
    ``Accuracy`` column is the R^2 score returned by ``r2_score``.

    Parameters
    ----------
    X, y : objects
        Features and targets, forwarded unchanged to both helpers.
    dataset : str
        Label printed in the ``Dataset - ...`` header line.
    model : object
        Estimator forwarded to both helpers.

    Returns
    -------
    None
        Results are only printed.
    """
    print(f"Dataset - {dataset}\n")
    for split in splits:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short intervals; time.time() follows the wall clock, which
        # can be adjusted mid-run and may tick too coarsely for sub-second runs.
        # The timed span is the whole helper call (split, fit and predict).
        start = time.perf_counter()
        preprocessy_test_size, preprocessy_preds, preprocessy_y_test = preprocessy_eval(
            X, y, split, model
        )
        preprocessy_time = np.round(time.perf_counter() - start, 4)

        start = time.perf_counter()
        sklearn_test_size, sklearn_preds, sklearn_y_test = sklearn_eval(
            X, y, split, model
        )
        sklearn_time = np.round(time.perf_counter() - start, 4)

        preprocessy_accuracy = np.round(r2_score(preprocessy_y_test, preprocessy_preds), 4)
        sklearn_accuracy = np.round(r2_score(sklearn_y_test, sklearn_preds), 4)

        dt = {
            'Test_size': [preprocessy_test_size, sklearn_test_size],
            'Accuracy': [preprocessy_accuracy, sklearn_accuracy],
            'Time': [preprocessy_time, sklearn_time],
        }
        print(pd.DataFrame(dt, index=['Preprocessy', 'sklearn']))
        print()

In [41]:
def evaluate_on_iris():
    """Run the classification comparison (``eval``) on iris with a random forest."""
    classifier = RandomForestClassifier()
    iris_features, iris_target = load_iris(return_X_y=True, as_frame=True)
    eval(iris_features, iris_target, "iris", classifier)

In [42]:
def evaluate_on_breast_cancer():
    """Run the classification comparison (``eval``) on breast cancer with a random forest."""
    classifier = RandomForestClassifier()
    cancer_features, cancer_target = load_breast_cancer(return_X_y=True, as_frame=True)
    eval(cancer_features, cancer_target, "breast cancer", classifier)

In [43]:
def evaluate_on_diabetes():
    """Run the regression comparison (``eval2``) on diabetes with linear regression."""
    # NOTE(review): `normalize` was deprecated in scikit-learn 1.0 and removed
    # in 1.2 - confirm the scikit-learn version this notebook is pinned to.
    regressor = LinearRegression(
        fit_intercept=True,
        normalize=True,
        copy_X=True,
    )
    diabetes_features, diabetes_target = load_diabetes(return_X_y=True, as_frame=True)
    eval2(diabetes_features, diabetes_target, "diabetes", regressor)

In [44]:
def evaluate_on_boston():
    """Run the regression comparison (``eval2``) on boston with linear regression.

    The loaded dataset's ``data``/``feature_names`` are wrapped in a DataFrame
    and its ``target`` in a Series named ``"Target"`` before evaluation.
    """
    # NOTE(review): `normalize` and `load_boston` were both deprecated in
    # scikit-learn 1.0 and removed in 1.2 - confirm the pinned version.
    regressor = LinearRegression(
        fit_intercept=True,
        normalize=True,
        copy_X=True,
    )
    boston = load_boston()
    features = pd.DataFrame(boston.data, columns=boston.feature_names)
    target = pd.Series(boston.target, name="Target")
    eval2(features, target, "boston", regressor)

In [45]:
evaluate_on_iris()

Dataset - iris

             Test_size  Accuracy    Time
Preprocessy       0.50    0.9600  0.1257
sklearn           0.25    0.9737  0.1237

             Test_size  Accuracy    Time
Preprocessy        0.2    0.9667  0.1516
sklearn            0.2    0.9667  0.1307

             Test_size  Accuracy    Time
Preprocessy        0.3    0.9778  0.1417
sklearn            0.3    0.9778  0.1506



In [46]:
evaluate_on_breast_cancer()

Dataset - breast cancer

             Test_size  Accuracy    Time
Preprocessy   0.182574    0.9515  0.2145
sklearn       0.250000    0.9650  0.1855

             Test_size  Accuracy    Time
Preprocessy        0.2    0.9469  0.1845
sklearn            0.2    0.9474  0.1805

             Test_size  Accuracy    Time
Preprocessy        0.3    0.9706  0.1575
sklearn            0.3    0.9474  0.1636



In [47]:
evaluate_on_diabetes()

Dataset - diabetes

             Test_size  Accuracy   Time
Preprocessy   0.316228    0.4350  0.009
sklearn       0.250000    0.4405  0.004

             Test_size  Accuracy    Time
Preprocessy        0.2    0.5065  0.0149
sklearn            0.2    0.5145  0.0080

             Test_size  Accuracy   Time
Preprocessy        0.3    0.4482  0.011
sklearn            0.3    0.4509  0.004



In [48]:
evaluate_on_boston()

Dataset - boston

             Test_size  Accuracy   Time
Preprocessy    0.27735    0.6891  0.009
sklearn        0.25000    0.6722  0.007

             Test_size  Accuracy    Time
Preprocessy        0.2    0.6752  0.0119
sklearn            0.2    0.6747  0.0060

             Test_size  Accuracy   Time
Preprocessy        0.3    0.6755  0.006
sklearn            0.3    0.6927  0.004

