In [1]:
# Notebooks do not run from the package root, so put the parent directory
# on sys.path; this is what makes the preprocessy imports below resolvable.
import os
import sys
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd
import time
from random import randint
from sklearn.datasets import make_classification
from sklearn.feature_selection import chi2, mutual_info_classif, mutual_info_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from preprocessy.feature_selection import Correlation,SelectKBest
from preprocessy.resampling import Split

In [3]:
def without_preprocessing(X,y):
    """Train and score a RandomForestClassifier on the features as given.

    Parameters
    ----------
    X : feature table (the benchmark loop passes a pandas DataFrame).
    y : target values (the benchmark loop passes a pandas Series).

    Returns
    -------
    tuple
        ``(accuracy, elapsed)`` where ``accuracy`` is the "accuracy" entry of
        ``classification_report`` on the held-out split and ``elapsed`` is the
        number of seconds spent on splitting, fitting, predicting and scoring.
    """
    # perf_counter is a monotonic clock intended for measuring intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start = time.perf_counter()
    model = RandomForestClassifier()
    X_train, X_test, y_train, y_test = Split().train_test_split(X, y, test_size=0.1)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    report = classification_report(y_test,preds,output_dict=True)
    accuracy_1 = report["accuracy"]
    elapsed = time.perf_counter() - start
    return accuracy_1, elapsed

In [4]:
def post_preprocessing(X,y,score_func=None,k=40):
    """Select the k best features, then train and score a RandomForestClassifier.

    Parameters
    ----------
    X : feature table (the benchmark loop passes a pandas DataFrame).
    y : target values (the benchmark loop passes a pandas Series).
    score_func : callable, optional
        Scoring function handed to ``SelectKBest``. When ``None`` the selector
        is built without it, so the selector's own default applies.
        NOTE(review): assumes SelectKBest accepts a ``score_func`` keyword -
        confirm against the preprocessy version in use.
    k : int, default 40
        Number of features to keep; passed to ``SelectKBest``.

    Returns
    -------
    tuple
        ``(accuracy, elapsed)`` where ``accuracy`` is the "accuracy" entry of
        ``classification_report`` on the held-out split and ``elapsed`` is the
        number of seconds spent on splitting, fitting, predicting and scoring.
    """
    # Forward score_func only when the caller supplied one; previously the
    # argument was accepted but silently ignored.
    selector_kwargs = {"k": k}
    if score_func is not None:
        selector_kwargs["score_func"] = score_func
    kbest = SelectKBest(**selector_kwargs)
    X_new = kbest.fit_transform(X,y)
    # NOTE(review): the timer starts after feature selection, so the reported
    # time excludes the cost of the selection step itself.
    # perf_counter is a monotonic clock intended for measuring intervals.
    start = time.perf_counter()
    model = RandomForestClassifier()
    X_train, X_test, y_train, y_test = Split().train_test_split(X_new, y, test_size=0.1)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    report = classification_report(y_test,preds,output_dict=True)
    accuracy_2 = report["accuracy"]
    elapsed = time.perf_counter() - start
    return accuracy_2, elapsed

In [5]:
# Running totals for the benchmark: o_* accumulate the runs without
# preprocessing, u_* the runs after feature selection.
o_acc = o_time = u_acc = u_time = 0

In [6]:
# Reset the running totals in this cell so that re-running it starts from
# zero instead of adding another 100 runs on top of the previous sums.
o_acc, o_time, u_acc, u_time = 0,0,0,0
# 100 iterations: the summary cell divides the totals by the same number.
for _ in range(100):
    # Fresh synthetic dataset per run, with a random size and feature count.
    X_class, y_class = make_classification(n_samples=randint(1000,10000), n_features=randint(50,100))
    X_class = pd.DataFrame(X_class)
    y_class = pd.Series(y_class,name='Target')
    # Baseline: model trained on all features.
    acc, elapsed = without_preprocessing(X_class,y_class)
    o_acc += acc
    o_time += elapsed
    # Same data after feature selection.
    acc, elapsed = post_preprocessing(X_class,y_class)
    u_acc += acc
    u_time += elapsed

In [7]:
# Report the mean accuracy and mean elapsed time per run for both pipelines.
# The divisor 100 must match the iteration count of the benchmark loop.
print(f"Without Preprocessing\n\nAccuracy - {(o_acc/100):.4f}\nTime - {(o_time/100):.4f}\n")
print(f"Post Preprocessing\n\nAccuracy - {(u_acc/100):.4f}\nTime - {(u_time/100):.4f}")

Without Preprocessing

Accuracy - 0.9296
Time - 3.5947

Post Preprocessing

Accuracy - 0.9291
Time - 2.4966
