In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
import random
import time
import tracemalloc
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_squared_error  
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Dataset Analysis

In [3]:
df = pd.read_csv('accelerometer_modified.csv')

In [4]:
df.shape

(153000, 6)

In [5]:
df.head()

Unnamed: 0,wconfid,pctid,x,y,z,random
0,1,20,1.004,0.09,-0.125,11
1,1,20,1.004,-0.043,-0.125,2
2,1,20,0.969,0.09,-0.121,75
3,1,20,0.973,-0.012,-0.137,41
4,1,20,1.0,-0.016,-0.121,49


In [6]:
df.isnull().sum()

wconfid      0
pctid        0
x          765
y            0
z            0
random       0
dtype: int64

## Preprocessy Pipeline

In [5]:
from preprocessy.pipelines import StandardPipeline

In [9]:
def test_preprocessy_pipeline(random_state_num):
    params = {
        'target_label':"z",
        'cat_cols':["wconfid","pctid"],
        'drop_cols':["random"],
        'fill_missing':{"mean":["x"]},
        'first_quartile':0.01,
        'third_quartile':0.99,
        'columns':["x","y"],
        'type':"StandardScaler",
        'one_hot':True,
        'out_cols':["x","y"],
        'test_size':0.1,
        'shuffle':True,
        'random_state':random_state_num,
        'n_splits':2
    }
    p = StandardPipeline(train_df_path="accelerometer_modified.csv", params=params)
    res = p.process()
    params['X_train'] = params['X_train'].drop(columns=["wconfid","pctid"])
    params['X_test'] = params['X_test'].drop(columns=["wconfid","pctid"])
    reg = LinearRegression().fit(params['X_train'], params['y_train'])
    return mean_squared_error(reg.predict(params['X_test']), params['y_test'])

In [10]:
average_time = []
average_mem = []
average_mse = []
for i in range(100):
    tracemalloc.start()
    begin = time.time()
    mse = test_preprocessy_pipeline(i)
    end = time.time()
    current,peak=tracemalloc.get_traced_memory()
    tracemalloc.clear_traces()
    tracemalloc.stop()
    average_time.append(end-begin)
    average_mem.append(peak)
    average_mse.append(mse)


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                         

+-----------------------+---------------------------------------------------------------------------------------+
Processing...
==> Completed Stage: Read file
==> Completed Stage: Parse dataset
==> Completed Stage: Execute
==> Completed Stage: Encode
==> Completed Stage: Handle outliers
==> Completed Stage: Execute
==> Completed Stage: Train test split
Pipeline Stages |████████████████████████████████████████| 7/7 [100%] in 0.4s (19.93/s)

Pipeline Completed Successfully


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None  


+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                                                           

+-----------------------+---------------------------------------------------------------------------------------+
Processing...
==> Completed Stage: Read file
==> Completed Stage: Parse dataset
==> Completed Stage: Execute
==> Completed Stage: Encode
==> Completed Stage: Handle outliers
==> Completed Stage: Execute
==> Completed Stage: Train test split
Pipeline Stages |████████████████████████████████████████| 7/7 [100%] in 0.3s (20.63/s)

Pipeline Completed Successfully


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None  


+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                                                           

+-----------------------+---------------------------------------------------------------------------------------+
Processing...
==> Completed Stage: Read file
==> Completed Stage: Parse dataset
==> Completed Stage: Execute
==> Completed Stage: Encode
==> Completed Stage: Handle outliers
==> Completed Stage: Execute
==> Completed Stage: Train test split
Pipeline Stages |████████████████████████████████████████| 7/7 [100%] in 0.4s (16.54/s)

Pipeline Completed Successfully


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None  


+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                                                           

+-----------------------+---------------------------------------------------------------------------------------+
Processing...
==> Completed Stage: Read file
==> Completed Stage: Parse dataset
==> Completed Stage: Execute
==> Completed Stage: Encode
==> Completed Stage: Handle outliers
==> Completed Stage: Execute
==> Completed Stage: Train test split
Pipeline Stages |████████████████████████████████████████| 7/7 [100%] in 0.3s (20.94/s)

Pipeline Completed Successfully


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None  


+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                                                           

+-----------------------+---------------------------------------------------------------------------------------+
Processing...
==> Completed Stage: Read file
==> Completed Stage: Parse dataset
==> Completed Stage: Execute
==> Completed Stage: Encode
==> Completed Stage: Handle outliers
==> Completed Stage: Execute
==> Completed Stage: Train test split
Pipeline Stages |████████████████████████████████████████| 7/7 [100%] in 0.3s (21.65/s)

Pipeline Completed Successfully


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None  


+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                                                           

+-----------------------+---------------------------------------------------------------------------------------+
Processing...
==> Completed Stage: Read file
==> Completed Stage: Parse dataset
==> Completed Stage: Execute
==> Completed Stage: Encode
==> Completed Stage: Handle outliers
==> Completed Stage: Execute
==> Completed Stage: Train test split
Pipeline Stages |████████████████████████████████████████| 7/7 [100%] in 0.3s (20.43/s)

Pipeline Completed Successfully


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None  


+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                                                           

+-----------------------+---------------------------------------------------------------------------------------+
Processing...
==> Completed Stage: Read file
==> Completed Stage: Parse dataset
==> Completed Stage: Execute
==> Completed Stage: Encode
==> Completed Stage: Handle outliers
==> Completed Stage: Execute
==> Completed Stage: Train test split
Pipeline Stages |████████████████████████████████████████| 7/7 [100%] in 0.3s (20.07/s)

Pipeline Completed Successfully


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None  


+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                                                           

+-----------------------+---------------------------------------------------------------------------------------+
Processing...
==> Completed Stage: Read file
==> Completed Stage: Parse dataset
==> Completed Stage: Execute
==> Completed Stage: Encode
==> Completed Stage: Handle outliers
==> Completed Stage: Execute
==> Completed Stage: Train test split
Pipeline Stages |████████████████████████████████████████| 7/7 [100%] in 0.3s (20.95/s)

Pipeline Completed Successfully


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None  


+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                                                           

+-----------------------+---------------------------------------------------------------------------------------+
Processing...
==> Completed Stage: Read file
==> Completed Stage: Parse dataset
==> Completed Stage: Execute
==> Completed Stage: Encode
==> Completed Stage: Handle outliers
==> Completed Stage: Execute
==> Completed Stage: Train test split
Pipeline Stages |████████████████████████████████████████| 7/7 [100%] in 0.3s (20.51/s)

Pipeline Completed Successfully


Pipeline Class: StandardPipeline

+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None  


+-----------------------+---------------------------------------------------------------------------------------+
| Pipeline Property     | Value                                                                                 |
+-----------------------+---------------------------------------------------------------------------------------+
| Train Dataframe Path  | ../datasets/regression/accelerometer_modified.csv                                     |
| Test Dataframe Path   | None                                                                                  |
| Config File Path      | None                                                                                  |
| Pipeline Stages       | Read file, Parse dataset, Execute, Encode, Handle outliers, Execute, Train test split |
| Total Pipeline Stages | 7                                                                                     |
| Total Params          | 16                                                           

In [11]:
av_time,av_mem,av_mse = 0,0,0
for i in range(100):
    av_time += average_time[i]
    av_mem += average_mem[i]
    av_mse += average_mse[i]
print(av_time/100,(av_mem/100)/10**6,av_mse/100)

0.5289485669136047 115.39634484999999 0.19820350319126662


## Sklearn piepline

In [11]:
class OutlierRemover(BaseEstimator,TransformerMixin):
    def __init__(self,**kwargs):
        self.q1 = -1
        self.q3 = -1
        
    def outlier_detector(self,X,y=None):
        X = pd.Series(X).copy()
        self.q1 = X.quantile(0.01)
        self.q3 = X.quantile(0.99)

    def fit(self,X,y=None):
        X = pd.DataFrame(X)
        X.apply(self.outlier_detector)
        return self
    
    def transform(self,X,y=None):
        X = pd.DataFrame(X).copy()
        for i in range(X.shape[1]):
            x = X.iloc[:, i].copy()
            x = x[(x > self.q1) | (x <= self.q3)]
            X.iloc[:, i] = x
        return X
outlier_removal = OutlierRemover()

In [12]:
def test_sklearn_pipeline(random_state_num):
    numeric_features = ["x","y"]
    categorical_features = ["wconfid","pctid"]
    missing_features = ["x"]
    categorical_transformer = Pipeline(
        steps=[('ohe',OneHotEncoder(handle_unknown="ignore"))]
    )
    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="mean")),("outlier",outlier_removal),("scaler", StandardScaler())]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ('num',numeric_transformer,numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    clf = Pipeline(
        steps=[("preprocessor", preprocessor), ("classifier", LinearRegression())]
    )
    df = pd.read_csv('accelerometer_modified.csv')
    df = df.drop(columns=['random'])
    X,y = df.drop(columns=['z']),df.loc[:,'z']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state_num)
    clf.fit(X_train, y_train)
    return mean_squared_error(clf.predict(X_test), y_test)

In [13]:
average_time = []
average_mem = []
average_mse = []
for i in range(100):
    tracemalloc.start()
    begin = time.time()
    mse = test_sklearn_pipeline(i)
    end = time.time()
    current,peak=tracemalloc.get_traced_memory()
    tracemalloc.clear_traces()
    tracemalloc.stop()
    average_time.append(end-begin)
    average_mem.append(peak)
    average_mse.append(mse)

In [14]:
av_time,av_mem,av_mse = 0,0,0
for i in range(100):
    av_time += average_time[i]
    av_mem += average_mem[i]
    av_mse += average_mse[i]
print(av_time/100,(av_mem/100)/10**6,av_mse/100)

0.30688182353973387 45.630042270000004 0.2662710904758258


## Pandas Pipeline

In [15]:
def drop_columns(df,column_list):
    df = df.drop(columns=column_list)
    return df
def fill_missing_values(df,column_list):
   for col in column_list:
      df[col].fillna(df[col].mean(), inplace=True)
   return df
def remove_outliers(df, column_list):
   for col in column_list:
      q1 = df[col].quantile(0.01)
      q3 = df[col].quantile(0.99)
      df = df[(df[col] > q1)]
      df = df[(df[col] <= q3)]
   return df
def encode_cols(df,column_list):
    for col in column_list:
        df = pd.concat(
                    [
                        df,
                        pd.get_dummies(df[col], prefix=col).astype(
                            "category"
                        ),
                    ],
                    axis=1,
            )
    return df
def scale_cols(df,column_list):
    for col in column_list:
        cur_col = df[col]
        max = cur_col.max()
        min = cur_col.min()
        cur_col = (cur_col - min) / (max - min)
        df[col] = cur_col
    return df

In [16]:
def test_pandas_pipeline(random_state):
    df = pd.read_csv('accelerometer_modified.csv')
    df_processed = (df.
                    pipe(drop_columns,["random"]).
                    pipe(fill_missing_values,["x"]).
                    pipe(encode_cols, ["wconfid","pctid"]).
                    pipe(remove_outliers, ["x","y"]).
                    pipe(scale_cols,["x","y"])
                   )
    X,y = df_processed.drop(columns=['z']),df_processed.loc[:,'z']
    X = X.drop(columns=["wconfid","pctid"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=random_state)
    reg = LinearRegression().fit(X_train, y_train)
    return mean_squared_error(reg.predict(X_test), y_test)

In [17]:
average_time = []
average_mem = []
average_mse = []
for i in range(100):
    tracemalloc.start()
    begin = time.time()
    mse = test_pandas_pipeline(i)
    end = time.time()
    current,peak=tracemalloc.get_traced_memory()
    tracemalloc.clear_traces()
    tracemalloc.stop()
    average_time.append(end-begin)
    average_mem.append(peak)
    average_mse.append(mse)

In [18]:
av_time,av_mem,av_mse = 0,0,0
for i in range(100):
    av_time += average_time[i]
    av_mem += average_mem[i]
    av_mse += average_mse[i]
print(av_time/100,(av_mem/100)/10**6,av_mse/100)

0.47684704780578613 78.5080287 0.19307729915644833
