In [13]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [25]:
# Load the training features from the CSV file (path is relative to the working directory).
X_train = pd.read_csv('X_train.csv')

In [26]:
# Preview the first rows of the raw training frame.
X_train.head()

Unnamed: 0,time,seconds_elapsed_orientation,qz_orientation,qy_orientation,qx_orientation,qw_orientation,roll_orientation,pitch_orientation,yaw_orientation,seconds_elapsed_location,...,horizontalAccuracy_location_network,speed_location_network,bearing_location_network,altitude_location_network,longitude_location_network,latitude_location_network,seconds_elapsed_gravity,z_gravity,y_gravity,x_gravity
0,1694857537318869200,1228.074843,-0.626221,0.3861,-0.674631,0.060399,-1.826153,0.600508,-2.571477,1228.069,...,149.600006,0.0,0.0,66.5,3.097713,50.68115,1228.074843,-2.041392,-5.524593,7.841044
1,1694857182616775700,873.374776,-0.013953,-0.618994,-0.425122,0.660245,-1.72367,0.57531,0.706575,873.07,...,149.600006,0.0,0.0,66.5,3.097713,50.68115,873.374776,-1.252387,-5.323759,8.139995
2,1694850675496257000,488.368257,0.529413,-0.843842,0.045404,0.074776,-2.755124,1.09014,-0.343697,488.191,...,800.0,0.0,0.0,0.0,3.123176,50.672179,488.368257,-4.202703,-8.689218,1.733541
3,1694856574973923600,265.731924,-0.103478,-0.568592,-0.501415,0.643878,-1.747675,0.556275,0.976151,265.072,...,149.600006,0.0,0.0,66.5,3.097713,50.68115,265.731924,-1.467729,-5.174577,8.199995
4,1697611168928672300,191.401327,-0.176786,0.565639,-0.799976,0.094012,-2.952015,0.358007,-1.945084,191.092,...,34.563,0.0,0.0,90.5,3.132056,50.687971,191.401327,-9.001679,-3.482618,1.735401


In [27]:
# Exploratory data analysis (EDA) showed that the columns related to elapsed time are similar,
# so we consolidate them by calculating their mean value. This transformer computes the average of the
# elapsed-time values and removes the individual columns to avoid redundant information.

class AverageSecondsElapsedTransformer(BaseEstimator, TransformerMixin):
    """Replace all elapsed-time columns with a single row-wise mean column.

    Parameters
    ----------
    keyword : str, default='seconds_elapsed_'
        Substring used to select the columns to average; every column whose
        name contains it (``DataFrame.filter(like=keyword)``) is selected.
    """

    def __init__(self, keyword='seconds_elapsed_'):
        self.keyword = keyword

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame with the selected columns collapsed into their mean.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame. It is not modified.

        Returns
        -------
        pandas.DataFrame
            ``X`` without the columns matching ``keyword`` and with a trailing
            ``seconds_elapsed_time`` column holding their row-wise mean.
        """
        columns_to_average = X.filter(like=self.keyword).columns
        mean_elapsed = X[columns_to_average].mean(axis=1)
        # Drop first, then add the mean to the frame returned by drop(): the
        # caller's frame is never written to, and the output column survives
        # even when its own name matches `keyword` (e.g. when the transformer
        # is applied to a frame that already contains 'seconds_elapsed_time').
        return X.drop(columns=columns_to_average).assign(
            seconds_elapsed_time=mean_elapsed
        )

In [28]:
# We'll be using this transformer for dropping columns with only one unique value as they do not provide any useful
# information for the model and then also for dropping time column after extracting time features

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Transformer that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : list of str or None, default=None
        Names of the columns to remove. ``None`` turns the step into a no-op.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` without the configured columns.

        Names that are not present in ``X`` are skipped silently
        (``errors='ignore'``). When no columns are configured, ``X`` itself is
        returned unchanged.
        """
        if self.columns is None:
            return X
        return X.drop(columns=self.columns, errors='ignore')

In [None]:
# Since we already have roll, pitch and yaw orientation values,
# we do not need to convert quaternions (qx, qy, qz, qw) to Euler angles.

In [29]:
# Adding time features to capture the hour and minute of the day, which may be relevant for detecting patterns
# in the data. Since we are extracting time features, we will drop the original time column, which plays a role
# similar to a primary key in our dataset.

class TimeFeatureExtraction(BaseEstimator, TransformerMixin):
    """Derive ``hour`` and ``minute`` features from the ``time`` column.

    The ``time`` column is parsed with ``pd.to_datetime(..., unit='ns')``,
    i.e. its values are read as nanosecond offsets from the default (Unix)
    epoch. No time-zone conversion is applied, so the extracted hour and
    minute are those of the parsed timestamp as-is.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame with a parsed ``time`` plus ``hour`` and ``minute``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame containing a ``time`` column. It is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which ``time`` holds datetimes and two columns,
            ``hour`` and ``minute``, are appended in that order.

        Raises
        ------
        KeyError
            If ``X`` has no ``time`` column.
        """
        timestamps = pd.to_datetime(X['time'], unit='ns')
        # assign() works on a copy, so the caller's frame keeps its original
        # `time` values and gains no extra columns as a side effect.
        return X.assign(
            time=timestamps,
            hour=timestamps.dt.hour,
            minute=timestamps.dt.minute,
        )

In [30]:
# Feature-engineering pipeline; the steps run in the order listed.
pipeline = Pipeline(
    steps=[
        # 1. Remove the listed network-location columns.
        (
            'dropper',
            ColumnDropper(
                columns=[
                    "bearingAccuracy_location_network",
                    "speedAccuracy_location_network",
                    "speed_location_network",
                    "bearing_location_network",
                ]
            ),
        ),
        # 2. Collapse the elapsed-time columns into their mean.
        ('average_seconds_elapsed', AverageSecondsElapsedTransformer()),
        # 3. Add hour / minute features derived from `time`.
        ('time_feature_extractor', TimeFeatureExtraction()),
        # 4. Remove `time` itself once the features have been extracted.
        ('time_dropper', ColumnDropper(columns=['time'])),
    ]
)

In [31]:
# Fit the pipeline on the training frame and keep the transformed result under a new name.
X_train_transformed = pipeline.fit_transform(X_train)

In [32]:
# Preview the first rows of the transformed training frame.
X_train_transformed.head()

Unnamed: 0,qz_orientation,qy_orientation,qx_orientation,qw_orientation,roll_orientation,pitch_orientation,yaw_orientation,bearingAccuracy_location,speedAccuracy_location,verticalAccuracy_location,...,horizontalAccuracy_location_network,altitude_location_network,longitude_location_network,latitude_location_network,z_gravity,y_gravity,x_gravity,seconds_elapsed_time,hour,minute
0,-0.626221,0.3861,-0.674631,0.060399,-1.826153,0.600508,-2.571477,6.8,1.2,15.1,...,149.600006,66.5,3.097713,50.68115,-2.041392,-5.524593,7.841044,1119.515857,9,45
1,-0.013953,-0.618994,-0.425122,0.660245,-1.72367,0.57531,0.706575,10.0,1.3,14.4,...,149.600006,66.5,3.097713,50.68115,-1.252387,-5.323759,8.139995,800.161781,9,39
2,0.529413,-0.843842,0.045404,0.074776,-2.755124,1.09014,-0.343697,26.700001,3.1,15.8,...,800.0,0.0,3.123176,50.672179,-4.202703,-8.689218,1.733541,453.108725,7,51
3,-0.103478,-0.568592,-0.501415,0.643878,-1.747675,0.556275,0.976151,7.7,1.2,17.1,...,149.600006,66.5,3.097713,50.68115,-1.467729,-5.174577,8.199995,253.150236,9,29
4,-0.176786,0.565639,-0.799976,0.094012,-2.952015,0.358007,-1.945084,0.0,0.0,3.18827,...,34.563,90.5,3.132056,50.687971,-9.001679,-3.482618,1.735401,190.693035,6,39


In [33]:
# List the column names that remain after the pipeline has run.
X_train_transformed.columns

Index(['qz_orientation', 'qy_orientation', 'qx_orientation', 'qw_orientation',
       'roll_orientation', 'pitch_orientation', 'yaw_orientation',
       'bearingAccuracy_location', 'speedAccuracy_location',
       'verticalAccuracy_location', 'horizontalAccuracy_location',
       'speed_location', 'bearing_location', 'altitude_location',
       'longitude_location', 'latitude_location', 'z_total_acceleration',
       'y_total_acceleration', 'x_total_acceleration', 'z_magnetometer',
       'y_magnetometer', 'x_magnetometer', 'z_accelerometer',
       'y_accelerometer', 'x_accelerometer', 'bearingAccuracy_location_gps',
       'speedAccuracy_location_gps', 'verticalAccuracy_location_gps',
       'horizontalAccuracy_location_gps', 'speed_location_gps',
       'bearing_location_gps', 'altitude_location_gps',
       'longitude_location_gps', 'latitude_location_gps', 'z_gyroscope',
       'y_gyroscope', 'x_gyroscope', 'steps_pedometer',
       'verticalAccuracy_location_network',
       'hor

In [34]:
# Number of columns in the transformed frame.
X_train_transformed.shape[1]

49