In [111]:
# imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder


In [112]:
df = pd.read_csv("final_train_data.csv")
df = df.astype({col: 'category' for col in df.select_dtypes(include='object').columns})

In [113]:
df.dtypes

Distance (cm)                     float64
Illuminance (lx)                  float64
amplitude                         float64
frequency                         float64
Magnetic field x (µT)             float64
Magnetic field y (µT)             float64
Magnetic field z (µT)             float64
Acceleration x (m/s^2)            float64
Acceleration y (m/s^2)            float64
Acceleration z (m/s^2)            float64
Gyroscope x (rad/s)               float64
Gyroscope y (rad/s)               float64
Gyroscope z (rad/s)               float64
Pressure (hPa)                    float64
Linear Acceleration x (m/s^2)     float64
Linear Acceleration y (m/s^2)     float64
Linear Acceleration z (m/s^2)     float64
Common time (s)                   float64
Activity                         category
Mood                                int64
Arousal                             int64
Social engagement                   int64
Noise Level                         int64
Concentration Level               

### Train, val, test split

In [114]:
y = df["Activity"]
X = df.drop(["Activity"],axis=1)
le = LabelEncoder()
y = le.fit_transform(y)



In [115]:
val_df = pd.read_csv("val_data.csv")
val_df = val_df.astype({col: 'category' for col in val_df.select_dtypes(include='object').columns})
y_val = val_df["Activity"]
X_val = val_df.drop(["Activity"],axis=1)
y_val = le.transform(val_df["Activity"])



In [116]:
test_df = pd.read_csv("test_data.csv")
test_df = test_df.astype({col: 'category' for col in test_df.select_dtypes(include='object').columns})
y_test = test_df["Activity"]
X_test = test_df.drop(["Activity"],axis=1)
y_test = le.transform(test_df["Activity"])


In [117]:
test_df.dtypes


Distance (cm)                     float64
Illuminance (lx)                  float64
amplitude                         float64
frequency                         float64
Magnetic field x (µT)             float64
Magnetic field y (µT)             float64
Magnetic field z (µT)             float64
Acceleration x (m/s^2)            float64
Acceleration y (m/s^2)            float64
Acceleration z (m/s^2)            float64
Gyroscope x (rad/s)               float64
Gyroscope y (rad/s)               float64
Gyroscope z (rad/s)               float64
Pressure (hPa)                    float64
Linear Acceleration x (m/s^2)     float64
Linear Acceleration y (m/s^2)     float64
Linear Acceleration z (m/s^2)     float64
Common time (s)                   float64
Activity                         category
Mood                                int64
Arousal                             int64
Social engagement                   int64
Noise Level                         int64
Concentration Level               

In [118]:

num_features  = X.select_dtypes(include ="number").columns
cat_features = X.select_dtypes(include = "category").columns
prep = ColumnTransformer([("num",StandardScaler(),num_features), ("cat",OneHotEncoder(handle_unknown="ignore"),cat_features)])


In [119]:
pipe = Pipeline([("preproc",prep),("clf",lgb.LGBMClassifier(objective='multiclass', num_class=6,random_state=30))])
pipe.fit(X,y)
y_pred =pipe.predict(X_test)     
print(classification_report(y_test,y_pred)) 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007278 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4609
[LightGBM] [Info] Number of data points in the train set: 416297, number of used features: 23
[LightGBM] [Info] Start training from score -1.924220
[LightGBM] [Info] Start training from score -1.931188
[LightGBM] [Info] Start training from score -1.764138
[LightGBM] [Info] Start training from score -1.829352
[LightGBM] [Info] Start training from score -1.742763
[LightGBM] [Info] Start training from score -1.598762




              precision    recall  f1-score   support

           0       0.00      0.00      0.00     31092
           1       0.31      0.87      0.46     18922
           2       0.00      0.00      0.00     34191
           3       0.00      0.00      0.00     35422
           4       0.15      0.31      0.20     26557
           5       0.18      0.19      0.18     40283

    accuracy                           0.17    186467
   macro avg       0.11      0.23      0.14    186467
weighted avg       0.09      0.17      0.11    186467



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [121]:
print("Train classes:", df["Activity"].unique())
print("Val classes:", val_df["Activity"].unique())



Train classes: ['rest', 'walk', 'phone', 'stairs', 'socialize', 'study']
Categories (6, object): ['phone', 'rest', 'socialize', 'stairs', 'study', 'walk']
Val classes: ['socialize', 'walk', 'stairs', 'phone', 'rest', 'study']
Categories (6, object): ['phone', 'rest', 'socialize', 'stairs', 'study', 'walk']
