In [31]:
import os
import pandas as pd

dfs = []

for file in os.listdir('.'):
    if file.endswith('.csv'):
        local_df = pd.read_csv(file)
        device_type = file.split('_')[-1].lower().replace('.csv', '')
        local_df['device_type'] = device_type

        dfs.append(local_df)

df = pd.concat(dfs, ignore_index=True)

df.head()

Unnamed: 0,date,time,FC1_Read_Input_Register,FC2_Read_Discrete_Value,FC3_Read_Holding_Register,FC4_Read_Coil,label,type,device_type,door_state,...,light_status,latitude,longitude,temperature,pressure,humidity,current_temperature,thermostat_status,fridge_temperature,temp_condition
0,25-Apr-19,09:14:00,49389.0,52921.0,25770.0,13625.0,1,injection,modbus,,...,,,,,,,,,,
1,25-Apr-19,09:14:00,49389.0,52921.0,25770.0,13625.0,1,injection,modbus,,...,,,,,,,,,,
2,25-Apr-19,09:14:01,49389.0,52921.0,25770.0,13625.0,1,injection,modbus,,...,,,,,,,,,,
3,25-Apr-19,09:14:02,49389.0,52921.0,25770.0,13625.0,1,injection,modbus,,...,,,,,,,,,,
4,25-Apr-19,09:14:04,40665.0,44748.0,21098.0,35371.0,1,injection,modbus,,...,,,,,,,,,,


In [32]:
# replace NaN with 0

df.fillna(0, inplace=True)

In [33]:
# show df info

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261119 entries, 0 to 261118
Data columns (total 22 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   date                       261119 non-null  object 
 1   time                       261119 non-null  object 
 2   FC1_Read_Input_Register    261119 non-null  float64
 3   FC2_Read_Discrete_Value    261119 non-null  float64
 4   FC3_Read_Holding_Register  261119 non-null  float64
 5   FC4_Read_Coil              261119 non-null  float64
 6   label                      261119 non-null  int64  
 7   type                       261119 non-null  object 
 8   device_type                261119 non-null  object 
 9   door_state                 261119 non-null  object 
 10  sphone_signal              261119 non-null  object 
 11  motion_status              261119 non-null  float64
 12  light_status               261119 non-null  object 
 13  latitude                   26

In [34]:
# onehot encode categorical columns, and scale numerical columns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


X = df.drop(['date', 'time', 'label', 'type'], axis=1)
y = df['label']

categorical_cols = [
  'device_type',
  'door_state',
  'sphone_signal',
  'light_status',
  'temp_condition',
]

numerical_cols = [
  'FC1_Read_Input_Register',
  'FC2_Read_Discrete_Value',
  'FC3_Read_Holding_Register',
  'FC4_Read_Coil',
  'motion_status',
  'latitude',
  'longitude',
  'temperature',
  'pressure',
  'humidity',
  'current_temperature',
  'thermostat_status',
  'fridge_temperature',
]

X[categorical_cols] = X[categorical_cols].astype(str)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
  transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(), categorical_cols)
  ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor)
  ]
)

X_train_processed = pipeline.fit_transform(X_train)
X_test_processed = pipeline.transform(X_test)

In [29]:
# use knn to predict label

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_processed, y_train)
y_pred = knn.predict(X_test_processed)


In [30]:
# accuracy

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)


0.8569814644607843