### Data Cleaning and Preprocessing
1. Dropping NaN values
2. Converting variables to appropriate data types
3. Grouping data by `date-time` and keeping the mode (most frequent value) of each column within each group. The intended final output has exactly one record per unique timestamp.

<br>
<br>
<br>

In [1]:
# Importing required modules

import pandas as pd
from datetime import datetime

In [2]:
# Reading csv files of raw data for all IoT devices

# Each raw CSV is loaded exactly once into its own DataFrame.
# low_memory=False makes pandas read each file in a single pass rather than
# in chunks, which avoids mixed-dtype column inference.
IoT_Fridge = pd.read_csv("./raw_data/IoT_Fridge.csv", low_memory=False)
IoT_Garage_Door = pd.read_csv("./raw_data/IoT_Garage_Door.csv", low_memory=False)
IoT_GPS_Tracker = pd.read_csv("./raw_data/IoT_GPS_Tracker.csv", low_memory=False)
IoT_Modbus = pd.read_csv("./raw_data/IoT_Modbus.csv", low_memory=False)
IoT_Motion_Light = pd.read_csv("./raw_data/IoT_Motion_Light.csv", low_memory=False)
IoT_Thermostat = pd.read_csv("./raw_data/IoT_Thermostat.csv", low_memory=False)
IoT_Weather = pd.read_csv("./raw_data/IoT_Weather.csv", low_memory=False)


In [3]:
# Function for processing data, to be called on each dataset

def processData(df, name):
    """Clean one raw IoT dataset and save it as ``./processed_data/{name}.csv``.

    Steps, in order:
      1. Drop every row that contains a NaN.
      2. Convert columns to pandas' nullable dtypes.
      3. Merge the 'date' and 'time' columns into one 'date-time' datetime
         column (the originals are dropped).
      4. Print the inferred dtypes.
      5. Collapse rows sharing a timestamp into a single row holding the mode
         of each column, and write the result to CSV.

    Args:
        df: Raw DataFrame with string 'date' and 'time' columns that, joined
            by a space, match the format '%d-%b-%y %H:%M:%S'. The caller's
            DataFrame is not modified.
        name: Base name (without extension) of the output CSV file.
    """
    import os  # local import so the function does not rely on extra file-level imports

    # Remove NaN values. Deliberately NOT in place: an in-place drop would
    # silently alter the caller's raw DataFrame.
    df = df.dropna(how='any')

    # infer and convert to pandas datatypes (returns a new DataFrame)
    df = df.convert_dtypes(infer_objects=True, convert_string=True,
                           convert_integer=True, convert_boolean=True,
                           convert_floating=True)

    # merge 'date' and 'time' columns into 'date-time'; vectorised string
    # concatenation instead of a Python-level join per row
    df['date-time'] = df['date'] + ' ' + df['time']
    df = df.drop(columns=['date', 'time'])

    # convert 'date-time' from string to pandas datetime datatype
    df['date-time'] = pd.to_datetime(df['date-time'], format='%d-%b-%y %H:%M:%S')

    # infer datatypes once again and show them (for inspection only; the
    # converted frame is not kept)
    print(df.convert_dtypes(infer_objects=True, convert_string=True,
          convert_integer=True, convert_boolean=True, convert_floating=True).dtypes)

    # group data by 'date-time' and keep the mode of every column; mode()
    # returns its values sorted, so ties (including the case where all values
    # are distinct) resolve to the smallest value.
    # NOTE: this runs a Python lambda per group and column, which can be slow
    # on large datasets.
    df = df.groupby(by='date-time').agg(lambda col: col.mode().iat[0])

    # save this processed data as csv, creating the output directory if needed
    os.makedirs("./processed_data", exist_ok=True)
    df.to_csv(f"./processed_data/{name}.csv")
    print()
    print()


# Run the cleaning pipeline on every loaded dataset, in the same order as
# before; the second element of each pair is the output CSV's base name.
for raw_frame, dataset_name in (
    (IoT_Fridge, "IoT_Fridge"),
    (IoT_Garage_Door, "IoT_Garage_Door"),
    (IoT_GPS_Tracker, "IoT_GPS_Tracker"),
    (IoT_Modbus, "IoT_Modbus"),
    (IoT_Motion_Light, "IoT_Motion_Light"),
    (IoT_Thermostat, "IoT_Thermostat"),
    (IoT_Weather, "IoT_Weather"),
):
    processData(raw_frame, dataset_name)

fridge_temperature              Float64
fridge_temp_condition            string
fridge_label                      Int64
fridge_type                      string
date-time                datetime64[ns]
dtype: object


door_state                    string
door_sphone_signal           boolean
door_label                     Int64
door_type                     string
date-time             datetime64[ns]
dtype: object


gps_latitude            Float64
gps_longitude           Float64
gps_label                 Int64
gps_type                 string
date-time        datetime64[ns]
dtype: object


modbus_FC1_Read_Input_Register               Int64
modbus_FC2_Read_Discrete_Value               Int64
modbus_FC3_Read_Holding_Register             Int64
modbus_FC4_Read_Coil                         Int64
modbus_label                                 Int64
modbus_type                                 string
date-time                           datetime64[ns]
dtype: object


KeyboardInterrupt: 

<br>
<br>
<center><b>End of File</b></center>