### General Imports

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import os
import glob
import json
import matplotlib.pyplot as plt

### Data Cleaning
Cleaning out zero-timestamp entries and any entries outside the time range we are looking at.
Then dealing with sensors whose clocks are set to different time zones.

In [164]:
# Hours subtracted from each sensor clock: every sensor is rolled back by
# 1 hour except the BU sensors, which are rolled back by 8 hours.
DEFAULT_ROLLBACK_HOURS = 1
BU_ROLLBACK_HOURS = 8
# Zero timestamp that the read_csv date parser cannot convert, written here
# with runs of whitespace collapsed to a single space.
ZERO_TIMESTAMP = '0/0/0 0:0:0'


def sensor_name_from_path(path):
    """Return the file name without directory or extension.

    Example: './Data/S-01.txt' -> 'S-01'.
    """
    return os.path.splitext(os.path.basename(path))[0]


def clean_sensor_frame(df, hours_back, cutoff):
    """Shift a sensor's timestamps and drop every row before the cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Date_Time' column. The column may already hold
        datetimes, or still hold raw strings when zero timestamps stopped
        the CSV parser from converting it.
    hours_back : int or float
        Number of hours subtracted from every timestamp.
    cutoff : pandas.Timestamp
        Rows whose shifted timestamp is earlier than this are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the input is not modified.

    Raises
    ------
    Whatever pd.to_datetime raises if a non-zero timestamp string cannot
    be parsed.
    """
    stamps = df['Date_Time']
    if not pd.api.types.is_datetime64_any_dtype(stamps):
        # The parser left strings behind: remove the zero timestamps
        # (ignoring how much whitespace pads them), then convert the rest.
        collapsed = stamps.astype(str).str.split().str.join(' ')
        df = df[collapsed != ZERO_TIMESTAMP]
        stamps = pd.to_datetime(df['Date_Time'])
    df = df.assign(Date_Time=stamps - pd.Timedelta(hours=hours_back))
    too_early = df['Date_Time'] < cutoff
    return df[~too_early].reset_index(drop=True)


# Sorted so that the sensors are always processed in the same order.
all_csv_files = sorted(glob.glob("./Data/*.txt"))
loops = len(all_csv_files)
data = {}
# Here we can select the experiment time
cleaningCutOffTime = pd.Timestamp('12/22/2020 12:49')
for x in all_csv_files:
    sensor = sensor_name_from_path(x)
    # NOTE(review): the nested-list form of parse_dates is deprecated in
    # recent pandas releases -- confirm against the installed version.
    df = pd.read_csv(
        x,
        header=1,
        parse_dates=[[0, 1]]
        ).dropna(how='all')
    df.columns = df.columns.str.replace(' ', '')
    # Decide the roll-back from the sensor name, not from the whole path,
    # so a directory name can never be mistaken for a BU sensor.
    hours_back = BU_ROLLBACK_HOURS if 'BU' in sensor else DEFAULT_ROLLBACK_HOURS
    data[sensor] = clean_sensor_frame(df, hours_back, cleaningCutOffTime)

    # ends by printing out the new start and stop times of the data sets
# Print the first and last timestamp of every sensor, or a placeholder line
# when the sensor has no rows (or no 'Date_Time' column) to report.
for x in data:
    stamps = data[x].get('Date_Time')
    if stamps is None or stamps.empty:
        print(x,' NO DATA PRESENT    NO DATA PRESENT')
    else:
        print(x,'   ',stamps.iloc[0],'    ',stamps.iloc[-1])


S-01     2020-12-22 12:49:10      2020-12-22 16:09:31
S-02  NO DATA PRESENT    NO DATA PRESENT
S-03     2020-12-22 12:49:03      2020-12-22 16:09:35
S-04     2020-12-22 12:49:07      2020-12-22 16:09:34
S-05     2020-12-22 12:49:01      2020-12-22 16:10:50
S-06     2020-12-22 12:49:12      2020-12-22 16:09:25
S-07     2020-12-22 12:49:07      2020-12-22 16:12:53
S-08     2020-12-22 12:49:24      2020-12-22 16:57:24
S-09     2020-12-22 12:49:00      2020-12-22 16:11:30
S-11     2020-12-22 13:02:00      2020-12-22 16:09:43
S-12     2020-12-22 12:49:06      2020-12-22 16:10:06
S-13     2020-12-22 12:49:06      2020-12-22 16:08:44
S-14     2020-12-22 12:49:09      2020-12-22 16:13:45
S-15     2020-12-22 12:49:03      2020-12-22 16:10:00
S-BU1     2020-12-22 12:49:00      2020-12-22 16:11:40
S-BU2     2020-12-22 12:49:00      2020-12-22 16:10:30


### Checking Data
Here we scan each sensor's data for irregularities in the recording interval.

In [176]:
def find_interval_errors(timestamps, expected_seconds):
    """Find consecutive readings that are not exactly one interval apart.

    Parameters
    ----------
    timestamps : pandas.Series
        Datetime values in recording order.
    expected_seconds : int or float
        Expected spacing between consecutive readings, in seconds.

    Returns
    -------
    (int, set)
        The number of irregular gaps and the set of distinct gap sizes
        (as Timedelta values).
    """
    # diff() puts a placeholder in the first slot (no predecessor); skip it.
    gaps = timestamps.diff().iloc[1:]
    irregular = gaps[gaps != pd.Timedelta(seconds=expected_seconds)]
    return len(irregular), set(irregular)


errors = {}
# Enter the expected interval here (in seconds)
interval = 10
for x in data:
    counter, errors[x] = find_interval_errors(data[x]['Date_Time'], interval)
    print(str(counter),' possible errors in ', x)
    # total_seconds() keeps the days component and the sign of a gap,
    # both of which the .seconds attribute silently discards.
    print([int(gap.total_seconds()) for gap in errors[x]])


208  possible errors in  S-01
[26, 21, 17, 13, 19, 30, 20, 11]
0  possible errors in  S-02
[]
219  possible errors in  S-03
[21, 12, 25, 27, 16, 18, 62, 84, 51, 20, 31, 11, 22, 9, 13, 0, 90, 15, 70, 59, 17, 19, 63]
220  possible errors in  S-04
[21, 22, 17, 23, 60, 18, 35, 24, 78, 19, 20, 13, 25, 11, 49]
220  possible errors in  S-05
[26, 21, 16, 17, 23, 12, 18, 7, 19, 24, 25, 20, 15]
199  possible errors in  S-06
[28, 20, 85]
212  possible errors in  S-07
[9, 16, 22, 14, 12, 34, 18, 24, 19, 30, 25, 20, 15, 17, 13, 11]
412  possible errors in  S-08
[34, 35, 138, 139, 140, 37]
5  possible errors in  S-09
[9, 17, 3, 31, 20]
11  possible errors in  S-11
[16, 14, 18, 15, 11]
2  possible errors in  S-12
[8, 12]
13  possible errors in  S-13
[16, 14, 9, 18, 19, 41, 20, 15, 11]
13  possible errors in  S-14
[16, 14, 17, 12, 9, 20, 15, 11]
8  possible errors in  S-15
[33, 17, 0, 30, 25, 150, 11]
582  possible errors in  S-BU1
[61, 40, 241, 19, 240, 20]
42  possible errors in  S-BU2
[29, 30, 1086

In [114]:
# Scratch check: add the gap between the first two S-01 readings ten times.
# The sensors are keyed 'S-01', 'S-02', ... in `data`; there is no 'S-1'.
# .iloc is used so the lookup is by position regardless of the index labels.
first_gap = data['S-01']['Date_Time'].iloc[1] - data['S-01']['Date_Time'].iloc[0]
collect = pd.Timedelta(0)
for _ in range(10):
    collect += first_gap

In [117]:
# Inspect the irregular-gap count left in `counter`. The checking loop resets
# it for every sensor, so this is the count for the last sensor processed.
counter

2476

In [170]:
# Scratch check: building a set from a string splits it into its characters,
# so adding 'a' afterwards leaves the set unchanged.
a = {*'apple'}
a |= {'a'}
a

{'a', 'e', 'l', 'p'}

In [166]:
# Inspect the full mapping of sensor name -> set of irregular gaps
# (Timedelta values) collected by the checking loop.
errors

{'S-01': {Timedelta('0 days 00:00:11'),
  Timedelta('0 days 00:00:13'),
  Timedelta('0 days 00:00:17'),
  Timedelta('0 days 00:00:19'),
  Timedelta('0 days 00:00:20'),
  Timedelta('0 days 00:00:21'),
  Timedelta('0 days 00:00:26'),
  Timedelta('0 days 00:00:30')},
 'S-02': set(),
 'S-03': {Timedelta('0 days 00:00:00'),
  Timedelta('0 days 00:00:09'),
  Timedelta('0 days 00:00:11'),
  Timedelta('0 days 00:00:12'),
  Timedelta('0 days 00:00:13'),
  Timedelta('0 days 00:00:15'),
  Timedelta('0 days 00:00:16'),
  Timedelta('0 days 00:00:17'),
  Timedelta('0 days 00:00:18'),
  Timedelta('0 days 00:00:19'),
  Timedelta('0 days 00:00:20'),
  Timedelta('0 days 00:00:21'),
  Timedelta('0 days 00:00:22'),
  Timedelta('0 days 00:00:25'),
  Timedelta('0 days 00:00:27'),
  Timedelta('0 days 00:00:31'),
  Timedelta('0 days 00:00:51'),
  Timedelta('0 days 00:00:59'),
  Timedelta('0 days 00:01:02'),
  Timedelta('0 days 00:01:03'),
  Timedelta('0 days 00:01:10'),
  Timedelta('0 days 00:01:24'),
  Timed

In [91]:
# Scratch check: inserting a string into an empty set keeps it whole as a
# single member instead of splitting it into characters.
temp = set()
temp.update(['apple'])
temp

{'apple'}