# Raw Data
```
mkdir raw
unzip data -d raw
```

# Rename Data
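We do this because the original file names are awkward to work with. Each file is copied to `renamed/<i>.xls`, where `<i>` is an integer index, and the original names are saved one per line in a file called `mapping` (counting from 0, line `i` holds the original name of `renamed/i.xls`). First: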
```
mkdir renamed
```
then
```
import os
import shutil

# Copy every file in raw/ to renamed/<index>.xls and record its original name
# on line <index> (0-based) of the `mapping` file, so the renaming can be
# reversed later. The index follows the order returned by os.listdir().
with open('mapping', 'w+') as f:
    for i, file in enumerate(os.listdir("raw")):
        source = os.path.join("raw", file)
        target = os.path.join("renamed", str(i) + ".xls")
        f.write(file + '\n')
        print(i, source, "->", target)
        # shutil.copy does not go through a shell, so names containing
        # quotes, spaces or `$` are copied correctly.
        shutil.copy(source, target)
```

# Preprocess
We want to preprocess, prettify, remove NaN, etc. First
```
mkdir preprocessed
```
then
```
import os  # required by os.listdir below; it was never imported

from utils import preprocess

# Run the project's preprocessing helper on every entry of renamed/.
# NOTE(review): preprocess() receives the bare file name (without the
# "renamed/" prefix) — confirm that is what utils.preprocess expects.
for file in os.listdir("renamed"):
    preprocess(file)
```

# Window
As described in our manuscript, we window the data into 50-second epochs (each window is 5 consecutive rows flattened into a single row). First
```
mkdir windowed
```
then
```
import os

from pandas import read_csv
import pandas as pd
from numpy import array
from tqdm import tqdm
# from utils import window

# Number of consecutive rows flattened into one windowed sample.
WINDOW_ROWS = 5

# Create the output directory up front (no-op when it already exists).
os.makedirs("windowed", exist_ok=True)

for i in range(32):
    target_filename = str(i) + "_preprocessed"
    df = pd.read_csv("preprocessed/" + target_filename + ".csv")

    # Convert to an array once so each window is a cheap slice, and collect
    # the flattened windows in a list. Building the frame once at the end
    # avoids the quadratic cost of calling pd.concat on every iteration.
    values = df.values
    n_windows = len(df) - WINDOW_ROWS + 1  # <= 0 yields no windows
    rows = [
        values[start:start + WINDOW_ROWS].flatten()
        for start in tqdm(range(n_windows))
    ]

    # One row per window, columns labelled 0..k-1, fresh 0-based index.
    df_win = pd.DataFrame(rows)
    df_win.to_csv("windowed/" + target_filename + "_windowed.csv", index=False)
```

# Scale
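Scale the windowed data with min-max scaling (each file is fitted and scaled independently, to the scaler's default [0, 1] range) before feeding it to the neural network. First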
```
mkdir windowed_scaled
```
then 
```
from pandas import read_csv
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Min-max scale each windowed file on its own: a fresh scaler is fitted on the
# file's contents and the transformed values are written to windowed_scaled/.
for i in range(32):
    source_path = "windowed/" + str(i) + "_preprocessed_windowed.csv"
    target_path = "windowed_scaled/" + str(i) + ".csv"

    windows = read_csv(source_path)
    scaled = MinMaxScaler().fit_transform(windows)

    # fit_transform returns a plain array, so wrap it again before saving.
    pd.DataFrame(scaled).to_csv(target_path, index=False)
```

# ANN Prediction
Load model from best_model.h5. Predict and save. These are predictions for 50-second windows. First
```
mkdir predictions
```
then 
```
import numpy as np
import pandas as pd
from keras.models import load_model

# Load the trained network once; it is the same model for every file, so
# reloading it inside the loop only repeated the disk read.
model = load_model('best_model.h5')

for i in range(32):
    filename = "windowed_scaled/" + str(i) + ".csv"
    x = np.array(pd.read_csv(filename))

    # argmax over axis 1 keeps, for each input row, the index of the
    # highest-valued model output (the predicted class label).
    y = np.argmax(np.array(model.predict(x)), axis=1)
    pd.DataFrame(y).to_csv("predictions/" + str(i) + ".csv", index=False)
```

# Expand Predictions
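Predictions are made on windowed data, so there is one prediction per window rather than one per original row. We need to expand the predictions back onto the unwindowed data. Thus, to find the i-th prediction we take a majority vote (the argmax of the class counts) over the last 5 predictions. First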
```
mkdir expanded_predictions
```
then
```
import os

from tqdm import tqdm
import pandas as pd
import numpy as np


def expand_predictions(Y):
    """Expand per-window predictions to one prediction per original row.

    Y holds one integer class label per window. The result has
    ``len(Y) + 4`` entries: the first and last entries copy the first and
    last window prediction, and every other entry is the most frequent label
    (argmax of ``np.bincount``) among the window predictions that precede it.

    Returns an empty list when Y is empty.
    """
    Y = np.asarray(Y)
    if len(Y) == 0:
        return []

    last = len(Y) - 1   # index of the final window prediction
    final = last + 4    # index of the final expanded prediction

    Y_new = []
    for i in range(len(Y) + 4):
        if i == 0:
            Y_new.append(Y[0])
        elif i < 5:
            # Fewer than 4 earlier windows exist: vote over what is there.
            Y_new.append(np.argmax(np.bincount(Y[0:i])))
        elif i == final:
            # Was Y[8365], a digit transposition of 8635 (the last index).
            Y_new.append(Y[last])
        elif i > last:
            # Tail rows: the voting slice shrinks towards the end of Y.
            Y_new.append(np.argmax(np.bincount(Y[i - 4:last])))
        else:
            Y_new.append(np.argmax(np.bincount(Y[i - 4:i])))
    return Y_new


for file in tqdm(os.listdir("predictions")):
    df = pd.read_csv("predictions/" + file)
    Y = np.array(df)
    Y = Y.reshape(Y.shape[0],)
    pd.DataFrame(expand_predictions(Y)).to_csv("expanded_predictions/" + file, index=False)
```

In [2]:
import os

from tqdm import tqdm
import pandas as pd
import numpy as np
# Expand each file of per-window predictions to one prediction per original
# row (len(Y) + 4 rows) and write it under the same name.
for file in tqdm(os.listdir("predictions")):
    df = pd.read_csv("predictions/"+file)
    Y = np.array(df)
    Y = Y.reshape(Y.shape[0],)
    last = len(Y) - 1   # index of the final window prediction (8635 for 8636 windows)
    final = last + 4    # index of the final expanded prediction (8639)
    Y_new = []
    # enumerate() hides the length from tqdm, which keeps the counter-style
    # ("8640it") progress output.
    for i, _ in tqdm(enumerate(range(len(Y)+4))):
        if i == 0:
            Y_new.append(Y[0])
        elif i < 5:
            # Fewer than 4 earlier windows exist: vote over what is there.
            Y_new.append(np.argmax(np.bincount(Y[0:i])))
        elif i > last and i != final:
            # Tail rows: the voting slice shrinks towards the end of Y.
            Y_new.append(np.argmax(np.bincount(Y[i-4:last])))
        elif i == final:
            # Was Y[8365], a digit transposition of 8635 (the last index).
            Y_new.append(Y[last])
        else:
            # Most frequent label among the 4 preceding window predictions.
            Y_new.append(np.argmax(np.bincount(Y[i-4:i])))
    pd.DataFrame(Y_new).to_csv("expanded_predictions/"+file,index=False)


8640it [00:00, 388141.02it/s]
8640it [00:00, 286388.86it/s]
8640it [00:00, 266915.03it/s]
8640it [00:00, 413052.94it/s]
8640it [00:00, 390226.63it/s]
8640it [00:00, 387199.62it/s]
8640it [00:00, 393459.35it/s]
8640it [00:00, 385379.61it/s]
8640it [00:00, 389250.01it/s]
8640it [00:00, 267263.46it/s]
8640it [00:00, 371801.89it/s]
8640it [00:00, 339855.45it/s]
8640it [00:00, 296567.64it/s]
8640it [00:00, 205289.83it/s]
8640it [00:00, 396863.39it/s]
8640it [00:00, 375123.30it/s]
8640it [00:00, 371428.43it/s]
8640it [00:00, 349690.60it/s]
8640it [00:00, 358913.58it/s]
8640it [00:00, 397852.43it/s]
8640it [00:00, 390344.33it/s]
8640it [00:00, 397952.92it/s]
8640it [00:00, 348359.43it/s]
8640it [00:00, 350339.68it/s]
8640it [00:00, 373985.14it/s]
8640it [00:00, 391995.27it/s]
8640it [00:00, 208169.59it/s]
8640it [00:00, 323641.50it/s]
8640it [00:00, 349306.34it/s]
8640it [00:00, 389321.10it/s]
8640it [00:00, 384512.73it/s]
8640it [00:00, 214700.02it/s]
100%|██████████| 32/32 [00:01<00:00, 25.

In [16]:
import os

import pandas as pd

# Give every expanded prediction file its original name back: line i
# (0-based) of `mapping` is the original name of expanded_predictions/<i>.csv.
os.makedirs("expanded_predictions_renamed", exist_ok=True)

with open("mapping") as f:
    for i, line in enumerate(f):
        print(len(line))
        # Drop the line terminator so it does not end up in the file name.
        original_name = line.rstrip("\r\n")
        if not original_name:
            continue
        # Read the prediction that belongs to this line; previously 0.csv was
        # read once and written under every name.
        df = pd.read_csv("expanded_predictions/" + str(i) + ".csv")
        df.to_csv(
            "expanded_predictions_renamed/" + original_name.replace(".xls", ".csv"),
            index=False,
        )


24
19
19
19
24
19
19
24
19
24
19
24
19
24
24
19
19
19
24
19
24
19
19
19
19
24
24
24
24
24
24
23


In [5]:
contents

['20-AUG-E2-1 Vehicle.xls\n',
 '20-AUG-E1-0 PF.xls\n',
 '20-AUG-C4-0 PF.xls\n',
 '20-AUG-F1-0 PF.xls\n',
 '20-AUG-C4-0 Vehicle.xls\n',
 '20-AUG-F5-1 PF.xls\n',
 '20-AUG-B1-0 PF.xls\n',
 '20-AUG-C1-0 Vehicle.xls\n',
 '20-AUG-E4-1 PF.xls\n',
 '20-AUG-C4-1 Vehicle.xls\n',
 '20-AUG-E4-0 PF.xls\n',
 '20-AUG-B1-0 Vehicle.xls\n',
 '20-AUG-C4-1 PF.xls\n',
 '20-AUG-B3-1 Vehicle.xls\n',
 '20-AUG-F5-1 Vehicle.xls\n',
 '20-AUG-A1-1 PF.xls\n',
 '20-AUG-D1-0 PF.xls\n',
 '20-AUG-F1-1 PF.xls\n',
 '20-AUG-E1-0 Vehicle.xls\n',
 '20-AUG-E2-1 PF.xls\n',
 '20-AUG-F1-0 Vehicle.xls\n',
 '20-AUG-C1-0 PF.xls\n',
 '20-AUG-B3-1 PF.xls\n',
 '20-AUG-A4-0 PF.xls\n',
 '20-AUG-A1-0 PF.xls\n',
 '20-AUG-E4-1 Vehicle.xls\n',
 '20-AUG-A1-1 Vehicle.xls\n',
 '20-AUG-A4-0 Vehicle.xls\n',
 '20-AUG-D1-0 Vehicle.xls\n',
 '20-AUG-E4-0 Vehicle.xls\n',
 '20-AUG-A1-0 Vehicle.xls\n',
 '20-AUG-F1-1 Vehicle.xls']