In [577]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from collections import deque
import random

In [554]:
# Load the cleaned observations; the path is resolved relative to the working directory.
df = pd.read_csv("../data/cleaned_df.csv")

In [555]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,date,garageCode,id,precipitation,pressure,symbol,temperature,totalSpaces,vehicleCount,winddirection,windspeed,month,year,day,hour,weekend,percentage,raining?
0,1,2018-03-22 13:00:01,SCANDCENTER,2,0.0,1008.6,Cloudy,4.0,1240,575,54.2,2.6,3,2018,3,13,False,0.46371,False
1,3,2018-03-22 13:00:01,MAGASIN,4,0.0,1008.6,Cloudy,4.0,378,282,54.2,2.6,3,2018,3,13,False,0.746032,False
2,5,2018-03-22 13:00:01,SALLING,6,0.0,1008.6,Cloudy,4.0,700,506,54.2,2.6,3,2018,3,13,False,0.722857,False
3,6,2018-03-22 13:00:01,Navitas,8,0.0,1008.6,Cloudy,4.0,449,217,54.2,2.6,3,2018,3,13,False,0.483296,False
4,7,2018-03-22 13:00:01,NewBusgadehuset,9,0.0,1008.6,Cloudy,4.0,105,102,54.2,2.6,3,2018,3,13,False,0.971429,False


In [556]:
# Reshape from long format (one row per garage per timestamp) to wide format:
# one row per timestamp, one column of mean 'percentage' per garageCode.
# NOTE(review): fill_value=0 records a missing reading as 0% occupancy, which can
# create artificial jumps in the series — confirm that this is intended.
df = pd.pivot_table(
    df,
    values="percentage",
    index=["date"],
    columns=["garageCode"],
    aggfunc="mean",  # string alias; passing np.mean emits a FutureWarning on recent pandas
    fill_value=0,
).reset_index()

In [557]:
# Preview the first five rows of the wide, per-garage table.
df.head(n=5)

garageCode,date,MAGASIN,Navitas,NewBusgadehuset,SALLING,SCANDCENTER,Urban Level 1,Urban Level 2+3
0,2018-03-22 13:00:01,0.746032,0.483296,0.971429,0.722857,0.46371,0.363636,0.350153
1,2018-03-22 13:05:01,0.772487,0.487751,0.971429,0.722857,0.460484,0.366771,0.351682
2,2018-03-22 13:10:01,0.761905,0.489978,0.980952,0.722857,0.46129,0.357367,0.351682
3,2018-03-22 13:15:01,0.76455,0.485523,0.980952,0.717143,0.460484,0.354232,0.351682
4,2018-03-22 13:20:01,0.777778,0.469933,0.980952,0.718571,0.458871,0.357367,0.35474


In [558]:
SEQ_LENGTH = 12  # number of consecutive rows in each input sequence (deque maxlen in preprocess_df)
FUTURE_PERIOD_PREDICT = 1  # how many rows ahead the 'future' column looks (used as the shift amount)
GARAGE_TO_PREDICT = "SALLING"  # name of the pivoted garage column whose movement is predicted

In [559]:
def classify(current, future):
    """Label a step as 1 when `future` is strictly greater than `current`, else 0.

    Both arguments are converted with float() before comparing, so numeric
    strings are accepted. Equal values give 0, and so does a NaN on either
    side, because every comparison against NaN is False.
    """
    return int(float(future) > float(current))

In [560]:
# 'future' holds the target garage's value FUTURE_PERIOD_PREDICT rows later;
# the final FUTURE_PERIOD_PREDICT rows have no later row and therefore get NaN.
target_series = df[GARAGE_TO_PREDICT]
df["future"] = target_series.shift(-FUTURE_PERIOD_PREDICT)

In [561]:
# Preview the first five rows, now including the 'future' column.
df.head(n=5)

garageCode,date,MAGASIN,Navitas,NewBusgadehuset,SALLING,SCANDCENTER,Urban Level 1,Urban Level 2+3,future
0,2018-03-22 13:00:01,0.746032,0.483296,0.971429,0.722857,0.46371,0.363636,0.350153,0.722857
1,2018-03-22 13:05:01,0.772487,0.487751,0.971429,0.722857,0.460484,0.366771,0.351682,0.722857
2,2018-03-22 13:10:01,0.761905,0.489978,0.980952,0.722857,0.46129,0.357367,0.351682,0.717143
3,2018-03-22 13:15:01,0.76455,0.485523,0.980952,0.717143,0.460484,0.354232,0.351682,0.718571
4,2018-03-22 13:20:01,0.777778,0.469933,0.980952,0.718571,0.458871,0.357367,0.35474,0.724286


In [562]:
# Binary label per row: 1 if the target garage's value rises by the 'future' row, else 0.
df["target"] = [
    classify(current, future)
    for current, future in zip(df[GARAGE_TO_PREDICT], df["future"])
]

In [563]:
# Preview the first five rows, now including the 'target' label.
df.head(n=5)

garageCode,date,MAGASIN,Navitas,NewBusgadehuset,SALLING,SCANDCENTER,Urban Level 1,Urban Level 2+3,future,target
0,2018-03-22 13:00:01,0.746032,0.483296,0.971429,0.722857,0.46371,0.363636,0.350153,0.722857,0
1,2018-03-22 13:05:01,0.772487,0.487751,0.971429,0.722857,0.460484,0.366771,0.351682,0.722857,0
2,2018-03-22 13:10:01,0.761905,0.489978,0.980952,0.722857,0.46129,0.357367,0.351682,0.717143,0
3,2018-03-22 13:15:01,0.76455,0.485523,0.980952,0.717143,0.460484,0.354232,0.351682,0.718571,1
4,2018-03-22 13:20:01,0.777778,0.469933,0.980952,0.718571,0.458871,0.357367,0.35474,0.724286,1


In [564]:
# Make the timestamp the index so the rows can be split by date below.
df = df.set_index("date")


In [565]:
# Find the index value that marks the start of the last 5% of rows.
# NOTE: with fewer than 20 rows the count below is 0 and times[-0] is times[0],
# so every row would land in the validation split.
times = sorted(df.index.values)
n_validation = int(0.05 * len(times))
last_5pct = times[-n_validation]

In [566]:
# Chronological split: rows at or after the cutoff are held out for validation,
# everything before it is used for training.
in_validation = df.index >= last_5pct
in_training = df.index < last_5pct
validation_main_df = df.loc[in_validation]
main_df = df.loc[in_training]

In [588]:
def preprocess_df(df, seq_length=None):
    """Build class-balanced, shuffled (sequence, label) samples from a labelled frame.

    The frame must contain a 'future' column, and once that column is removed
    the LAST remaining column is taken as the label (0 = decrease, 1 = increase);
    every other column is a feature. Each sample consists of the features of
    `seq_length` consecutive rows together with the label of the last of them.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order they should be windowed. Not modified.
    seq_length : int, optional
        Rows per sequence. Defaults to the module-level SEQ_LENGTH.

    Returns
    -------
    X : numpy.ndarray
        Stacked sequences, shape (n_samples, seq_length, n_features) when at
        least one sample exists, otherwise an empty array.
    y : list
        Labels in the same order as X, as read from the frame's values.

    Raises
    ------
    ValueError
        If `seq_length` is smaller than 1.
    KeyError
        If the frame has no 'future' column.
    """
    if seq_length is None:
        seq_length = SEQ_LENGTH
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    # Rows without a known future value (the tail produced by shift) have no
    # meaningful label, so they must not be turned into samples.
    df = df.dropna(subset=['future'])
    # Keyword form: the positional axis in drop('future', 1) is rejected by pandas >= 2.0.
    df = df.drop(columns='future')
    # Skip scaling: the features are percentages and are therefore already on a common scale.

    increases = []
    decreases = []
    prev_rows = deque(maxlen=seq_length)  # sliding window over the feature rows
    for row in df.values:
        prev_rows.append(list(row[:-1]))
        if len(prev_rows) < seq_length:
            continue  # window not full yet
        target = row[-1]
        sample = [np.array(prev_rows), target]
        if target == 0:
            decreases.append(sample)
        elif target == 1:
            increases.append(sample)

    # Shuffle within each class so the down-sampling below keeps a random subset.
    random.shuffle(increases)
    random.shuffle(decreases)

    # Balance the classes by trimming the larger one to the size of the smaller one.
    lower = min(len(increases), len(decreases))
    sequential_data = increases[:lower] + decreases[:lower]
    # Mix the two classes so the samples are not ordered by label.
    random.shuffle(sequential_data)

    X = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]

    return np.array(X), y

In [591]:
# Build the balanced sequence datasets for both splits. preprocess_df shuffles with the
# global `random` state, so the order of these two calls affects what each call draws.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

In [592]:
# Report the sample counts and the class balance of both splits.
summary_lines = [
    f"train data: {len(train_x)} validation: {len(validation_x)}",
    f"Decreases: {train_y.count(0)}, increases: {train_y.count(1)}",
    f"Validation decreases: {validation_y.count(0)}, increases: {validation_y.count(1)}",
]
print("\n".join(summary_lines))

train data: 71988 validation: 3684
Decreases: 35994, increases: 35994
Validation decreases: 1842, increases: 1842
