In [None]:
####################################################################################################
## This file is run once to create the associations_times.csv file from raw data
## This is the data preprocessing steps for one day of raw data
## This is different/more simple than the data preprocessing of a week of data
####################################################################################################

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandasql import sqldf
from sqlalchemy import create_engine

from datetime import datetime
from datetime import timedelta
import os, sys

In [2]:
##################
# import raw data
##################
df = ## pd.read_csv('') removed - but would use link to raw data
df

Unnamed: 0,start_time,installation_id,station_id
0,2021-04-01 00:00:00.088007,412914,1025503
1,2021-04-01 00:00:00.107122,413884,985795
2,2021-04-01 00:00:00.199838,418743,876082
3,2021-04-01 00:00:00.244472,419447,883379
4,2021-04-01 00:00:00.352282,413584,855284
...,...,...,...
2066168,2021-04-01 23:59:59.494044,413406,972273
2066169,2021-04-01 23:59:59.513663,412357,969521
2066170,2021-04-01 23:59:59.602626,415752,1005312
2066171,2021-04-01 23:59:59.625039,420719,957148


In [30]:
##################################################
# function for integer time since 0 on current day
##################################################
def time_apart(t):
    str1 = t.split()
    h, m, s = str1[1].split(':')
    return int(h) * 3600 + int(m) * 60 + float(s)

In [45]:
# get dates as integers of seconds since time 0 on given day
df['time_stamp'] = df.apply(lambda row : time_apart(row['start_time']), axis = 1)

Unnamed: 0,start_time,installation_id,station_id,time_stamp
0,2021-04-01 00:00:00.088007,412914,1025503,0.088007
1,2021-04-01 00:00:00.107122,413884,985795,0.107122
2,2021-04-01 00:00:00.199838,418743,876082,0.199838
3,2021-04-01 00:00:00.244472,419447,883379,0.244472
4,2021-04-01 00:00:00.352282,413584,855284,0.352282
...,...,...,...,...
2066168,2021-04-01 23:59:59.494044,413406,972273,86399.494044
2066169,2021-04-01 23:59:59.513663,412357,969521,86399.513663
2066170,2021-04-01 23:59:59.602626,415752,1005312,86399.602626
2066171,2021-04-01 23:59:59.625039,420719,957148,86399.625039


In [46]:
df.drop(columns=['start_time'])
df

Unnamed: 0,start_time,installation_id,station_id,time_stamp
0,2021-04-01 00:00:00.088007,412914,1025503,0.088007
1,2021-04-01 00:00:00.107122,413884,985795,0.107122
2,2021-04-01 00:00:00.199838,418743,876082,0.199838
3,2021-04-01 00:00:00.244472,419447,883379,0.244472
4,2021-04-01 00:00:00.352282,413584,855284,0.352282
...,...,...,...,...
2066168,2021-04-01 23:59:59.494044,413406,972273,86399.494044
2066169,2021-04-01 23:59:59.513663,412357,969521,86399.513663
2066170,2021-04-01 23:59:59.602626,415752,1005312,86399.602626
2066171,2021-04-01 23:59:59.625039,420719,957148,86399.625039


In [50]:
######################
# create db connection
######################
connection_to_db = create_engine('sqlite:///associations.db')

In [60]:
# table from raw data
df.to_sql('associations', con=connection_to_db, if_exists='replace')

In [59]:
# run first time
%load_ext sql
%sql sqlite:///associations.db

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [61]:
# otherwise run this
%reload_ext sql
%sql sqlite:///associations.db

In [63]:
%%sql

SELECT * FROM associations LIMIT 5; 

 * sqlite:///associations.db
Done.


index,start_time,installation_id,station_id,time_stamp
0,2021-04-01 00:00:00.088007,412914,1025503,0.088007
1,2021-04-01 00:00:00.107122,413884,985795,0.107122
2,2021-04-01 00:00:00.199838,418743,876082,0.199838
3,2021-04-01 00:00:00.244472,419447,883379,0.244472
4,2021-04-01 00:00:00.352282,413584,855284,0.352282


In [None]:
# builds associations_cross table
# format:
# station_id, ap_1, ap_2, t1, t2

In [65]:
%%sql

create table associations_cross as select
a.station_id, 
a.installation_id as ap_1,
b.installation_id as ap_2,
a.time_stamp as t1, 
b.time_stamp as t2
from associations a join associations b on a.station_id = b.station_id;

 * sqlite:///associations.db
Done.


[]

In [None]:
# removes instances where station does not move

In [66]:
%%sql
delete from associations_cross where ap_1 = ap_2

 * sqlite:///associations.db
1342052527 rows affected.


[]

In [None]:
# sanity check

In [67]:
%%sql

SELECT * FROM associations_cross LIMIT 5; 

 * sqlite:///associations.db
Done.


station_id,ap_1,ap_2,t1,t2
1025503,412914,412744,0.088007,39428.796809
1025503,412914,412840,0.088007,64031.297609
1025503,412914,412870,0.088007,63917.439381
1025503,412914,412870,0.088007,64100.414458
1025503,412914,412915,0.088007,66805.114127


In [None]:
# calculate time differences

In [75]:
%%sql
create table associations_differences as select
a.station_id, 
a.ap_1,
a.ap_2,
abs(a.t2 - a.t1) as time_difference
from associations_cross a;

 * sqlite:///associations.db
Done.


[]

In [None]:
# sanity check

In [76]:
%%sql

SELECT * FROM associations_differences LIMIT 5; 

 * sqlite:///associations.db
Done.


station_id,ap_1,ap_2,time_difference
1025503,412914,412744,39428.708802
1025503,412914,412840,64031.209602
1025503,412914,412870,63917.351374
1025503,412914,412870,64100.326451
1025503,412914,412915,66805.02612


In [None]:
# join instances where ap_1 == ap_1 AND ap_2 == ap_2
# add col for count
# add col for sum of time_stamps

In [77]:
%%sql
create table associations_results as select
ap_1,
ap_2,
sum(time_difference) as total_time,
count(*) as frequency
from associations_differences
group by
ap_1,
ap_2;

 * sqlite:///associations.db
Done.


[]

In [None]:
# sanity check

In [78]:
%%sql

SELECT * FROM associations_results LIMIT 5; 

 * sqlite:///associations.db
Done.


ap_1,ap_2,total_time,frequency
1106,412046,8666.911969999994,2
1106,412126,15732.078110000002,4
1106,412130,22974.635945999988,6
1106,412153,24848.284046000008,6
1106,412329,8791.733820000001,2


In [None]:
# export csv or zip file if too big

In [80]:
result = %sql SELECT * FROM associations_results
result_df = result.DataFrame()

 * sqlite:///associations.db
Done.


In [None]:
cwd = os.getcwd()
path = cwd + "/associations_times.csv"
results_df.to_csv(path)