<a href="https://colab.research.google.com/github/claudiosegala/tcc/blob/master/project/tcc_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import google as g # To connect with google drive
g.colab.drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!pip install numpy
!pip install pandas
!pip install statsmodels



In [0]:
import pandas as pd # data manipulation library
import numpy as np # math library
import datetime as dt # to discover week day
import statsmodels as sm # statistical models
import statsmodels.api as sma # statistical models api
import time as tm
import matplotlib.pyplot as plt

In [0]:
TCC_FOLDER = '/content/drive/My Drive/TCC/'

In [0]:
DATASET_PATH = f"{TCC_FOLDER}dataset/all_data_sorted.csv"

In [0]:
col_names = [
  'Sensor',
  'Date',
  'Time',
  'Lane',
  'Speed',
  'Max Speed',
  'Size'
]

data = pd.read_csv(DATASET_PATH, ';', header=None, names=col_names)

In [0]:
data.head()

Unnamed: 0,Sensor,Date,Time,Lane,Speed,Max Speed,Size
0,RSI128,2016/05/01,00:00:09,1,26.0,60.0,0.0
1,RSI131,2016/05/01,00:00:09,2,20.0,60.0,1.1
2,RSI132,2016/05/01,00:00:09,1,45.0,60.0,0.0
3,RSI131,2016/05/01,00:00:10,1,40.0,60.0,0.5
4,RSI129,2016/05/01,00:00:12,1,35.0,60.0,0.0


In [0]:
data.describe()

Unnamed: 0,Lane,Speed,Max Speed,Size
count,10801780.0,10801780.0,10801781.0,10801780.0
mean,1.656316,36.54532,60.0,0.1049269
std,0.7029681,11.33506,0.0,0.8707685
min,1.0,1.0,60.0,0.0
25%,1.0,28.0,60.0,0.0
50%,2.0,37.0,60.0,0.0
75%,2.0,46.0,60.0,0.0
max,3.0,112.0,60.0,50.0


In [0]:
print(f"It contains {len(data['Sensor'])} entries\n\n")

It contains 10801781 entries




In [0]:
print(f"This dataset contains {len(set(data['Sensor']))} sensors.")

for val in set(data['Sensor']):
  print(f"Sensor {val} has {len(data[data['Sensor'] == val])}")

This dataset contains 8 sensors.
Sensor RSI032 has 2117820
Sensor RSI131 has 816219
Sensor RSI132 has 652998
Sensor RSI128 has 536879
Sensor RSI129 has 578652
Sensor RSI018 has 2029559
Sensor RSI017 has 1686900
Sensor RSI033 has 2382754


In [0]:
# Limit sensor usage
data = data[data['Sensor'] == 'RSI128']

In [0]:
# Remove unnecessary columns
data = data.drop(columns=['Sensor','Lane','Max Speed','Size'])

In [0]:
# Get datetime
data['Date'] = pd.to_datetime(data['Date'], format='%Y/%m/%d')

In [0]:
# Adjust type
f = lambda x : tm.strptime(x, '%H:%M:%S')
data['Time'] = data['Time'].apply(f)

g = lambda x : dt.timedelta(hours=x.tm_hour,minutes=x.tm_min,seconds=x.tm_sec).total_seconds()
data['Time'] = data['Time'].apply(g)

h = lambda x : int(x)
data['Time'] = data['Time'].apply(h)

In [0]:
# Create week day from date
j = lambda x : x.weekday()
data['WeekDay'] = data['Date'].apply(j)

In [0]:
data.head()

Unnamed: 0,Date,Time,Speed,WeekDay
0,2016-05-01,9,26.0,6
7,2016-05-01,18,32.0,6
39,2016-05-01,104,16.0,6
108,2016-05-01,206,22.0,6
160,2016-05-01,305,25.0,6


In [0]:
data.describe()

Unnamed: 0,Time,Speed,WeekDay
count,536879.0,536879.0,536879.0
mean,51715.214387,31.078891,2.951632
std,17952.214406,7.168803,1.959146
min,3.0,2.0,0.0
25%,38412.5,26.0,1.0
50%,52364.0,31.0,3.0
75%,65757.0,35.0,5.0
max,86399.0,67.0,6.0


In [0]:
for col, cont in data.iteritems():
  print(f"Column {col} has {cont.isnull().sum()} null elements")
  print(f"Column {col} has {cont.isna().sum()} nan elements")

Column Date has 0 null elements
Column Date has 0 nan elements
Column Time has 0 null elements
Column Time has 0 nan elements
Column Speed has 0 null elements
Column Speed has 0 nan elements
Column WeekDay has 0 null elements
Column WeekDay has 0 nan elements


In [0]:
start = data['Date'].min()
end = data['Date'].max()

print(f"\nThis data is from <{start}> to <{end}>. {(end - start).days + 1} days.\n")
print(f"It contains {len(data['Date'])} entries\n\n")


This data is from <2016-05-01 00:00:00> to <2016-07-31 00:00:00>. 92 days.

It contains 536879 entries




In [0]:
data.to_csv(f"{TCC_FOLDER}dataset/dataset.csv", ";", index=False)

In [0]:
def get_day_size (flow_interval):
  return (24 * 60 * 60) // flow_interval  

In [0]:
def get_week_size (flow_interval):
  return (7 * 24 * 60 * 60) // flow_interval  

In [0]:
def get_flow_data(n, accSpeed, weekDay):
  avgSpeed = (accSpeed / n) if n else 0
  density = (n / avgSpeed) if avgSpeed else 0
  w = [(1 if weekDay == i else 0) for i in range(7)] # weekday
  
  return (n, density, avgSpeed, w[0], w[1], w[2], w[3], w[4], w[5], w[6])

In [0]:
def get_flow (data, flow_interval):
  date = np.asarray(data['Date'])
  weekDay = np.asarray(data['WeekDay'])
  time = np.asarray(data['Time'])
  speed = np.asarray(data['Speed'])
  
  dateControl = date[0]
  timeBlock = flow_interval
  countFlow = 0
  accSpeed = 0
  flowData = []

  for i in range(len(date)):
    if time[i] >= timeBlock: # init a new time block
      flowData.append(get_flow_data(countFlow, accSpeed, weekDay[i])) 
      timeBlock += flow_interval
      accSpeed = 0
      countFlow = 0
      
    if date[i] > dateControl: # reset on day change
      dateControl = date[i]
      timeBlock = flow_interval 
      countFlow = 0
      accSpeed = 0
      
    if time[i] < timeBlock: # add car on flow
      countFlow += 1
      accSpeed += speed[i]

  day_size = get_day_size(flow_interval)
  k = (day_size - (len(flowData) % day_size)) % day_size

  for i in range(k):
    flowData.append(get_flow_data(0, 0, weekDay[len(date) - 1])) 
      
  cols = [
    'Flow',
    'Density',
    'AveSpeed',
    'Sunday',
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
  ]
  
  flowData = pd.DataFrame(flowData, columns=cols)
  
  # from sklearn.preprocessing import MinMaxScaler
  # scaler = MinMaxScaler(feature_range=(0,1))
  # flowDataScaled = scaler.fit_transform(flowData)  
  # flowData = pd.DataFrame(flowDataScaled, columns=flowData.columns, index=flowData.index)
  
  return flowData

In [0]:
def plot_flow_decomposition(flow_series, freq, flow_interval):
  path = f"{TCC_FOLDER}plots/flow/seasonal_decompose_{flow_interval}"
  
  decompose = sm.tsa.seasonal.seasonal_decompose
  decomposition = decompose(flow_series, model='additive', freq=freq)
  fig = decomposition.plot()

  plt.rcdefaults()
  
  plt.savefig(path + ".png")
  # plt.savefig(path + ".pdf")
    
  plt.close('all')

In [0]:
def plot_flow(flow_series, flow_interval):
  """ Plot of Flow
  
  Plot the flow from week to week
  
  Arguments:
    flow_series: an array of flows
    flow_interval: the interval in which the flow was made
  """

  week_size = get_week_size(flow_interval)
  n = len(flow_series) // week_size

  if len(flow_series) % week_size == 0:
    print('Yey')

  for i in range(n):
    s = week_size * i
    e = min(s + week_size, len(flow_series))
    path = f"{TCC_FOLDER}plots/flow/flow_{flow_interval}_week_{str(i+1).zfill(2)}"

    plt.plot(flow_series[s:e])

    plt.title(f"Fluxo (Intervalo de {flow_interval} segundos) - Semana {i+1}")
    plt.ylabel('Fluxo')
    plt.xlabel('Tempo')
    plt.rcdefaults()
    
    plt.savefig(path + ".png", bbox_inches='tight')
    # plt.savefig(path + ".pdf")
    
    plt.close('all')

In [0]:
flows_intervals = [150, 300, 450, 900]

for flow_interval in flows_intervals:
  flow_data = get_flow(data, flow_interval)
  week_size = get_week_size(flow_interval)

  plot_flow(flow_data['Flow'], flow_interval)
  plot_flow_decomposition(flow_data['Flow'], week_size, flow_interval)
  print(flow_data.head(), end="\n\n")
  print(flow_data.describe(), end="\n\n")

  flow_data.to_csv(f"{TCC_FOLDER}dataset/dataset_flow_{flow_interval}.csv", ";", index=False)

   Flow   Density   AveSpeed  Sunday  ...  Wednesday  Thursday  Friday  Saturday
0     3  0.121622  24.666667       0  ...          0         0       0         1
1     1  0.045455  22.000000       0  ...          0         0       0         1
2     2  0.088889  22.500000       0  ...          0         0       0         1
3     3  0.138462  21.666667       0  ...          0         0       0         1
4     7  0.206751  33.857143       0  ...          0         0       0         1

[5 rows x 10 columns]

               Flow       Density  ...        Friday      Saturday
count  52992.000000  52992.000000  ...  52992.000000  52992.000000
mean       9.917837      0.322920  ...      0.141059      0.153665
std        8.456189      0.269372  ...      0.348086      0.360630
min        0.000000      0.000000  ...      0.000000      0.000000
25%        1.000000      0.055556  ...      0.000000      0.000000
50%        9.000000      0.297674  ...      0.000000      0.000000
75%       17.000000  