<a href="https://colab.research.google.com/github/shayesteh99/Tehran-Traffic/blob/main/create_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torchvision
from tqdm.auto import tqdm, trange
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from IPython import display
import torch.backends.cudnn as cudnn
import random

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU, the
    current CUDA device and all CUDA devices), and switches cuDNN to its
    deterministic mode.

    Args:
        seed: Integer seed handed to each generator.
    """
    # Pure-Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then the CUDA ones.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ask cuDNN to pick deterministic algorithms.
    cudnn.deterministic = True

In [2]:
# Mount Google Drive into the Colab runtime at /content/gdrive so that the
# project files stored on Drive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
%cd 'gdrive/MyDrive/Neshan2'
%ls

/content/gdrive/.shortcut-targets-by-id/1kb7msLseHDgbNcoeHcAexfHINwu2zRER/Neshan2
check_neighbors_average.ipynb
[0m[01;34mClassification[0m/
[01;34mDATA[0m/
find_common_days.ipynb
find_good_segments.ipynb
find_mapped_good_neighors.ipynb
find_neighbors.ipynb
find_shorter_neighbors.ipynb
get_good_segment-information.ipynb
get_important_good_segment_information.ipynb
get_mapped_good_segment_information.ipynb
[01;34mGOOD_DATA[0m/
[01;34mGOOD_IMPORTANT_DATA[0m/
good_important_segment_information.csv
[01;34mGOOD_MAPPED_DATA[0m/
good_mapped_segment_information.csv
good_segment_information.csv
Important_segment.csv
just_good_neighbors_1000.csv
just_good_neighbors_1500.csv
make_mapped_good_days.ipynb
[01;34mMAPPED[0m/
mapped_good_data.csv
mapped_good_igraph.ipynb
mapped_good_neighors.csv
mapped_important_good_neighors.csv
mapped_segment.csv
[01;34mNEIGHBOR[0m/
neighbors_1000.csv
neighbors_750.csv
new_data_df.csv
[01;34mPP[0m/
[01;34mRegression[0m/
segment1.txt
segment_locatii

In [4]:
def convert_time_to_float(time):
  """Encode a quarter-hour index as a point on the unit circle.

  The day is divided into 24 * 4 quarter-hour slots; `time` is mapped to the
  angle covering the same fraction of a full turn, so slot 0 and slot 96
  produce the same encoding.

  Args:
    time: Quarter-hour index within the day.

  Returns:
    A `(sin, cos)` tuple of the corresponding angle.
  """
  quarters_per_day = 24 * 4
  fraction_of_day = time / quarters_per_day
  radians = fraction_of_day * 2 * math.pi
  return math.sin(radians), math.cos(radians)

In [5]:
def label_quarter(row):
  """Return the quarter-hour slot of the day for a record.

  Args:
    row: Mapping-like record whose 'date_time' entry is a string of the form
      '<date> <HH>:<MM>[:...]' (whitespace-separated date and clock time).

  Returns:
    `4 * hour + minute // 15`, i.e. 0 for 00:00-00:14 up to 95 for
    23:45-23:59.
  """
  # Second whitespace-separated token is the clock time.
  clock = row['date_time'].split()[1]
  fields = clock.split(':')
  return 4 * int(fields[0]) + int(fields[1]) // 15

In [6]:
def get_low_variance_segments(df, threshold):
  """Find the segments whose speed barely changes.

  Args:
    df: DataFrame with a segment key in column 'Unnamed: 0' and a 'speed'
      column.
    threshold: Exclusive upper bound on the per-segment speed variance.

  Returns:
    NumPy array with the keys of all segments whose speed variance is
    strictly below `threshold`. Segments whose variance is NaN are not
    returned, because the comparison is False for them.
  """
  speed_variance = df.groupby('Unnamed: 0')['speed'].var()
  is_low = speed_variance < threshold
  return speed_variance.loc[is_low].index.to_numpy()

In [7]:
def filter_neighbors(neighbors, low_vars):
  """Drop every neighbor row that involves a low-variance segment.

  A row is removed when its 'segment_id', or the value in any column from
  the third column onwards, is contained in `low_vars`.

  Args:
    neighbors: DataFrame with a 'segment_id' column; all columns from
      position 2 on are checked against `low_vars` as well.
    low_vars: Collection of segment ids to exclude.

  Returns:
    The remaining rows of `neighbors`, in their original order.
  """
  keep = ~neighbors['segment_id'].isin(low_vars)
  for column in neighbors.columns[2:]:
    keep = keep & ~neighbors[column].isin(low_vars)
  return neighbors.loc[keep]

In [8]:
# NOTE(review): not referenced by any other cell visible in this notebook.
number_of_neighbors = 7
# `2 * k` neighbor slots are used per segment, both for the output column
# names and when the samples are assembled.
k = 3

In [9]:
# Neighbor table, one row per segment. The dataset-building cell reads each
# row by position: index 1 is the segment id and the neighbor fields start at
# index 2.
neighbors_df = pd.read_csv("Shayesteh/neighbors/graph_neighbors_3.csv")
print(neighbors_df)

      Unnamed: 0        seg_id  ...  out_graph_dist3  out_uc_dist3
0              0  462727807000  ...                2   1027.332955
1              1  482388476002  ...                3   5722.362624
2              2  482388476003  ...                3   5494.900363
3              3  472634491000  ...                3   4667.278650
4              4  472634491001  ...                3   4388.200770
...          ...           ...  ...              ...           ...
6678        6678    4434034014  ...                3   5738.811114
6679        6679    4434034015  ...                3   6068.638233
6680        6680    4434034016  ...                3   6433.824524
6681        6681    4434034018  ...                3   5186.788216
6682        6682  687487066003  ...                3   3973.901483

[6683 rows x 20 columns]


In [10]:
  # Column layout of the generated dataset: the segment id, the cyclic time
  # encoding and the 'case' value, then four fields for each of the 2 * k
  # neighbor slots (numbered from 1), and finally the label.
  columns = ['seg_id', 'time_sin', 'time_cos', 'case']
  for slot in range(1, 2 * k + 1):
    columns.extend(f'{prefix}{slot}' for prefix in ('nei_id', 'speed', 'graph_dist', 'dist'))
  columns.append('label')
  print(columns)

['seg_id', 'time_sin', 'time_cos', 'case', 'nei_id1', 'speed1', 'graph_dist1', 'dist1', 'nei_id2', 'speed2', 'graph_dist2', 'dist2', 'nei_id3', 'speed3', 'graph_dist3', 'dist3', 'nei_id4', 'speed4', 'graph_dist4', 'dist4', 'nei_id5', 'speed5', 'graph_dist5', 'dist5', 'nei_id6', 'speed6', 'graph_dist6', 'dist6', 'label']


In [11]:
# Positional layout of one `neighbors_df` row as consumed below: index 1 holds
# the segment id and, starting at index 2, every neighbor occupies three
# consecutive values (neighbor id, graph distance, distance). The stride is
# the number of fields per neighbor; it is independent of `k` and merely
# happens to equal it when k == 3.
SEGMENT_ID_POS = 1
FIRST_NEIGHBOR_POS = 2
FIELDS_PER_NEIGHBOR = 3


def _first_speed_by_quarter(records):
  """Map every quarter index in `records` to the first speed listed for it."""
  lookup = {}
  for quarter, speed in zip(records['quarter'].to_numpy(), records['speed'].to_numpy()):
    # setdefault keeps the first occurrence if a quarter appears more than once.
    lookup.setdefault(quarter, speed)
  return lookup


def build_day_samples(day_df, neighbors_df, k):
  """Assemble the samples of one day.

  For every row of `neighbors_df` whose segment and all `2 * k` neighbors have
  records in `day_df`, one sample is emitted per quarter-hour that is present
  for the segment and every neighbor, provided the segment also has a record
  for the following quarter-hour (its speed becomes the label).

  Args:
    day_df: Records of one day with columns 'Unnamed: 0' (segment key),
      'quarter' (quarter-hour index) and 'speed'.
    neighbors_df: Neighbor table, read positionally (see the layout constants
      defined next to this function).
    k: Half the number of neighbor slots read per segment.

  Returns:
    List of samples; each sample is a list laid out as
    [segment id, time_sin, time_cos, segment speed at the quarter,
     (neighbor id, neighbor speed, graph distance, distance) * 2k,
     segment speed at the next quarter].
  """
  n_neighbors = 2 * k

  # One pass over the day's records instead of one full-frame scan per segment.
  quarters_by_segment = {}
  speeds_by_segment = {}
  for seg, seg_records in day_df.groupby('Unnamed: 0'):
    quarters_by_segment[seg] = seg_records['quarter'].to_numpy()
    speeds_by_segment[seg] = _first_speed_by_quarter(seg_records)

  samples = []
  for i in range(len(neighbors_df)):
    if i % 1000 == 0:
      print(i)
    record = neighbors_df.iloc[i].to_numpy()
    segment_id = record[SEGMENT_ID_POS]
    if segment_id not in quarters_by_segment:
      continue

    # Narrow the segment's quarters down to those shared with every neighbor.
    # Each step keeps the current neighbor's quarters (in its own order) that
    # survived so far, so the final order follows the last neighbor's records.
    intersect_times = quarters_by_segment[segment_id]
    neighbor_ids = []
    for j in range(n_neighbors):
      neighbor_id = record[FIRST_NEIGHBOR_POS + FIELDS_PER_NEIGHBOR * j]
      if neighbor_id not in quarters_by_segment:
        break
      neighbor_quarters = quarters_by_segment[neighbor_id]
      intersect_times = neighbor_quarters[np.isin(neighbor_quarters, intersect_times)]
      neighbor_ids.append(neighbor_id)
      if len(intersect_times) == 0:
        break

    # Skip segments with a missing neighbor or without any common quarter.
    if len(intersect_times) == 0 or len(neighbor_ids) != n_neighbors:
      continue

    segment_speeds = speeds_by_segment[segment_id]
    for quarter in intersect_times:
      next_quarter = quarter + 1
      # The label is the segment's speed one quarter later; without it there
      # is nothing to predict.
      if next_quarter not in segment_speeds:
        continue
      time_sin, time_cos = convert_time_to_float(quarter)
      # The fourth value (column 'case') is the segment's own current speed.
      sample = [segment_id, time_sin, time_cos, segment_speeds[quarter]]
      for j, neighbor_id in enumerate(neighbor_ids):
        base = FIRST_NEIGHBOR_POS + FIELDS_PER_NEIGHBOR * j
        sample.append(record[base])
        sample.append(speeds_by_segment[neighbor_id][quarter])
        sample.append(record[base + 1])
        sample.append(record[base + 2])
      sample.append(segment_speeds[next_quarter])
      samples.append(sample)
  return samples


for day in range(19, 20):
  day_df = pd.read_csv("GOOD_IMPORTANT_DATA/good_important_segment_day" + str(day) + ".csv")

  # Optional low-variance filtering (currently disabled):
  # low_var_segs = get_low_variance_segments(day_df, 100)
  # filtered_day_df = day_df.loc[~day_df['Unnamed: 0'].isin(low_var_segs)]
  # filtered_neighbors = filter_neighbors(neighbors_df, low_var_segs)

  # Replace the timestamp by its quarter-hour index within the day.
  day_df['quarter'] = day_df.apply(label_quarter, axis=1)
  day_df = day_df.drop(columns=['date_time'])

  dataset = build_day_samples(day_df, neighbors_df, k)

  dataset_df = pd.DataFrame(dataset, columns=columns)
  dataset_df.to_csv("Shayesteh/dataset/dataset_graph_3/dataset_" + str(day) + ".csv")
  print("len: ", len(dataset))
  print("day " + str(day) + " completed.")

0
1000
2000
3000
4000
5000
6000
len:  549529
day 19 completed.
