In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in names_in_folder]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Segment reports: indexed by "_id", with "updated_at" parsed as a timestamp.
segReport_df = pd.read_csv(
    "../input/traffic-flow-data-in-ho-chi-minh-city-viet-nam/segment_reports.csv",
    index_col="_id",
    parse_dates=["updated_at"],
)

# Segments: indexed by "_id", with both timestamp columns parsed.
segment_df = pd.read_csv(
    "../input/traffic-flow-data-in-ho-chi-minh-city-viet-nam/segments.csv",
    index_col="_id",
    parse_dates=["created_at", "updated_at"],
)

## Transformation magic

In [None]:
from math import ceil

def transform_LOS(segment_id, velocity):
    """
    Map a velocity measured on a segment to a level-of-service (LOS) label.

    The segment's "max_velocity" is looked up in the module-level segment_df
    and selects a velocity threshold: 45 when it is at least 70, 40 when it is
    at least 60, otherwise 35. A missing max_velocity is treated as 50.
    The shortfall (threshold - velocity, clamped at 0) is bucketed in steps
    of 5: no shortfall gives "A", (0, 5] gives "B", ... and anything above 20
    gives "F".

    @Params:
        segment_id: index label of the segment in segment_df
        velocity: measured velocity of the report
    @Return:
        str: one of "A", "B", "C", "D", "E", "F"
    """
    labels = ("A", "B", "C", "D", "E", "F")

    max_velocity = segment_df.loc[segment_id, "max_velocity"]
    # A missing CSV cell is loaded by pandas as NaN, not None, so an
    # `is None` test never fires; pd.isna covers None, NaN and pd.NA.
    if pd.isna(max_velocity):
        max_velocity = 50

    # Faster roads get a higher threshold before a report counts as congested.
    if max_velocity >= 70:
        threshold = 45
    elif max_velocity >= 60:
        threshold = 40
    else:
        threshold = 35

    # Velocities at or above the threshold have no shortfall and map to "A".
    shortfall = max(threshold - velocity, 0)
    # Cap the bucket index so very large shortfalls still map to "F".
    return labels[min(ceil(shortfall / 5), len(labels) - 1)]

def transform_report(row):
    """
    Derive the date, weekday, half-hour period name and LOS label of a report.

    @Params:
        row: report record supporting row["..."] access (e.g. a row of
             segReport_df) with the keys "segment_id", "velocity" and
             "updated_at" (Timestamp object of Pandas)
    @Return:
        tuple: (date, weekday, period_name, LOS) where date is dt.date(),
               weekday is dt.weekday() (0 = Monday), period_name has the form
               "period_{hour}_{00|30}" and LOS is the label returned by
               transform_LOS for the row's segment and velocity
    """
    LOS = transform_LOS(row["segment_id"], row["velocity"])
    dt = row["updated_at"]
    # Each hour is split into two periods: minutes 0-29 -> "00", 30-59 -> "30".
    half_hour = "00" if dt.minute < 30 else "30"
    p_name = f"period_{dt.hour}_{half_hour}"
    return dt.date(), dt.weekday(), p_name, LOS

## Do it!

In [None]:
# Transform every report once; each result is a (date, weekday, period, LOS) tuple.
transformed_reports = [transform_report(row) for _, row in segReport_df.iterrows()]

# Split the tuples into one list per derived column.
dates = [fields[0] for fields in transformed_reports]
weekdays = [fields[1] for fields in transformed_reports]
p_names = [fields[2] for fields in transformed_reports]
LOSes = [fields[3] for fields in transformed_reports]

# Attach the derived columns to the report frame.
for column_name, column_values in (
    ("date", dates),
    ("weekday", weekdays),
    ("period", p_names),
    ("LOS", LOSes),
):
    segReport_df[column_name] = column_values

## Dividing reports into periods can leave a single period with several LOS labels, so we mitigate this by assigning one representative (major) label per period

In [None]:
def major_voting(labels):
    """
    Return the most frequent label in `labels`.

    When several labels share the highest count, a warning line is printed
    and the one that appears first in `labels` is returned.

    @Params:
        labels: non-empty iterable of hashable labels
    @Return:
        the label with the highest number of occurrences
    @Raises:
        IndexError: if `labels` is empty
    """
    # Count in a single pass; dicts keep insertion order, so the keys stay in
    # first-seen order.
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1

    # Rank from most to least frequent. Sorting ascending and taking element 0
    # would select the *least* common label. The sort is stable, so tied
    # labels keep their first-seen order.
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    if len(ranked) > 1 and ranked[0][1] == ranked[1][1]:
        print("Oh no, many majors?")
    return ranked[0][0]

def mean_voting(labels):
    """
    Collapse several LOS labels into the label nearest to their mean.

    Labels are ranked "A" = 0 through "F" = 5, the ranks are averaged and the
    mean is rounded to the nearest rank. An exact half always rounds up,
    i.e. towards the worse label.

    @Params:
        labels: non-empty sized collection (e.g. list) of labels "A".."F"
    @Return:
        str: the label whose rank is the rounded mean
    @Raises:
        ZeroDivisionError: if `labels` is empty
        KeyError: if a label is not one of "A".."F"
    """
    ordered_labels = ["A", "B", "C", "D", "E", "F"]
    values = {label: rank for rank, label in enumerate(ordered_labels)}

    total = sum(values[label] for label in labels)
    count = len(labels)
    # Built-in round() rounds halves to the nearest even integer (0.5 -> 0,
    # 1.5 -> 2, 2.5 -> 2), which resolves ties inconsistently. Use exact
    # integer round-half-up instead: floor(total / count + 1/2).
    nearest_rank = (2 * total + count) // (2 * count)
    return ordered_labels[nearest_rank]

In [None]:
# Gather all LOS labels reported for the same segment, date, weekday and period
# into one list per group, with the group keys turned back into columns.
compress_LOS = (
    segReport_df
    .groupby(by=["segment_id", "date", "weekday", "period"])["LOS"]
    .apply(list)
    .reset_index()
)
# Reduce each group's list of labels to a single label.
compress_LOS["LOS"] = compress_LOS["LOS"].map(mean_voting)

## Now the data should be good (maybe)

In [None]:
# Bare last expression of the cell: the notebook renders compress_LOS as the cell output.
compress_LOS