In [1]:
import os
import glob
import pandas as pd
import numpy as np
import datetime
import itertools
from library.scoreCalculatorLibrary import calculate_score
from library.correlationAndTtestLib import *

In [2]:
def time_zone_cal(s):
    '''
    Map a clock time to the named part of the day ("time zone") it falls in.
    Only the hour field is inspected:
        hour 7-9   -> 'Early_Morning'
        hour 10-12 -> 'Morning'
        hour 13-17 -> 'Afternoon'
        hour 18-23 -> 'Evening'

    @param s: a string of format "hh:mm:ss"
    
    @return: a string denoting the timezone
    @raise ValueError: if the hour is not an integer, or lies outside 7..23
                       (no time zone is defined for those hours)
    '''
    hour = int(s.split(':')[0])

    if 6 < hour <= 9:
        return 'Early_Morning'
    if 9 < hour <= 12:
        return 'Morning'
    if 12 < hour <= 17:
        return 'Afternoon'
    if 17 < hour <= 23:
        return 'Evening'

    # Previously this fell through to `return time_zone` with the name unbound,
    # which surfaced as an opaque UnboundLocalError for e.g. "05:30:00".
    raise ValueError(f"No time zone is defined for hour {hour} (from '{s}'). Supported hours are 7 to 23.")

# converts hour:minute:seconds to seconds
def time_as_int(timestr):
    '''
    Convert a clock time into the number of seconds elapsed since "00:00:00".

    @param timestr: a string of format "hh:mm:ss"
    
    @return: an integer denoting the time in seconds
    '''
    hours, minutes, seconds = timestr.split(":")
    return int(hours) * 3600 + int(minutes) * 60 + int(seconds)

def get_day_of_week(date_string, seperator="/", date_format="mm/dd/yyyy"):
    '''
    Return the English weekday name ("Monday" ... "Sunday") of a date given as text.

    @param date_string: a string denoting a date
    @param seperator (optional): the character separating the fields of date_string. Default = "/"
    @param date_format (optional): the order of the fields in date_string. Default = "mm/dd/yyyy"
                                Supported values are : {"dd/mm/yyyy", "mm/dd/yyyy"}
    
    @return: a string denoting the day of week corresponding to the date denoted by date_string
    @raise ValueError: if date_format is not one of the supported values
    '''
    # Position of each field inside the split date_string, per supported layout.
    date_format_idxs = {'mm/dd/yyyy': {'date_idx': 1, 'month_idx': 0, 'year_idx': 2},
                        'dd/mm/yyyy': {'date_idx': 0, 'month_idx': 1, 'year_idx': 2}}
    try:
        field_positions = date_format_idxs[date_format]
    except KeyError:
        raise ValueError(f"{date_format} is not supported. Supported values are: {list(date_format_idxs.keys())}.")

    fields = [int(piece) for piece in date_string.split(seperator)]
    parsed = datetime.datetime(
        fields[field_positions['year_idx']],
        fields[field_positions['month_idx']],
        fields[field_positions['date_idx']],
    )
    # datetime.weekday() numbers the days from Monday = 0 to Sunday = 6.
    weekday_names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
    return weekday_names[parsed.weekday()]

In [3]:
# Zone to analyse: keep exactly one of the assignments below active.
# `name` is interpolated into the report folder and into the input/output CSV file names.
# name = "prantika_bus_stand"
# name = "junction_mall"
name = "54ft_road"
# name = "dvc_more"
# name = "station"

In [4]:
# Folder the merged predictions CSV is read from.
parent_dir = r"../data/54ft/"
# Folder the reports for the selected zone are written to.
log_dir = f"../reports/54ft/{name}"

# Input file (inside parent_dir) holding the merged predictions for the selected zone.
results_csv_name = f"Predictions_merged_54ft_{name}_zone.csv"

# Output file for the across-date / across-time-zone Welch results.
welch_output = f"across_date_across_timezone_analysis_{name}_zone.csv"

# Output file for the per-date, per-time-zone score table.
score_timezone_analysis_output = f"score_timezone_analysis_{name}_zone.csv"

# Labels produced by time_zone_cal(), in chronological order; also the column order of the score table.
time_zones = ["Early_Morning", "Morning", "Afternoon", "Evening"]

# Sub-folder of log_dir that receives the outputs of this notebook.
timezone_analysis_name = "time_zone_analysis"

In [5]:
# os.makedirs() creates missing parent folders too, so a single call on the
# nested path yields both log_dir and its time zone analysis sub-folder.
os.makedirs(os.path.join(log_dir, timezone_analysis_name), exist_ok=True)

In [6]:
# Load the merged predictions. Later cells read its 'instance_date',
# 'instance_start_time' and 'instance_end_time' columns.
result_df = pd.read_csv(os.path.join(parent_dir, results_csv_name))

In [7]:
# Work on a copy so result_df keeps the values exactly as loaded.
df = result_df.copy()
# Bucket first: time_zone_cal() needs the "hh:mm:ss" string, which the
# conversion to seconds on the next line overwrites.
df['time_zone'] = df['instance_start_time'].apply(time_zone_cal)
df["instance_start_time"] = df["instance_start_time"].apply(time_as_int)
df["instance_end_time"] = df["instance_end_time"].apply(time_as_int)

In [8]:
date_group = df.groupby('instance_date')

# Results of calculate_score() for each whole day, keyed by the 'instance_date' value.
whole_day_scores, whole_day_list_scores, whole_day_instance_counts = {}, {}, {}

for date in date_group.groups:
    # calculate_score() is defined in library.scoreCalculatorLibrary.py;
    # it is handed a copy of the day's rows.
    temp_df = date_group.get_group(date).copy()
    overall_score, list_scores, no_of_instances = calculate_score(temp_df)
    whole_day_scores[date], whole_day_list_scores[date], whole_day_instance_counts[date] = (
        overall_score,
        list_scores,
        no_of_instances,
    )

In [9]:
# One row per date: the dict keys become the index, which reset_index() then
# turns into an ordinary column that is renamed to "Date".
whole_day_df = (
    pd.DataFrame.from_dict(whole_day_scores, orient="index", columns=["Whole_Day"])
    .reset_index()
)
whole_day_df.columns = ["Date", "Whole_Day"]
whole_day_df

Unnamed: 0,Date,Whole_Day
0,09/14/2019,0.377964
1,09/15/2019,-0.174136
2,09/16/2019,1.069045
3,09/17/2019,0.0
4,09/20/2019,0.0
5,09/21/2019,1.069045
6,09/23/2019,0.727607


In [10]:
# Split the instances by date and time zone; each group key is an
# (instance_date, time_zone) tuple.
date_timezone_groups = df.groupby(["instance_date", "time_zone"])
date_timezone_pairs = list(date_timezone_groups.groups)

In [11]:
# Results of calculate_score() for each (date, time zone) group, keyed by that pair.
date_timezone_scores, overall_scores, instance_counts = {}, {}, {}

for pair in date_timezone_pairs:
    date, timezone = pair
    temp_df = date_timezone_groups.get_group(pair)
    # calculate_score() is defined in library.scoreCalculatorLibrary.py
    overall_score, list_scores, no_of_instances = calculate_score(temp_df)
    date_timezone_scores[pair], overall_scores[pair], instance_counts[pair] = (
        list_scores,
        overall_score,
        no_of_instances,
    )

In [12]:
# Dates in first-seen order, each with the time zones that have a score for it.
dates = {}
for date, timezone in overall_scores:
    dates.setdefault(date, []).append(timezone)

# One column per time zone; "NA" marks a (date, time zone) combination without a score.
result_dict = {"Date": list(dates)}
for timezone in time_zones:
    result_dict[timezone] = [overall_scores.get((date, timezone), "NA") for date in dates]

result = pd.DataFrame.from_dict(result_dict)
# whole_day_df shares only the "Date" column, so the merge joins on it.
result = result.merge(whole_day_df)
result["Day"] = result["Date"].apply(get_day_of_week)
result

Unnamed: 0,Date,Early_Morning,Morning,Afternoon,Evening,Whole_Day,Day
0,09/14/2019,,0.0,0.57735,,0.377964,Saturday
1,09/15/2019,,,-0.20017,0.0,-0.174136,Sunday
2,09/16/2019,,0.57735,0.707107,0.57735,1.069045,Monday
3,09/17/2019,,-0.707107,0.816497,0.0,0.0,Tuesday
4,09/20/2019,,,-0.333333,1.0,0.0,Friday
5,09/21/2019,,0.447214,0.377964,1.41421,1.069045,Saturday
6,09/23/2019,,-0.707107,1.414214,1.0,0.727607,Monday


In [13]:
# Save the per-date score table (one column per time zone plus Whole_Day and Day).
result.to_csv(os.path.join(log_dir, timezone_analysis_name, score_timezone_analysis_output), index=False)

## Welch's t-test

In [14]:
def form_welch_df(instance_scores, need_day = True):
    '''
    Run welch_t() on every pair formed from instance_scores and tabulate the results.

    @param instance_scores: a dictionary of format '{date: [instance_score_1, instance_score_2, ...]}'
    @param need_day (optional): when True, add a 'Day' column holding the weekday names of the two
                                members of each pair; the members must then be date strings that
                                get_day_of_week() accepts. Default = True
    
    @return: a pandas.DataFrame() object with columns '['Pair', 't_test_stat', 'p_value']',
             followed by 'Day' when need_day is True
    '''
    # welch_t() and form_pairs_with_scores() are defined in library.correlationAndTtestLib.py
    welch_results = welch_t(form_pairs_with_scores(instance_scores))

    welch_df = pd.DataFrame(columns=["Pair", "t_test_stat", "p_value"])
    welch_df["Pair"] = list(welch_results)
    welch_df["t_test_stat"] = [outcome[0] for outcome in welch_results.values()]
    welch_df["p_value"] = [outcome[1] for outcome in welch_results.values()]

    if not need_day:
        return welch_df

    welch_df["Day"] = welch_df['Pair'].apply(
        lambda pair: (get_day_of_week(pair[0]), get_day_of_week(pair[1]))
    )
    return welch_df


def form_timezone_instance_scores(timezone):
    '''
    Collect the per-instance scores of one time zone, grouped by date.
    Reads the notebook-level dictionaries common_timezone_dates,
    date_timezone_scores and instance_counts.

    @param timezone: any value from ["Early_Morning", "Morning", "Afternoon", "Evening"]
    
    @return: a dictionary of format "{date: [instance_score_1, instance_score_2, ...]}",
                 where each date is from common_timezone_dates of the corresponding "timezone" param.
                 The dictionary is empty when no date has instances in that time zone.
    @raise AssertionError: if the number of scores stored for a (date, timezone) pair
                           differs from the instance count recorded for it
    '''
    instance_scores = {}
    # A time zone without any instance has no entry in common_timezone_dates;
    # treat it as "no dates" instead of failing with a KeyError.
    for date in common_timezone_dates.get(timezone, []):
        key = (date, timezone)
        scores = date_timezone_scores[key]
        expected_count = instance_counts[key]

        assert len(scores) == expected_count, (
            f"Wrong length for {key}. Actual: {len(scores)} Expected: {expected_count}"
        )

        instance_scores[date] = scores
    return instance_scores

### 1. Inter dates common time zone

In [15]:
# For each time zone that occurs in the data, the dates having instances in it
# (in the order of date_timezone_pairs). Time zones without data get no entry.
common_timezone_dates = {}
for timezone in time_zones:
    dates_in_timezone = [key[0] for key in date_timezone_pairs if timezone in key]
    if dates_in_timezone:
        common_timezone_dates[timezone] = dates_in_timezone
# common_timezone_dates

In [16]:
# Per-date instance scores for each time zone analysed below.
# 'Early_Morning' is not built here; the score table above shows no Early_Morning values for this zone.
morning_instance_scores = form_timezone_instance_scores('Morning')
afternoon_instance_scores = form_timezone_instance_scores('Afternoon')
evening_instance_scores = form_timezone_instance_scores('Evening')

In [17]:
# Welch results for every pair of dates, using only their 'Morning' instance scores.
morning_welch_df = form_welch_df(morning_instance_scores)
morning_welch_df

Unnamed: 0,Pair,t_test_stat,p_value,Day
0,"(09/14/2019, 09/16/2019)",-0.377964,0.722788,"(Saturday, Monday)"
1,"(09/14/2019, 09/17/2019)",0.365729,0.728185,"(Saturday, Tuesday)"
2,"(09/14/2019, 09/21/2019)",-0.264135,0.799989,"(Saturday, Saturday)"
3,"(09/14/2019, 09/23/2019)",0.365729,0.728185,"(Saturday, Monday)"
4,"(09/16/2019, 09/17/2019)",0.767031,0.494199,"(Monday, Tuesday)"
5,"(09/16/2019, 09/21/2019)",0.161165,0.879525,"(Monday, Saturday)"
6,"(09/16/2019, 09/23/2019)",0.767031,0.494199,"(Monday, Monday)"
7,"(09/17/2019, 09/21/2019)",-0.735899,0.482205,"(Tuesday, Saturday)"
8,"(09/17/2019, 09/23/2019)",0.0,1.0,"(Tuesday, Monday)"
9,"(09/21/2019, 09/23/2019)",0.735899,0.482205,"(Saturday, Monday)"


In [18]:
# Welch results for every pair of dates, using only their 'Afternoon' instance scores.
afternoon_welch_df = form_welch_df(afternoon_instance_scores)
afternoon_welch_df

Unnamed: 0,Pair,t_test_stat,p_value,Day
0,"(09/14/2019, 09/15/2019)",0.525963,0.631082,"(Saturday, Sunday)"
1,"(09/14/2019, 09/16/2019)",0.109576,0.919073,"(Saturday, Monday)"
2,"(09/14/2019, 09/17/2019)",0.0,1.0,"(Saturday, Tuesday)"
3,"(09/14/2019, 09/20/2019)",0.589768,0.594375,"(Saturday, Friday)"
4,"(09/14/2019, 09/21/2019)",0.244339,0.820328,"(Saturday, Saturday)"
5,"(09/14/2019, 09/23/2019)",-0.22441,0.83672,"(Saturday, Monday)"
6,"(09/15/2019, 09/16/2019)",-0.605515,0.555427,"(Sunday, Monday)"
7,"(09/15/2019, 09/17/2019)",-0.709297,0.493386,"(Sunday, Tuesday)"
8,"(09/15/2019, 09/20/2019)",0.076047,0.940517,"(Sunday, Friday)"
9,"(09/15/2019, 09/21/2019)",-0.384218,0.707554,"(Sunday, Saturday)"


In [19]:
# Welch results for every pair of dates, using only their 'Evening' instance scores.
# NOTE(review): the saved output shows warnings and NaN statistics for some pairs -
# presumably dates with too few Evening instances to estimate a variance; confirm.
evening_welch_df = form_welch_df(evening_instance_scores)
evening_welch_df

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Pair,t_test_stat,p_value,Day
0,"(09/15/2019, 09/16/2019)",-0.27735,0.808768,"(Sunday, Monday)"
1,"(09/15/2019, 09/17/2019)",0.0,1.0,"(Sunday, Tuesday)"
2,"(09/15/2019, 09/20/2019)",,,"(Sunday, Friday)"
3,"(09/15/2019, 09/21/2019)",-1.0,0.5,"(Sunday, Saturday)"
4,"(09/15/2019, 09/23/2019)",,,"(Sunday, Monday)"
5,"(09/16/2019, 09/17/2019)",0.27735,0.808768,"(Monday, Tuesday)"
6,"(09/16/2019, 09/20/2019)",,,"(Monday, Friday)"
7,"(09/16/2019, 09/21/2019)",-1.0,0.42265,"(Monday, Saturday)"
8,"(09/16/2019, 09/23/2019)",,,"(Monday, Monday)"
9,"(09/17/2019, 09/20/2019)",,,"(Tuesday, Friday)"


### 2. Inter time zone common dates

In [20]:
# Per-date instance scores of each time zone compared within a day.
# No Early_Morning score dictionary is built in this notebook, so that zone is skipped.
scores_by_timezone = {
    'Morning': morning_instance_scores,
    'Afternoon': afternoon_instance_scores,
    'Evening': evening_instance_scores,
}

for i, row in result.iterrows():
    print(f"For {row['Date']}, {row['Day']}")
    day_instance_scores = {}
    for timezone in time_zones:
        # 'NA' marks a time zone without a score on this date.
        if row[timezone] == 'NA' or timezone == 'Early_Morning':
            continue
        # 0 is the "date missing" sentinel for the lookup.
        temp = scores_by_timezone[timezone].get(row['Date'], 0)
        if temp != 0:
            day_instance_scores[timezone] = temp
    # Pairs here are time zones, not dates, so no 'Day' column is requested.
    day_welch_df = form_welch_df(day_instance_scores, False)
    print(day_welch_df, end='\n\n')

For 09/14/2019, Saturday
                   Pair  t_test_stat   p_value
0  (Morning, Afternoon)    -0.377964  0.722788

For 09/15/2019, Sunday
                   Pair  t_test_stat   p_value
0  (Afternoon, Evening)    -0.066725  0.955494

For 09/16/2019, Monday
                   Pair  t_test_stat   p_value
0  (Morning, Afternoon)     0.109576  0.919073
1    (Morning, Evening)     0.000000  1.000000
2  (Afternoon, Evening)    -0.109576  0.919073

For 09/17/2019, Tuesday
                   Pair  t_test_stat   p_value
0  (Morning, Afternoon)    -1.044826  0.318646
1    (Morning, Evening)    -0.234772  0.846280
2  (Afternoon, Evening)     0.307148  0.798669

For 09/20/2019, Friday
                   Pair  t_test_stat  p_value
0  (Afternoon, Evening)          NaN      NaN

For 09/21/2019, Saturday
                   Pair  t_test_stat   p_value
0  (Morning, Afternoon)     0.089984  0.930353
1    (Morning, Evening)    -1.632993  0.177808
2  (Afternoon, Evening)    -2.121320  0.078141

For 09/

In [21]:
# time_zones

### 3. Across dates across timezones

In [22]:
# Every 2-combination of the (date, time_zone) groups that have scores.
# The following cells remove the same-date and same-time-zone pairs from this set.
all_across_date_across_timezone_pairs = set(itertools.combinations(date_timezone_scores.keys(), 2))
print(len(all_across_date_across_timezone_pairs))

153


In [23]:
# different date same timezone: re-key every per-time-zone Welch pair (date1, date2)
# as ((date1, zone), (date2, zone)) so it has the same shape as the all-pairs set.
inter_date_cross_timezone_pairs = {
    ((date1, zone), (date2, zone))
    for zone, zone_welch_df in (
        ('Morning', morning_welch_df),
        ('Afternoon', afternoon_welch_df),
        ('Evening', evening_welch_df),
    )
    for date1, date2 in zone_welch_df['Pair']
}

print(len(inter_date_cross_timezone_pairs))

46


In [24]:
# same date different timezone: keep the pairs whose two (date, time_zone)
# members carry the same date.
intra_date_cross_timezone = {
    pair
    for pair in all_across_date_across_timezone_pairs
    if pair[0][0] == pair[1][0]
}

print(len(intra_date_cross_timezone))

15


In [25]:
"""
If Set_whole = Set_A + Set_B + Set_C,
then, Set_C = Set_whole - Set_A - Set_B
"""

# Remove the same-date pairs and the same-time-zone pairs; what remains differs
# in both date and time zone.
# NOTE(review): the subtraction only removes a same-time-zone pair when both sets
# list its two members in the same order - confirm against form_pairs_with_scores().
across_date_across_timezone_pairs = all_across_date_across_timezone_pairs.difference(
    intra_date_cross_timezone,
    inter_date_cross_timezone_pairs,
)

len(across_date_across_timezone_pairs)

92

In [26]:
# Attach to each remaining pair the instance scores of its two (date, time_zone) groups.
across_date_across_timezone_instance_scores = {
    (first_group, second_group): (
        date_timezone_scores[first_group],
        date_timezone_scores[second_group],
    )
    for first_group, second_group in across_date_across_timezone_pairs
}

In [27]:
# welch_t() is defined in library.correlationAndTtestLib.py; each of its results is
# read as (t statistic, p-value) for the pair it is keyed by.
across_date_across_timezone_welch = welch_t(across_date_across_timezone_instance_scores)

across_date_across_timezone_welch_df = pd.DataFrame(columns=["Pair", "t_test_stat", "p_value"])
across_date_across_timezone_welch_df["Pair"] = list(across_date_across_timezone_welch)
across_date_across_timezone_welch_df["t_test_stat"] = [
    outcome[0] for outcome in across_date_across_timezone_welch.values()
]
across_date_across_timezone_welch_df["p_value"] = [
    outcome[1] for outcome in across_date_across_timezone_welch.values()
]
# Each pair member is a (date, time_zone) tuple, so the date is its first element.
across_date_across_timezone_welch_df['Day'] = across_date_across_timezone_welch_df["Pair"].apply(
    lambda pair: (get_day_of_week(pair[0][0]), get_day_of_week(pair[1][0]))
)

In [28]:
# Save the across-date / across-time-zone Welch results into the time zone analysis folder.
across_date_across_timezone_welch_df.to_csv(
    os.path.join(log_dir, timezone_analysis_name, welch_output),
    index=False,
)
# change csv name

across_date_across_timezone_welch_df

Unnamed: 0,Pair,t_test_stat,p_value,Day
0,"((09/16/2019, Morning), (09/23/2019, Afternoon))",-0.224410,0.836720,"(Monday, Monday)"
1,"((09/15/2019, Evening), (09/17/2019, Afternoon))",-0.307148,0.798669,"(Sunday, Tuesday)"
2,"((09/16/2019, Evening), (09/21/2019, Afternoon))",0.244339,0.820328,"(Monday, Saturday)"
3,"((09/14/2019, Morning), (09/15/2019, Evening))",0.000000,1.000000,"(Saturday, Sunday)"
4,"((09/20/2019, Afternoon), (09/21/2019, Evening))",-3.162278,0.013349,"(Friday, Saturday)"
5,"((09/14/2019, Morning), (09/21/2019, Evening))",-1.732051,0.181690,"(Saturday, Saturday)"
6,"((09/14/2019, Morning), (09/17/2019, Evening))",0.000000,1.000000,"(Saturday, Tuesday)"
7,"((09/20/2019, Afternoon), (09/23/2019, Evening))",,,"(Friday, Monday)"
8,"((09/14/2019, Morning), (09/20/2019, Afternoon))",0.164399,0.875441,"(Saturday, Friday)"
9,"((09/20/2019, Evening), (09/21/2019, Afternoon))",,,"(Friday, Saturday)"
