In [1]:
import os
import pandas as pd
import numpy as np
import datetime
import itertools
from library.correlationAndTtestLib import *

In [17]:
def time_zone_cal(s):
    '''
    @param s: a string of format "hh:mm:ss"
    
    @return: a string denoting the timezone
    '''
    #print(s)
    try:
        hour=int(s.split(':')[0])
    except:
        hour=13 #for nan set to afternoon

    if 6<hour<=9:
        time_zone='Early_Morning'
    elif 9<hour<=12:
        time_zone='Morning'
    elif 12<hour<=17:
        time_zone='Afternoon'
    elif 17<hour<=23:
        time_zone='Evening'
    return time_zone

# converts hour:minute:seconds to seconds
def time_as_int(timestr):
    '''
    @param timestr: a string of format "hh:mm:ss"
    
    @return: an integer denoting the time in seconds
    '''
    hh, mm, ss = timestr.split(":")
    time_int = (int(hh)*3600)+(int(mm)*60)+int(ss)
    return time_int

def get_day_of_week(date_string, seperator="/", date_format="mm/dd/yyyy"):
    '''
    @param date_string: a string denoting a date
    @param seperator (optional): a character which acts as seperator in the date_string. Default = "/"
    @param date_format (optional): a string which denotes the format the date string is. Default = "mm/dd/yyyy"
                                Supported values are : {"dd/mm/yyyy", "mm/dd/yyyy"}
    
    @return: a string denoting the day of week corressponding to the date denoted by date_string
    '''
    date_format_idxs = {'mm/dd/yyyy':{'date_idx': 1, 'month_idx': 0, 'year_idx': 2},
                        'dd/mm/yyyy':{'date_idx': 0, 'month_idx': 1, 'year_idx': 2}}
    try:
        idx_dict = date_format_idxs[date_format]
        date_arr = [int(val) for val in date_string.split(seperator)]
        weekdays = {0:"Monday", 1:"Tuesday", 2:"Wednesday", 3:"Thursday", 4:"Friday", 5:"Saturday", 6:"Sunday"}
        date = datetime.datetime(date_arr[idx_dict['year_idx']],\
                                 date_arr[idx_dict['month_idx']],\
                                 date_arr[idx_dict['date_idx']])
        return weekdays[date.weekday()]
    except KeyError:
        raise ValueError(f"{date_format} is not supported. Supported values are: {list(date_format_idxs.keys())}.")

In [18]:
zones = ["junct_mall", "prantika_bus_stand"]

In [19]:
log_dir = r"../reports/54ft/"
instance_wise_scores_dir = "instance_wise_scores"

dates = ["14_09_2019", "15_09_2019", "16_09_2019", "17_09_2019", "20_09_2019", "21_09_2019", "23_09_2019"]

## 1

In [20]:
zone_timezone_analysis_dfs = []
for zone in zones:
    path = os.path.join(log_dir, zone, f"time_zone_analysis/score_timezone_analysis_{zone}_zone.csv")
    zone_timezone_analysis_dfs.append(pd.read_csv(path))

In [21]:
array_1, array_2 = zone_timezone_analysis_dfs[0]['Whole_Day'], zone_timezone_analysis_dfs[1]['Whole_Day']
zonal_welch_t_result = welch_t_test(array_1, array_2)
print(f"Welch's (t_test_stat, p_value) for zones {zones}: {zonal_welch_t_result}")

Welch's (t_test_stat, p_value) for zones ['junct_mall', 'prantika_bus_stand']: (6.9628689398734664, 2.0936879865449384e-05)


## 2.A

In [22]:
score_dfs = {}

for zone in zones:
    for date in dates:
        score_dfs[zone] = score_dfs.get(zone, {})
        score_dfs[zone][date] = pd.read_csv(os.path.join(log_dir, zone, 
                                                      instance_wise_scores_dir, 
                                                      f"{date}.csv"))

In [23]:
# test
for zone in score_dfs.keys():
    for date in dates:
        temp_df = score_dfs.get(zone).get(date)
        temp_df['Date'] = [date for i in range(len(temp_df))]

In [24]:
joined_dfs = {}
for zone in score_dfs.keys():
    temp_df = pd.concat(score_dfs[zone].values(), ignore_index=True)
    joined_dfs[zone] = temp_df
    print(len(temp_df))

53
48


In [25]:
for temp_df in joined_dfs.values():
    temp_df['Timezone'] = temp_df['start_time'].apply(lambda x : time_zone_cal(x))
    temp_df['Day'] = temp_df['Date'].apply(lambda x : get_day_of_week(x, seperator="_", date_format="dd/mm/yyyy"))
#     print(temp_df)

In [26]:
for zone in joined_dfs.keys():
    print(f"For {zone}: ")
    grouped = joined_dfs[zone].groupby(["Date", "Timezone"])
    for date in dates:
        print(f"\tFor {date}:")
        welch_test_pair = []
        for pair in grouped.groups.keys():
            if date in pair:
                welch_test_pair.append(pair)
                if len(welch_test_pair) == 2:
                    break
        if not len(welch_test_pair) < 2:
            pair_1, pair_2 = welch_test_pair
            array_1 = list(grouped.get_group(pair_1)['score'])
            array_2 = list(grouped.get_group(pair_2)['score'])
            
            print(f"\t\t{pair_1[1]}, {pair_2[1]}: {len(array_1)},{len(array_2)}")
            print(f"\t\t{pair_1[1]}, {pair_2[1]}:"\
                +f"{welch_t_test(array_1, array_2)}")
        else:
            print(f"\t\tOnly {len(welch_test_pair)} timezone available:\t{welch_test_pair}.")
        print()

For junct_mall: 
	For 14_09_2019:
		Afternoon, Morning: 2,3
		Afternoon, Morning:(1.0, 0.42264973081037427)

	For 15_09_2019:
		Only 1 timezone available:	[('15_09_2019', 'Afternoon')].

	For 16_09_2019:
		Afternoon, Evening: 7,2
		Afternoon, Evening:(0.6868028197434451, 0.6029039678713222)

	For 17_09_2019:
		Afternoon, Evening: 3,2
		Afternoon, Evening:(-1.8410785575880018, 0.1660703195925413)

	For 20_09_2019:
		Afternoon, Evening: 6,1
		Afternoon, Evening:(nan, nan)

	For 21_09_2019:
		Afternoon, Evening: 4,1
		Afternoon, Evening:(nan, nan)

	For 23_09_2019:
		Afternoon, Evening: 4,1
		Afternoon, Evening:(nan, nan)

For prantika_bus_stand: 
	For 14_09_2019:
		Only 1 timezone available:	[('14_09_2019', 'Afternoon')].

	For 15_09_2019:
		Only 1 timezone available:	[('15_09_2019', 'Afternoon')].

	For 16_09_2019:
		Only 1 timezone available:	[('16_09_2019', 'Afternoon')].

	For 17_09_2019:
		Only 1 timezone available:	[('17_09_2019', 'Afternoon')].

	For 20_09_2019:
		Only 1 timezone 

## 2.B

In [27]:
a, b = joined_dfs.keys()
a_group = joined_dfs[a].groupby(["Date", "Timezone"])
b_group = joined_dfs[b].groupby(["Date", "Timezone"])
for timezone in set(joined_dfs[zone]['Timezone']):
    print(f"For {timezone}:\n")
    for date in dates:
        print(f"\tFor {date}:")
        pair = (date, timezone)
        a_exists = pair in a_group.groups.keys()
        b_exists = pair in b_group.groups.keys()
        if(a_exists or b_exists):
            if(a_exists and b_exists):
                a_array = list(a_group.get_group(pair)['score'])
                b_array = list(b_group.get_group(pair)['score'])
                print(f"\t\t{a}, {b}: {len(a_array)},{len(b_array)}")
                print(f"\t\t\t{welch_t_test(a_array, b_array)}")
            else:
                print(f"\t\tFor pair: {pair}, only available: " + a if a_exists else b)
        else:
            print("\t\tNothing found!")
    print()

For Afternoon:

	For 14_09_2019:
		junct_mall, prantika_bus_stand: 2,2
			(1.6666666666666667, 0.3440417392452613)
	For 15_09_2019:
		junct_mall, prantika_bus_stand: 5,2
			(3.9999999999999996, 0.01613008990009254)
	For 16_09_2019:
		junct_mall, prantika_bus_stand: 7,11
			(3.9262435111551293, 0.002270496890740338)
	For 17_09_2019:
		junct_mall, prantika_bus_stand: 3,11
			(0.7513176183148906, 0.5092833464614165)
	For 20_09_2019:
		junct_mall, prantika_bus_stand: 6,4
			(1.0744680507152484, 0.315703633645757)
	For 21_09_2019:
		junct_mall, prantika_bus_stand: 4,9
			(4.899086389260638, 0.0011951492036522944)
	For 23_09_2019:
		junct_mall, prantika_bus_stand: 4,9
			(2.3206768503158806, 0.09260893211811398)

