In [1]:
import glob
import numpy as np
import pandas as pd 
import tqdm
from scipy import stats
from itertools import combinations,product

# Constants

In [2]:
# Root folder of the per-zone report directories (each zone has a DateWise/ sub-folder).
parent_dir = r"../reports/54ft/"

# Dates to compare. The trailing '*' is spliced into a glob pattern by the
# resource locator, so it selects every date file of a zone at once.
# Alternative window (last N days), kept for reference:
#date=['2019-09-14','2019-09-15','2019-09-16','2019-09-17','2019-09-20','2019-09-21','2019-09-23','*'] #last N days
date = [
    '2019-06-28',
    '2019-06-30',
    '2019-07-01',
    '2019-07-02',
    '2019-07-03',
    '2019-07-04',
    '2019-07-05',
    '*',
]  # first N days

# Part-of-day labels produced by time_zone_cal; '*' means "do not filter by time".
timezone = [
    'Early_Morning',
    'Morning',
    'Afternoon',
    'Evening',
    '*',
]

# Zone sub-directories under parent_dir.
# Per the original note, '54ft_test' stands for "All" zones.
zone = [
    '54ft_road',
    'dvc_more',
    'junction_mall',
    'prantika_bus_stand',
    'station',
    '54ft_test',
]

In [3]:
def time_zone_cal(s):
    """Map a time string to a named part of the day.

    Parameters
    ----------
    s : str
        Time string whose text before the first ':' is the hour
        (e.g. '07:15:00').

    Returns
    -------
    str or None
        'Early_Morning' for hours 7-9, 'Morning' for 10-12,
        'Afternoon' for 13-17, 'Evening' for 18-23.
        None for any hour outside those buckets (e.g. 0-6); previously such
        input raised UnboundLocalError because no branch assigned a label.

    Raises
    ------
    ValueError
        If the hour part cannot be parsed as an integer.
    """
    hour=int(s.split(':')[0])
    if 6<hour<=9:
        return 'Early_Morning'
    if 9<hour<=12:
        return 'Morning'
    if 12<hour<=17:
        return 'Afternoon'
    if 17<hour<=23:
        return 'Evening'
    # No bucket covers this hour: report "no label" instead of crashing, so a
    # comparison against any concrete label is simply False.
    return None


# Resource locator function
def get_chunk_of_data_on_resource_locator(file1):
    """Load the rows selected by a (date, time_zone, zone) locator.

    Parameters
    ----------
    file1 : sequence
        file1[0] is the date (or '*') used as the CSV file-name pattern,
        file1[1] is the part-of-day label to keep (or '*' for no time filter),
        file1[2] is the zone sub-directory under ``parent_dir``.

    Returns
    -------
    pandas.DataFrame or str
        A copy of the matching rows, or the string 'file_not_available' when
        no CSV matches the glob pattern (e.g. a missing date).
    """
    files=glob.glob(parent_dir+f"{file1[2]}/DateWise/{file1[0]}.csv")
    if len(files)==0:
        return 'file_not_available' # for those where a date is missing
    # Read every file matched by the date pattern (one file, or all for '*').
    df=pd.concat([pd.read_csv(f) for f in files],axis=0)
    # The wildcard is the same for every row, so decide once instead of
    # re-testing it per row, and skip labelling the rows entirely.
    if file1[1]=='*':
        return df.copy()
    # Vectorised comparison always yields a boolean mask, even for an empty frame.
    keep=df.start_time.map(time_zone_cal)==file1[1]
    return df[keep].copy()

# Cross Product

In [4]:
# Every (date, time_zone, zone) locator triple, in the order itertools.product
# would yield them: the rightmost list (zone) varies fastest.
cross_prod=[(d,tz,z) for d in date for tz in timezone for z in zone]

# Combinations

In [5]:
def welch_t_test(array1, array2):
    """Run a two-sample t-test without assuming equal variances (Welch's test).

    Returns a plain 2-tuple ``(t_statistic, p_value)`` taken from
    ``scipy.stats.ttest_ind(..., equal_var=False)``.
    """
    t_stat,p_val=stats.ttest_ind(array1, array2, equal_var=False)
    return (t_stat,p_val)

In [6]:
def welch_t_prep(com, min_samples=6):
    """Build one result row comparing the two locators in ``com``.

    Parameters
    ----------
    com : sequence
        Pair of (date, time_zone, zone) locators; com[0] is the left side and
        com[1] the right side.
    min_samples : int, optional
        Minimum number of rows each side needs for the t-test to be run.
        Defaults to 6, the threshold that was previously hard-coded.

    Returns
    -------
    list
        [*com[0], left_count, *com[1], right_count, t_stat, p_value].
        A count is the string 'file_not_available' when that side's data
        could not be located. t_stat and p_value are NaN when either side is
        missing or has fewer than ``min_samples`` rows.
    """
    fnA='file_not_available'
    left=get_chunk_of_data_on_resource_locator(com[0])
    right=get_chunk_of_data_on_resource_locator(com[1])

    # The locator returns a sentinel string instead of a DataFrame when no file
    # matches. Check type and value: `is` on a string literal depends on
    # interning, and `==` on a DataFrame would compare element-wise.
    left_missing=isinstance(left,str) and left==fnA
    right_missing=isinstance(right,str) and right==fnA

    left_count=fnA if left_missing else left.shape[0]
    right_count=fnA if right_missing else right.shape[0]

    # Short-circuiting keeps the numeric comparisons away from the sentinel string.
    if left_missing or right_missing or left_count<min_samples or right_count<min_samples:
        return [*com[0],left_count,*com[1],right_count,np.nan,np.nan]

    t_test_res=welch_t_test(left.scores,right.scores) #t_test
    return [*com[0],left_count,*com[1],right_count,*t_test_res]

In [7]:
# All unordered pairs of locator triples; each pair yields one result row.
all_combinations=list(combinations(cross_prod,2))

# Rows are produced in the same order as all_combinations; tqdm only adds the progress bar.
comb_res=[welch_t_prep(pair) for pair in tqdm.tqdm(all_combinations)]

100%|██████████| 28680/28680 [07:06<00:00, 67.28it/s]


# Converting to DataFrame

In [8]:
# Column labels mirror the row layout built by welch_t_prep:
# left locator + count, right locator + count, then the test outcome.
result_columns=[
    'left_date','left_time_zone','left_zone','left_count',
    'right_date','right_time_zone','right_zone','right_count',
    't_stat','p_value',
]
df=pd.DataFrame(comb_res,columns=result_columns)

In [9]:
# Three views of the results, written in this order:
#   whole       - every combination, including skipped ones (NaN statistics)
#   possible    - only rows without any missing value
#   significant - only rows whose p-value is below 0.05
outputs={
    "../reports/welch_t_test/whole_comb_t_test.csv": df,
    "../reports/welch_t_test/possible_comb_t_test.csv": df.dropna(),
    "../reports/welch_t_test/significant_comb_t_test.csv": df.loc[df['p_value'].lt(0.05)],
}
for out_path,frame in outputs.items():
    frame.to_csv(out_path,index=False)

In [11]:
#NICE