## This notebook computes the chi-squared test p-value for each diag_med feature in the AFIB_100 dataset

### Imports

In [1]:
import pandas as pd
import pickle
from pandarallel import pandarallel
from scipy import stats
import time

In [2]:
## Load the AFIB_100 dataset from CSV
df = pd.read_csv("afib100.csv")

In [3]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['person_id', 'label', 'diag_med'], dtype='object')

In [None]:
# Create a dictionary of the diag_med id counts, where the key is the id
# (kept as the string token found in the 'diag_med' column) and the value is
# the number of rows (patients) whose 'diag_med' list contains that id.
diag_med_key_count = {}
# Iterate over the one column that is needed instead of df.iterrows():
# iterrows() materialises a Series for every row, which is needlessly slow
# on a large frame, and only 'diag_med' is read here.
for diag_med_value in df['diag_med']:
    # De-duplicate within a patient so an id is counted at most once per row.
    for diag_med_id in set(diag_med_value.split(',')):
        diag_med_key_count[diag_med_id] = diag_med_key_count.get(diag_med_id, 0) + 1

In [None]:
# Collect every distinct diag_med id that occurs anywhere in the dataset.
diag_med_key_set = set()
# Read the 'diag_med' column directly; df.iterrows() builds a Series per row
# and is needlessly slow when a single column is all that is used.
for diag_med_value in df['diag_med']:
    # set.update() already ignores duplicates, so the split list can be fed in as-is.
    diag_med_key_set.update(diag_med_value.split(','))

In [None]:
# Number of distinct diag_med ids collected above.
len(diag_med_key_set)

In [4]:
# Turn the {id: count} dictionary into a two-column frame ('id', 'count').
# The following cells filter on diag_med_count_df['count'] and iterate over
# diag_med_count_df['id'], which a plain dict does not support, so the frame
# has to be built explicitly for the notebook to run from a fresh kernel.
# Rows are ordered from the most to the least frequent id.
diag_med_count_df = (
    pd.DataFrame(list(diag_med_key_count.items()), columns=['id', 'count'])
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)

In [5]:
# Display the id/count table.
diag_med_count_df

Unnamed: 0,id,count
0,0,1039138
1,35605482,365583
2,40220357,360139
3,19070869,316448
4,320128,316394
...,...,...
27551,704988,1
27552,74808,1
27553,19123015,1
27554,19112868,1


In [6]:
## Keep only the diag_med ids whose count is greater than 1000
diag_med_count_df_1000 = diag_med_count_df.loc[diag_med_count_df['count'].gt(1000)]

In [7]:
# (rows, columns) of the subset with count > 1000.
diag_med_count_df_1000.shape

(4700, 2)

In [8]:
## Keep only the diag_med ids whose count is greater than 30000
diag_med_count_df_30000 = diag_med_count_df.loc[diag_med_count_df['count'].gt(30000)]

In [9]:
# (rows, columns) of the subset with count > 30000.
diag_med_count_df_30000.shape

(338, 2)

### Use diag_med_count_df_1000 as input for the chi-squared test

In [10]:
# Initialise pandarallel so the per-row work can run in parallel;
# create_df_for_key below calls Series.parallel_apply.
pandarallel.initialize()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [14]:

# Keys of the per-id result dictionary filled in by find_chi_square_value:
# the chi-squared statistic and its p-value.
STAT_KEY = 'stat'
P_VALUE_KEY = 'p_value'
## Creates a dataframe for one diag_med id from diag_med_count_df_1000
def create_df_for_key(str_key, origin_df):
    """Build a per-row presence indicator frame for a single diag_med id.

    Uses ``Series.parallel_apply``, so ``pandarallel.initialize()`` must have
    been called before this function is used.

    Parameters
    ----------
    str_key
        The diag_med id to look for. It is used unchanged as the label of the
        indicator column in the result.
    origin_df : pandas.DataFrame
        Frame with 'person_id', 'label' and 'diag_med' columns, where
        'diag_med' holds a comma-separated string of ids.

    Returns
    -------
    pandas.DataFrame
        A new frame (``origin_df`` is not modified) with columns 'person_id',
        'label' and ``str_key``; the last one is 1 where the row's 'diag_med'
        list contains the id and 0 otherwise.
    """
    # The tokens produced by split(',') are strings. Compare against the string
    # form of the key so that an id that arrives as an int (for example after a
    # round trip through CSV) still matches instead of silently yielding an
    # all-zero column.
    key_token = str(str_key)

    def if_exists(list_diag_med):
        return 1 if key_token in list_diag_med.split(',') else 0

    # Copy only the columns that are returned; 'diag_med' is read straight
    # from origin_df, so there is nothing to drop afterwards.
    df_new = origin_df[['person_id', 'label']].copy()
    df_new[str_key] = origin_df['diag_med'].parallel_apply(if_exists)
    return df_new


## Finds the chi-squared statistic and p-value for each diag_med id in the given
## count frame and stores them in a dictionary that is later used for analysis
def find_chi_square_value(diag_med_count_df, origin_df=None,
                          checkpoint_path='chi_sqaure_val_dict_1000.pickle'):
    """Run a chi-squared test of independence between 'label' and each id.

    For every id in ``diag_med_count_df['id']`` a presence indicator is built
    with ``create_df_for_key`` and cross-tabulated against 'label'; the
    resulting contingency table is passed to ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    diag_med_count_df : pandas.DataFrame
        Frame with an 'id' column listing the diag_med ids to test.
    origin_df : pandas.DataFrame, optional
        Patient-level frame handed to ``create_df_for_key``. Defaults to the
        notebook-level ``df`` so existing one-argument calls keep working.
    checkpoint_path : str, optional
        File the partial result dictionary is pickled to after every id.
        Defaults to the file name the notebook has always used.

    Returns
    -------
    dict
        ``{id: {STAT_KEY: statistic, P_VALUE_KEY: p_value}}``.
    """
    source_df = df if origin_df is None else origin_df
    chi_sqaure_val_dict = {}
    start_time = time.time()
    for count, str_key in enumerate(diag_med_count_df['id'], start=1):
        df_fetch = create_df_for_key(str_key, source_df)
        # Only the observed label x presence counts go into the test. Using
        # margins=True would append the "All" totals row and column, and
        # chi2_contingency would treat them as extra categories (a 3x3 table
        # instead of 2x2), producing wrong statistics and p-values.
        cross_tab = pd.crosstab(df_fetch['label'], df_fetch[str_key])
        stat, p, _dof, _expected = stats.chi2_contingency(cross_tab)
        chi_sqaure_val_dict[str_key] = {STAT_KEY: stat, P_VALUE_KEY: p}
        # Checkpoint after every id so a long run can be inspected or
        # recovered if it is interrupted.
        with open(checkpoint_path, 'wb') as handle:
            pickle.dump(chi_sqaure_val_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        if count % 100 == 0:
            # Report the wall-clock time of the last 100 ids, then restart the timer.
            print("%d --- %s seconds ---" % (count, time.time() - start_time))
            start_time = time.time()
    return chi_sqaure_val_dict
        
        

    
    

In [15]:
# Long-running: the progress lines printed below report several hundred
# seconds for every 100 ids processed.
chi_sqaure_val_dict_1000 = find_chi_square_value(diag_med_count_df_1000)

100 --- 494.4723608493805 seconds ---
200 --- 472.0745258331299 seconds ---
300 --- 465.2748599052429 seconds ---
400 --- 486.6910355091095 seconds ---
500 --- 463.5092360973358 seconds ---
600 --- 471.762446641922 seconds ---
700 --- 469.4711182117462 seconds ---
800 --- 485.177939414978 seconds ---
900 --- 535.6110029220581 seconds ---
1000 --- 483.4647464752197 seconds ---
1100 --- 490.77819299697876 seconds ---
1200 --- 464.335654258728 seconds ---
1300 --- 498.00420475006104 seconds ---
1400 --- 471.73712372779846 seconds ---
1500 --- 468.38530588150024 seconds ---
1600 --- 464.7618944644928 seconds ---
1700 --- 505.4905686378479 seconds ---
1800 --- 500.81805086135864 seconds ---
1900 --- 477.7144603729248 seconds ---
2000 --- 498.2261426448822 seconds ---
2100 --- 463.35010528564453 seconds ---
2200 --- 495.7636320590973 seconds ---
2300 --- 468.30504155158997 seconds ---
2400 --- 475.6915853023529 seconds ---
2500 --- 469.99483346939087 seconds ---
2600 --- 529.5054357051849 se