In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import codecs
import pickle
import os

In [5]:
# Temporal aggregation interval: width of one time slice, expressed in the
# units of the dataset's time column `t` (load_temp_data divides the time
# offset of each contact by this value to obtain its slice index).
# NOTE(review): presumably seconds for the SocioPatterns files -- confirm.
AGGR_TIME = 10*60 #only for sociopatterns
# AGGR_TIME = 1 #only for OpenABM

# Temporal coupling weight(s). Not read by any code visible in this part of
# the notebook; other candidate values are kept in the trailing comment.
temp_edge_weight_list = [1]#,2,4,8,16]

# Temporal coupling representation(s). Not read by any code visible in this
# part of the notebook.
rep_time_list = ['Non']
#rep_time_list = ['Inverse']

# Datasets to process. Keep exactly one assignment active; the alternatives
# below are the other dataset names this notebook has been configured for.
# dataset_list = ['LyonSchool', 'InVS15', 'SFHH', 'LH10', 'Thiers13']
dataset_list = ['LyonSchool']
# dataset_list = ['InVS15']
# dataset_list = ['SFHH']
# dataset_list = ['LH10']
# dataset_list = ['Thiers13']

# Names starting with 'OpenABM-' make load_temp_data read from
# ../data/Data_OpenABM instead of the SocioPatterns directory.
# dataset_list = ['OpenABM-Covid19-Interactions-5k-20']
# dataset_list = ['OpenABM-Covid19-Interactions-2k-100']

In [6]:
def load_temp_data(dataset):
    """Load a temporal contact list and aggregate it into time slices.

    The contact events of ``dataset`` are read from disk, assigned to slices
    of width ``AGGR_TIME`` (module-level constant), cleaned (self-contacts
    removed, every pair stored with the smaller id in ``i``, duplicate
    ``(t, i, j)`` events dropped) and counted per ``(tslice, i, j)``.

    As a side effect, the list of ``"<node>-<tslice>"`` labels of every node
    that has a contact in a slice is pickled to
    ``../preprocessed/SupraAdjacencyMatrix/<dataset>/PatActiveTimeAggtime<AGGR_TIME>.pkl``.
    The file is written only when it does not exist yet, so an existing
    pickle is never refreshed by this function.

    Parameters
    ----------
    dataset : str
        Dataset name. Names whose first '-'-separated token is ``'OpenABM'``
        are read from ``../data/Data_OpenABM/<dataset>.tar.gz`` (columns
        ``ID_1``, ``ID_2``, ``time`` renamed to ``i``, ``j``, ``t``); any
        other name is read from the tab-separated, header-less SocioPatterns
        file ``../data/Data_SocioPatterns_20s_nonights/tij_<dataset>.dat_nonights.dat``.

    Returns
    -------
    partial_times : list
        Sorted slice indices that contain at least one contact.
    df_temp_net : pandas.DataFrame
        Cleaned event-level contacts, including the columns ``t``, ``i``,
        ``j`` and ``tslice``.
    df_tnet : pandas.DataFrame
        One row per ``(tslice, i, j)`` with the number of contact events of
        that pair in that slice in column ``weight``.
    """
    if dataset.split('-')[0] == 'OpenABM':
        df_temp_net = pd.read_csv('../data/Data_OpenABM/%s.tar.gz' % dataset)\
                        .rename(columns={'ID_1': 'i', 'ID_2': 'j', 'time': 't'})
    else:
        df_temp_net = pd.read_csv('../data/Data_SocioPatterns_20s_nonights/tij_%s.dat_nonights.dat' % dataset,
                                  sep='\t', header=None,
                                  names=['t', 'i', 'j'])

    # Compute the slice each contact event belongs to, counted from the
    # earliest event (first row after sorting by time).
    df_temp_net = df_temp_net.sort_values('t')
    df_temp_net['tslice'] = np.floor((df_temp_net.t - df_temp_net.t.iloc[0]) / AGGR_TIME)

    # Drop self-contacts. The explicit copy makes the frame independent of the
    # unfiltered one, so the in-place swap below is well defined and does not
    # raise SettingWithCopyWarning.
    df_temp_net = df_temp_net[df_temp_net.i != df_temp_net.j].copy()

    # Store every undirected pair with the smaller id in `i`.
    s = df_temp_net['i'] > df_temp_net['j']
    df_temp_net.loc[s, ['i', 'j']] = df_temp_net.loc[s, ['j', 'i']].values
    df_temp_net = df_temp_net.drop_duplicates(['t', 'i', 'j'])

    # Group over (slice, i, j) and count the contacts within each time slice;
    # the count is regarded as the "weight" of the pair in that slice.
    s_temp_net = df_temp_net.groupby(['tslice', 'i', 'j']).size().rename('weight')

    # Times for all temporal slices; note that it may have a big gap
    # (return to home).
    partial_times = sorted(s_temp_net.index.levels[0])

    # Convenience: dataframe version of the series above.
    df_tnet = s_temp_net.reset_index()

    # Collect the unique "<node>-<tslice>" labels of all active nodes; this
    # list is what the embedding results are sorted by. The index is iterated
    # directly because Series.iteritems() no longer exists in pandas >= 2.0.
    active_labels = set()
    for tslice, i, j in s_temp_net.index:
        active_labels.add('%d-%d' % (i, tslice))
        active_labels.add('%d-%d' % (j, tslice))
    pat_active_time = list(active_labels)

    target_dir = '../preprocessed/SupraAdjacencyMatrix/%s' % dataset
    os.makedirs(target_dir, exist_ok=True)

    pat_active_time_file_name = '%s/PatActiveTimeAggtime%d.pkl' % (target_dir, AGGR_TIME)
    if not os.path.isfile(pat_active_time_file_name):
        # Context manager guarantees the file is flushed and closed.
        with open(pat_active_time_file_name, 'wb') as pkl_file:
            pickle.dump(pat_active_time, pkl_file)

    return partial_times, df_temp_net, df_tnet

In [7]:
# For every configured dataset, build a reduced temporal network that keeps
# 70% of the aggregated (tslice, i, j) edges: all spanning-tree edges of each
# slice plus a random sample of the remaining edges. The reduced network is
# written back at event level (t, i, j).
for dataset in dataset_list:
    # df_unfold_net: cleaned event-level contacts; df_tnet: one row per
    # (tslice, i, j) with its contact count.
    _, df_unfold_net, df_tnet = load_temp_data(dataset)
    
    # Per slice: build an undirected graph from the (i, j) pairs and collect
    # the edges returned by nx.minimum_spanning_edges as an array (one row per
    # edge). The result has one row per slice with that array in column 'edge'.
    df_tree = df_tnet.groupby('tslice')\
                .apply(lambda x: nx.Graph(list(x[['i', 'j']].values)))\
                .map(lambda g: np.array(list(nx.minimum_spanning_edges(g, data=False))))\
                .to_frame('edge').reset_index()#.explode('edge')
    
    # Stack all tree edges (slice order) and sort each pair so the smaller id
    # comes first, matching the i < j convention applied in load_temp_data.
    edges = np.sort(np.concatenate([e for e in df_tree.edge]), axis=1)
    # One row per tree edge; row order matches `edges`, so the positional
    # assignments below line up.
    df_tree = df_tree.explode('edge')
    df_tree['i'] = edges[:,0]
    df_tree['j'] = edges[:,1]
    del df_tree['edge']
    
    # Outer merge with indicator: rows flagged 'right_only' are aggregated
    # edges that are not part of any slice's tree. 'index' holds their row
    # label in df_tnet (added by reset_index()).
    df_merge = df_tree.merge(df_tnet.reset_index(), on=['tslice', 'i', 'j'], how='outer', indicator=True)
    idx_samp = df_merge[df_merge._merge=='right_only']['index'].values
    
    start_size = df_tnet.shape[0]      # all aggregated edges
    tree_size = df_tree.shape[0]       # edges that are always kept
    final_size = int(0.7*start_size)   # target: keep 70% of the edges
    samp_size = final_size - tree_size # non-tree edges still to be drawn
    # The tree edges alone must not already reach the 70% target.
    assert(samp_size>0)
    print(start_size, tree_size, samp_size)
    
    # One iteration per random split; itr_split doubles as the random seed
    # and as the suffix of the output file name.
    for itr_split in range(1):
        
        # Draw the additional non-tree edges without replacement. idx_samp is
        # used positionally; df_tnet comes from reset_index() in
        # load_temp_data, so its labels equal its positions.
        df_samp = df_tnet.iloc[idx_samp].sample(n=samp_size, replace=False, random_state=itr_split)
        # Kept edge set = tree edges + sampled edges.
        df_samp = pd.concat((df_tree, df_samp)).reset_index(drop=True)
        
        # Back to event level: keep every contact event whose (i, j, tslice)
        # is in the kept edge set.
        df_save = df_unfold_net.merge(df_samp, on=['i', 'j', 'tslice']).loc[:,['t','i','j']]
        target_dir = '../preprocessed/RemovedLinksTempNet/%s' % dataset
        os.makedirs(target_dir, exist_ok=True)
        # NOTE(review): '7030' in the file name presumably denotes the 70/30
        # kept/removed split -- keep in sync with the 0.7 factor above.
        df_save.to_csv(target_dir + '/tij_%s_7030_%d.csv.gz' % (dataset, itr_split), header=False, index=False)

44820 15873 15500
