In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import codecs
import pickle
import os

In [2]:
# --- Notebook configuration -------------------------------------------------

# Width of one temporal aggregation window, expressed in the unit of the `t`
# column of the input files (presumably seconds, i.e. 10 minutes -- confirm
# against the data source).
AGGR_TIME = 60 * 10

# Temporal coupling weights to process (other values previously tried: 2, 4, 8, 16).
temp_edge_weight_list = [1]

# Temporal coupling representation(s) to process (alternative: 'Inverse').
rep_time_list = ['Non']

# Datasets to process in this run.
# Known dataset names: 'LyonSchool', 'InVS15', 'SFHH', 'LH10', 'Thiers13'.
dataset_list = ['LyonSchool']

In [3]:
def load_temp_data(dataset):
    """Load one contact dataset and aggregate it into fixed-width time slices.

    Reads the tab-separated, header-less file
    ``../data/Data_SocioPatterns_20s_nonights/tij_<dataset>.dat_nonights.dat``
    (columns ``t``, ``i``, ``j``), assigns every contact event to a slice of
    width ``AGGR_TIME``, removes self-contacts, stores each pair with the
    smaller id in ``i``, and drops duplicated ``(t, i, j)`` events.

    Side effect: the list of active ``"<node>-<tslice>"`` identifiers is
    pickled to
    ``../preprocessed/SupraAdjacencyMatrix/<dataset>/PatActiveTimeAggtime<AGGR_TIME>.pkl``
    unless that file already exists (an existing file is never overwritten).

    Parameters
    ----------
    dataset : str
        Dataset name used to build the input and output paths.

    Returns
    -------
    partial_times : list
        Sorted slice indices in which at least one contact occurs. Gaps are
        possible (slices without any contact are absent).
    df_temp_net : pandas.DataFrame
        Cleaned event-level contacts with columns ``t``, ``i``, ``j``, ``tslice``.
    df_tnet : pandas.DataFrame
        One row per ``(tslice, i, j)`` with the number of contact events of
        that pair in that slice in column ``weight``.
    """
    df_temp_net = pd.read_csv(
        '../data/Data_SocioPatterns_20s_nonights/tij_%s.dat_nonights.dat' % dataset,
        sep='\t', header=None, names=['t', 'i', 'j'])

    # Slice index of each contact event, measured from the timestamp of the
    # first row. NOTE(review): this assumes the file is ordered by `t`.
    df_temp_net['tslice'] = np.floor((df_temp_net.t - df_temp_net.t.iloc[0]) / AGGR_TIME)

    # Drop self-contacts. The explicit copy makes the frame independent of the
    # one read from disk, so the in-place column swap below is a plain write
    # rather than a chained assignment on a slice.
    df_temp_net = df_temp_net[df_temp_net.i != df_temp_net.j].copy()

    # Canonical orientation: always store the smaller id in `i`.
    swap = df_temp_net['i'] > df_temp_net['j']
    df_temp_net.loc[swap, ['i', 'j']] = df_temp_net.loc[swap, ['j', 'i']].values
    df_temp_net = df_temp_net.drop_duplicates(['t', 'i', 'j'])

    # Group over (slice, i, j) and count the contact events of each pair within
    # each slice; the count is regarded as the "weight" of that contact.
    s_temp_net = df_temp_net.groupby(['tslice', 'i', 'j']).size().rename('weight')

    # Slices that contain contacts; there may be big gaps (e.g. nights).
    partial_times = sorted(s_temp_net.index.levels[0])

    # Convenience: dataframe version of the series above.
    df_tnet = s_temp_net.reset_index()

    # Identifiers "<node>-<tslice>" of every node in every slice where it is
    # active; this list is used to sort the embedding result. `tslice` is a
    # float (from np.floor) and is rendered as an integer by `%d`.
    # Iterating the index avoids Series.iteritems(), which no longer exists in
    # pandas >= 2.0. The identifiers are sorted (as strings) purely so that the
    # persisted order is reproducible across runs.
    pat_active_time = sorted({
        '%d-%d' % (node, tslice)
        for tslice, i, j in s_temp_net.index
        for node in (i, j)
    })

    target_dir = '../preprocessed/SupraAdjacencyMatrix/%s' % dataset
    os.makedirs(target_dir, exist_ok=True)

    pat_active_time_file_name = os.path.join(
        target_dir, 'PatActiveTimeAggtime%d.pkl' % AGGR_TIME)
    if not os.path.isfile(pat_active_time_file_name):
        # Context manager guarantees the handle is flushed and closed.
        with open(pat_active_time_file_name, 'wb') as fh:
            pickle.dump(pat_active_time, fh)

    return partial_times, df_temp_net, df_tnet

In [4]:
def samp_func(df, random_state):
    """Try to remove one randomly chosen link from `df` without losing a node.

    `df` needs columns ``i`` and ``j`` (the two endpoints of each link) and a
    boolean column ``r`` flagging the links that are still removal candidates.

    One candidate is drawn with `random_state`. If dropping it leaves the set
    of nodes appearing in ``i``/``j`` unchanged in size, a new frame without
    that row is returned. Otherwise the row is kept, its ``r`` flag is set to
    False on `df` itself (in place), and `df` is returned. When there is no
    candidate at all, `df` is returned untouched.
    """
    def count_nodes(frame):
        # Number of distinct node ids appearing as either endpoint.
        return len(set(frame.i.values).union(frame.j.values))

    nodes_before = count_nodes(df)
    candidates = df.loc[df.r].index

    # Nothing left that may be removed.
    if not len(candidates):
        return df

    picked = random_state.choice(candidates, 1)[0]
    reduced = df.drop(index=picked)

    if count_nodes(reduced) != nodes_before:
        # Removing this link would isolate a node: keep it and never retry it.
        df.at[picked, 'r'] = False
        return df

    return reduced

In [13]:
# Target fraction of aggregated (tslice, i, j) links to keep. The removal loop
# stops once the number of remaining links is at or below this fraction of the
# initial count. Keep in sync with the "7030" tag in the output file name.
KEEP_FRACTION = 0.7

for dataset in dataset_list:
    _, df_unfold_net, df_tnet = load_temp_data(dataset)

    for itr_split in range(1):
        # The split index doubles as the RNG seed, so each split is reproducible.
        rs = np.random.RandomState(itr_split)
        df_samp = df_tnet.copy()
        df_samp['r'] = True  # every link starts out as a removal candidate
        start_size = df_samp.shape[0]

        # Each pass tries to remove one link per time slice (see samp_func).
        # Stop when the target size is reached or when a full pass removed
        # nothing. Because a pass can remove several links at once, the final
        # size may end up below the target. The emptiness guard avoids calling
        # pd.concat on an empty list of groups.
        while not df_samp.empty:
            prev_size = df_samp.shape[0]
            # Explicit iteration instead of groupby(...).apply(...): apply's
            # handling of the grouping column is deprecated in recent pandas,
            # and samp_func's output must keep `tslice` for the next pass and
            # for the merge below. Groups are visited in sorted key order, as
            # apply does, so the sequence of random draws is unchanged.
            df_samp = pd.concat(
                [samp_func(group.copy(), rs) for _, group in df_samp.groupby('tslice')],
                ignore_index=True,
            )
            if (df_samp.shape[0] <= KEEP_FRACTION * start_size) or (df_samp.shape[0] == prev_size):
                break
        del df_samp['r']

        # Keep only the raw contact events whose (i, j, tslice) link survived.
        df_save = df_unfold_net.merge(df_samp, on=['i', 'j', 'tslice']).loc[:, ['t', 'i', 'j']]
        target_dir = '../preprocessed/RemovedLinksTempNet/%s' % dataset
        os.makedirs(target_dir, exist_ok=True)
        df_save.to_csv(
            os.path.join(target_dir, 'tij_%s_7030_%d.csv.gz' % (dataset, itr_split)),
            header=False, index=False)