# Recurrency Metric for the tgbl-comment dataset

This notebook explores a recurrency metric for the tgbl-comment dataset based on how often interaction between pairs of source and destination occur. It splits the interactions in groups based on frequncy of recurrence - 1 day, 1 week, 1 month etc. 

### Dataset Preparation

In [7]:
from TGB.tgb.linkproppred.dataset import LinkPropPredDataset
import pandas as pd

In [8]:
name = "tgbl-comment"
dataset = LinkPropPredDataset(name=name, root="./Datasets", preprocess=True)
data = dataset.full_data

data_df = pd.DataFrame([data['sources'], data['destinations'], data['timestamps']]).T
data_df.columns = ['Source', 'Destination', 'Timestamp']

raw file found, skipping download
Dataset directory is  /opt/homebrew/lib/python3.11/site-packages/tgb/./Datasets/tgbl_comment
loading processed file


In [3]:
data_df

Unnamed: 0,Source,Destination,Timestamp
0,0.0,0.0,1.134365e+09
1,1.0,1.0,1.134366e+09
2,2.0,3.0,1.134368e+09
3,4.0,4.0,1.134371e+09
4,5.0,5.0,1.134372e+09
...,...,...,...
44314502,270407.0,970235.0,1.293840e+09
44314503,307508.0,178151.0,1.293840e+09
44314504,609052.0,199010.0,1.293840e+09
44314505,906272.0,1643.0,1.293840e+09


### Recurrency Metric based on Interaction Frequency

In [4]:
data_df['Timestamp'] = pd.to_datetime(data_df['Timestamp'], unit='s')

In [5]:
data_df['Year'] = data_df['Timestamp'].dt.year
most_common_year = data_df['Year'].value_counts().idxmax()

start_date = pd.Timestamp(year=most_common_year, month=1, day=1)
end_date = pd.Timestamp(year=most_common_year, month=6, day=30)
df_filtered = data_df[(data_df['Timestamp'] >= start_date) & (data_df['Timestamp'] <= end_date)]

df_filtered['YearWeek'] = df_filtered['Timestamp'].dt.to_period('W')

def check_weekly_occurrences(group):
    expected_weeks = pd.date_range(start=start_date, end=end_date, freq='W-MON').nunique()
    unique_weeks = group['YearWeek'].nunique()
    return unique_weeks == expected_weeks / 2

weekly_occurrences = df_filtered.groupby(['Source', 'Destination']).filter(check_weekly_occurrences)

summary = weekly_occurrences.drop_duplicates(subset=['Source', 'Destination'])

print(f"Year with the most records: {most_common_year}")
print(summary)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['YearWeek'] = df_filtered['Timestamp'].dt.to_period('W')


Year with the most records: 2010
            Source  Destination           Timestamp  Year  \
15466202  105954.0     105954.0 2010-01-01 00:04:24  2010   
15469791  129858.0     129858.0 2010-01-01 02:03:07  2010   
15470966  125388.0     320215.0 2010-01-01 02:47:35  2010   
15471914   52237.0      52237.0 2010-01-01 03:28:35  2010   
15472685   63541.0      63541.0 2010-01-01 04:02:43  2010   
...            ...          ...                 ...   ...   
20241505  210090.0     455378.0 2010-03-26 01:07:27  2010   
20640650  247710.0     538817.0 2010-04-01 15:11:11  2010   
20843279  538817.0     538817.0 2010-04-05 15:52:17  2010   
20864278  232399.0     538817.0 2010-04-05 21:19:17  2010   
20971744  545722.0     545722.0 2010-04-07 14:27:14  2010   

                       YearWeek  
15466202  2009-12-28/2010-01-03  
15469791  2009-12-28/2010-01-03  
15470966  2009-12-28/2010-01-03  
15471914  2009-12-28/2010-01-03  
15472685  2009-12-28/2010-01-03  
...                         ..

In [21]:
frequency = data_df.groupby(['Source', 'Destination'])['Timestamp'].transform(classify_frequency)
data_df['Frequency'] = frequency


In [22]:
data_df

Unnamed: 0,Source,Destination,Timestamp,Frequency
0,0.0,1.0,1999-06-13,Single occurrence
1,2.0,3.0,1999-07-01,Single occurrence
2,4.0,1.0,1999-07-13,Single occurrence
3,5.0,6.0,1999-07-13,Single occurrence
4,7.0,8.0,1999-07-13,Single occurrence
...,...,...,...,...
4873535,257355.0,234180.0,2018-10-02,Single occurrence
4873536,195919.0,301073.0,2018-10-03,Single occurrence
4873537,143936.0,137681.0,2018-10-03,Single occurrence
4873538,70003.0,249788.0,2018-10-04,Single occurrence


Now obtain only pairs that have daily interactions

In [23]:
daily_occurrences = data_df[data_df['Frequency'] == 'Daily']

In [24]:
daily_occurrences

Unnamed: 0,Source,Destination,Timestamp,Frequency
587,669.0,185.0,2000-08-01,Daily
588,669.0,100.0,2000-08-01,Daily
589,669.0,185.0,2000-08-01,Daily
590,669.0,100.0,2000-08-01,Daily
664,669.0,560.0,2000-08-17,Daily
...,...,...,...,...
4871920,154555.0,308688.0,2018-09-11,Daily
4871922,154555.0,308688.0,2018-09-11,Daily
4871923,154555.0,308688.0,2018-09-11,Daily
4871924,154555.0,308688.0,2018-09-11,Daily


In [25]:
daily_occurrences.to_csv('datasets/tgbl-comment_daily_occurrences.csv', index=False)


### Measure average period based on sequence length

In [None]:
def compute_timeframe(dataset, seq_length):
    dataset_sorted = dataset.sort_values('Timestamp', ascending=False)

    all_periods = []
    for _, row in dataset_sorted.iterrows():
        current_source = row['Source']
        current_destination = row['Destination']
        current_timestamp = row['Timestamp']

        neighbor_interactions = dataset_sorted[((dataset_sorted['Source'] == current_source) |
                                                (dataset_sorted['Destination'] == current_source) |
                                                (dataset_sorted['Source'] == current_destination) |
                                                (dataset_sorted['Destination'] == current_destination)) &
                                               (dataset_sorted['Timestamp'] < current_timestamp)]

        num_neighbor_interactions = neighbor_interactions.shape[0]

        if num_neighbor_interactions > seq_length:
            oldest_interaction_timestamp = neighbor_interactions['Timestamp'].min()

            cur_period = current_timestamp - oldest_interaction_timestamp
            all_periods.append(cur_period)

    return sum(all_periods) / len(all_periods) if all_periods else 0


In [None]:
seq_lengths = [8, 16, 32, 128, 512, 1024, 4096]

In [None]:
for seq_length in seq_lengths:
	average_timeframe = compute_timeframe(data_df, seq_length)
	print(f"Average Timeframe for sequence length of {seq_length}: {average_timeframe}")