In [1]:
import pandas as pd
import pickle

In [3]:
def get_max_spike_page(page, central_df, prev_month_df, next_month_df):
    pages_to_check = set([*central_df[(central_df['prev']==page) & (central_df['clicks']>=100)]['curr'].values,
                        *central_df[(central_df['curr']==page) & (central_df['clicks']>=100)]['prev'].values,
                        *prev_month_df[(prev_month_df['prev']==page) & (prev_month_df['clicks']>=100)]['curr'].values,
                        *prev_month_df[(prev_month_df['curr']==page) & (prev_month_df['clicks']>=100)]['prev'].values,
                        *next_month_df[(next_month_df['prev']==page) & (next_month_df['clicks']>=100)]['curr'].values,
                        *next_month_df[(next_month_df['curr']==page) & (next_month_df['clicks']>=100)]['prev'].values,
                        page
                        ])
    inflow_prev = prev_month_df[prev_month_df['type']!='other'].groupby('curr')[['clicks']].agg(sum)
    inflow_central = central_df[central_df['type']!='other'].groupby('curr')[['clicks']].agg(sum)
    inflow_next = next_month_df[next_month_df['type']!='other'].groupby('curr')[['clicks']].agg(sum)

    inflow_prev = inflow_prev[inflow_prev.index.isin(pages_to_check)]
    inflow_central = inflow_central[inflow_central.index.isin(pages_to_check)]
    inflow_next = inflow_next[inflow_next.index.isin(pages_to_check)]

    merged = inflow_central.join(inflow_prev, lsuffix='_central', rsuffix='_prev')
    merged = merged.join(inflow_next, rsuffix='_next')
    merged['clicks_next'] = merged['clicks']
    merged=merged.drop('clicks',axis=1)
    merged['spike_ratio'] = (2*merged['clicks_central'])/(merged['clicks_prev']+merged['clicks_next'])
    next_step_pages = merged[merged['spike_ratio']>1.5].sort_values('clicks_central', ascending=False)
    return next_step_pages.head(10)


In [4]:
# first pass
years = [2019, 2020, 2021, 2022, 2023]

page = 'World_Trade_Center_(1973–2001)'

for year in years:
    aug = pd.read_csv(f'./spikes/{year}/august{year}.csv')
    sept = pd.read_csv(f'./spikes/{year}/september{year}.csv')
    octo = pd.read_csv(f'./spikes/{year}/october{year}.csv')
    display(year, get_max_spike_page(page=page, central_df=sept, prev_month_df=aug, next_month_df=octo))

2019

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
September_11_attacks,2799511,409780,3241630,1.533386
Casualties_of_the_September_11_attacks,772326,64409,841290,1.705481
Timeline_for_the_day_of_the_September_11_attacks,599876,21663,630083,1.840828
United_Airlines_Flight_93,584705,68876,658933,1.606754
Rick_Rescorla,388323,73912,404504,1.62337
American_Airlines_Flight_11,287207,46131,333964,1.511238
Collapse_of_the_World_Trade_Center,278131,29122,313135,1.625276
The_Falling_Man,269971,25890,297978,1.667167
United_Airlines_Flight_175,240384,33629,276438,1.550529
Rescue_and_recovery_effort_after_the_September_11_attacks_on_the_World_Trade_Center,124006,11069,138121,1.66239


2020

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


2021

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
September_11th_Victim_Compensation_Fund,226676,54235,245291,1.513565


2022

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Timeline_for_the_day_of_the_September_11_attacks,142027,21044,157169,1.593902
September_11,61029,10576,68413,1.545253
Mychal_Judge,32859,5114,37227,1.552113
Tribute_in_Light,18814,3214,21274,1.536589


2023

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
September_11_attacks,1950667,470818,492785,4.048694
Israel_Adesanya,590606,194202,181586,3.143294
World_Trade_Center_(1973–2001),532825,163411,186482,3.045645
United_Airlines_Flight_93,530003,121397,101727,4.750748
Osama_bin_Laden,463701,233042,352634,1.583473
Casualties_of_the_September_11_attacks,388127,67011,83747,5.149007
American_Airlines_Flight_11,333990,85485,82518,3.976
Timeline_for_the_day_of_the_September_11_attacks,298598,27067,22817,11.971694
Collapse_of_the_World_Trade_Center,275545,52161,56160,5.087564
One_World_Trade_Center,267221,114110,123860,2.245838


In [6]:
# second pass
years = [2019, 2021, 2022, 2023]

pages = ['September_11_attacks','September_11th_Victim_Compensation_Fund','Timeline_for_the_day_of_the_September_11_attacks','September_11_attacks']

for idx, year in enumerate(years):
    aug = pd.read_csv(f'./spikes/{year}/august{year}.csv')
    sept = pd.read_csv(f'./spikes/{year}/september{year}.csv')
    octo = pd.read_csv(f'./spikes/{year}/october{year}.csv')
    display(year, get_max_spike_page(page=pages[idx], central_df=sept, prev_month_df=aug, next_month_df=octo))

2019

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
September_11_attacks,2799511,409780.0,3241630,1.533386
Casualties_of_the_September_11_attacks,772326,64409.0,841290,1.705481
Timeline_for_the_day_of_the_September_11_attacks,599876,21663.0,630083,1.840828
United_Airlines_Flight_93,584705,68876.0,658933,1.606754
Rick_Rescorla,388323,73912.0,404504,1.62337
Hijackers_in_the_September_11_attacks,296783,39478.0,343573,1.549574
American_Airlines_Flight_11,287207,46131.0,333964,1.511238
Collapse_of_the_World_Trade_Center,278131,29122.0,313135,1.625276
The_Falling_Man,269971,25890.0,297978,1.667167
United_Airlines_Flight_175,240384,33629.0,276438,1.550529


2021

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Worth_(film),411252,54596,463510,1.587521
Kenneth_Feinberg,399966,36561,442841,1.668604
September_11th_Victim_Compensation_Fund,226676,54235,245291,1.513565


2022

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Timeline_for_the_day_of_the_September_11_attacks,142027,21044,157169,1.593902
Mychal_Judge,32859,5114,37227,1.552113
Orio_Palmer,25774,4415,29565,1.51701


2023

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
September_11_attacks,1950667,470818.0,492785,4.048694
Dianne_Feinstein,1175875,259820.0,515358,3.033819
Aaron_Rodgers,931974,417795.0,262737,2.738957
Mark_Wahlberg,584859,341908.0,300941,1.819584
World_Trade_Center_(1973–2001),532825,163411.0,186482,3.045645
United_Airlines_Flight_93,530003,121397.0,101727,4.750748
Osama_bin_Laden,463701,233042.0,352634,1.583473
Casualties_of_the_September_11_attacks,388127,67011.0,83747,5.149007
Ryder_Cup,384310,74501.0,250393,2.365756
American_Airlines_Flight_11,333990,85485.0,82518,3.976


In [7]:
# second pass
years = [2021]

pages = ['Worth_(film)']

for idx, year in enumerate(years):
    aug = pd.read_csv(f'./spikes/{year}/august{year}.csv')
    sept = pd.read_csv(f'./spikes/{year}/september{year}.csv')
    octo = pd.read_csv(f'./spikes/{year}/october{year}.csv')
    display(year, get_max_spike_page(page=pages[idx], central_df=sept, prev_month_df=aug, next_month_df=octo))

2021

Unnamed: 0_level_0,clicks_central,clicks_prev,clicks_next,spike_ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Worth_(film),411252,54596,463510,1.587521
Kenneth_Feinberg,399966,36561,442841,1.668604
September_11th_Victim_Compensation_Fund,226676,54235,245291,1.513565


In [8]:
page = 'September_11_attacks'

years = [2019, 2020, 2021, 2022, 2023]

for year in years:
    page_store = set()
    aug = pd.read_csv(f'./spikes/{year}/august{year}.csv')
    sept = pd.read_csv(f'./spikes/{year}/september{year}.csv')
    octo = pd.read_csv(f'./spikes/{year}/october{year}.csv')
    def get_spiked_pages(page, central_df, prev_month_df, next_month_df):
        pages_to_check = set([*central_df[(central_df['prev']==page) & (central_df['clicks']>=100)]['curr'].values,
                            #*central_df[(central_df['curr']==page) & (central_df['clicks']>=100)]['prev'].values,
                            *prev_month_df[(prev_month_df['prev']==page) & (prev_month_df['clicks']>=100)]['curr'].values,
                            #*prev_month_df[(prev_month_df['curr']==page) & (prev_month_df['clicks']>=100)]['prev'].values,
                            *next_month_df[(next_month_df['prev']==page) & (next_month_df['clicks']>=100)]['curr'].values,
                            #*next_month_df[(next_month_df['curr']==page) & (next_month_df['clicks']>=100)]['prev'].values,
                            #page
                            ])
        inflow_prev = prev_month_df[prev_month_df['type']=='link'].groupby('curr')[['clicks']].agg(sum)
        inflow_central = central_df[central_df['type']=='link'].groupby('curr')[['clicks']].agg(sum)
        inflow_next = next_month_df[next_month_df['type']=='link'].groupby('curr')[['clicks']].agg(sum)

        inflow_prev = inflow_prev[inflow_prev.index.isin(pages_to_check)]
        inflow_central = inflow_central[inflow_central.index.isin(pages_to_check)]
        inflow_next = inflow_next[inflow_next.index.isin(pages_to_check)]

        merged = inflow_central.join(inflow_prev, lsuffix='_central', rsuffix='_prev')
        merged = merged.join(inflow_next, rsuffix='_next')
        merged['clicks_next'] = merged['clicks']
        merged=merged.drop('clicks',axis=1)
        merged['spike_ratio'] = (2*merged['clicks_central'])/(merged['clicks_prev']+merged['clicks_next'])
        next_step_pages = merged[merged['spike_ratio']>1.5].sort_values('clicks_central', ascending=False)
        return next_step_pages.index.values

    queue = [page]
    visited_pages = set()
    page_store.add(page)

    level_size = 1
    next_level_size = 0

    spike_level_nums = []

    while len(queue)!=0:
        node = queue.pop(0)
        level_size-=1
        if node not in visited_pages:
            new_pages = get_spiked_pages(page, central_df=sept, prev_month_df=aug, next_month_df=octo)
            next_level_size += len(new_pages)
            queue.extend(new_pages)
            visited_pages.add(node)
            page_store.update(new_pages)
        if level_size==0:
            spike_level_nums.append(next_level_size)
            display(year, spike_level_nums)
            level_size = next_level_size
            next_level_size = 0
    with open(f'./spikes/{year}/spiked_nodes.pkl', 'wb') as f:
        pickle.dump(page_store, f, protocol=pickle.HIGHEST_PROTOCOL)

2019

[31]

2019

[31, 961]

2019

[31, 961, 0]

2020

[0]

2021

[3]

2021

[3, 9]

2021

[3, 9, 0]

2022

[1]

2022

[1, 1]

2022

[1, 1, 0]

2023

[194]

2023

[194, 37636]

2023

[194, 37636, 0]

In [10]:
years = [2022, 2021]
pages = ['Elizabeth_II', 'Worth_(film)']

for idx, year in enumerate(years):
    page_store = set()
    page = pages[idx]
    aug = pd.read_csv(f'./spikes/{year}/august{year}.csv')
    sept = pd.read_csv(f'./spikes/{year}/september{year}.csv')
    octo = pd.read_csv(f'./spikes/{year}/october{year}.csv')
    def get_spiked_pages(page, central_df, prev_month_df, next_month_df):
        pages_to_check = set([*central_df[(central_df['prev']==page) & (central_df['clicks']>=100)]['curr'].values,
                            #*central_df[(central_df['curr']==page) & (central_df['clicks']>=100)]['prev'].values,
                            *prev_month_df[(prev_month_df['prev']==page) & (prev_month_df['clicks']>=100)]['curr'].values,
                            #*prev_month_df[(prev_month_df['curr']==page) & (prev_month_df['clicks']>=100)]['prev'].values,
                            *next_month_df[(next_month_df['prev']==page) & (next_month_df['clicks']>=100)]['curr'].values,
                            #*next_month_df[(next_month_df['curr']==page) & (next_month_df['clicks']>=100)]['prev'].values,
                            #page
                            ])
        inflow_prev = prev_month_df[prev_month_df['type']=='link'].groupby('curr')[['clicks']].agg(sum)
        inflow_central = central_df[central_df['type']=='link'].groupby('curr')[['clicks']].agg(sum)
        inflow_next = next_month_df[next_month_df['type']=='link'].groupby('curr')[['clicks']].agg(sum)

        inflow_prev = inflow_prev[inflow_prev.index.isin(pages_to_check)]
        inflow_central = inflow_central[inflow_central.index.isin(pages_to_check)]
        inflow_next = inflow_next[inflow_next.index.isin(pages_to_check)]

        merged = inflow_central.join(inflow_prev, lsuffix='_central', rsuffix='_prev')
        merged = merged.join(inflow_next, rsuffix='_next')
        merged['clicks_next'] = merged['clicks']
        merged=merged.drop('clicks',axis=1)
        merged['spike_ratio'] = (2*merged['clicks_central'])/(merged['clicks_prev']+merged['clicks_next'])
        next_step_pages = merged[merged['spike_ratio']>1.5].sort_values('clicks_central', ascending=False)
        return next_step_pages.index.values

    queue = [page]
    visited_pages = set()
    page_store.add(page)

    level_size = 1
    next_level_size = 0

    spike_level_nums = []

    while len(queue)!=0:
        node = queue.pop(0)
        level_size-=1
        if node not in visited_pages:
            new_pages = get_spiked_pages(page, central_df=sept, prev_month_df=aug, next_month_df=octo)
            next_level_size += len(new_pages)
            queue.extend(new_pages)
            visited_pages.add(node)
            page_store.update(new_pages)
        if level_size==0:
            spike_level_nums.append(next_level_size)
            display(year, spike_level_nums)
            level_size = next_level_size
            next_level_size = 0
    with open(f'./spikes/{year}/spiked_nodes_max.pkl', 'wb') as f:
        pickle.dump(page_store, f, protocol=pickle.HIGHEST_PROTOCOL)


2022

[123]

2022

[123, 15129]

2022

[123, 15129, 0]

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.