In [1]:
import sys

sys.path.append('/home/jovyan/work')
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [3]:
from helpers.logger import Logger
from helpers.vars import DUMPS_PATH, DATA_PATH, PRE_PATH, RES_PATH

### Load general data and derive user kind, localized dates and normalized titles


In [4]:
# Column headers of the MediaWiki history dumps, one name per line in the text file.
column_names = np.loadtxt(
    fname=f'{DATA_PATH}/mediawiki_history_columns.txt',
    dtype=str,
)

In [5]:
def process_lang_history(lang, column_names, dtypes, path=DUMPS_PATH, ending='tsv.bz2', years=[2018, 2019, 2020]):
    """Load the MediaWiki history dump(s) of one wiki into a single DataFrame.

    The function first tries the single "all-times" file ``{path}/{lang}.{ending}``.
    If that cannot be read it falls back to per-period files: for ``lang == 'en'``
    one file per month (``{lang}-{year}-{MM}``), for every other wiki one file per
    year (``{lang}-{year}``). Files that cannot be read are logged and skipped, so
    the result may cover only part of the requested period.

    Parameters
    ----------
    lang : str
        Wiki language code used to build the file names.
    column_names : iterable of str
        Column names passed to ``pd.read_csv`` via ``names=``.
    dtypes : type or dict
        Forwarded unchanged to ``pd.read_csv(dtype=...)``.
    path : str
        Directory containing the dump files.
    ending : str
        File extension (without the leading dot) of the dump files.
    years : list of int
        Years to load in the per-period fallback. The default list is never
        mutated by this function.

    Returns
    -------
    pandas.DataFrame
        The loaded rows; an empty ``DataFrame`` if no file could be read.
    """
    # Imported locally because the notebook only imports these modules in a
    # later cell than the one defining this function.
    import time
    import traceback

    def _read_dump(stem):
        # NOTE: `warn_bad_lines` / `error_bad_lines` are deprecated since pandas 1.3
        # and removed in 2.0 (replacement: on_bad_lines='warn'). They are kept here
        # to stay compatible with the pandas version this notebook was written for.
        return pd.read_csv(f'{path}/{stem}.{ending}', sep='\t', names=list(column_names), dtype=dtypes,
                           warn_bad_lines=True, error_bad_lines=False)

    # quick fix for small wikis: they ship as one file covering all times
    try:
        start = time.time()
        df_all = _read_dump(lang)
        Logger.instance().info(f'Loaded {lang} in {time.time() - start}')
        return df_all
    except Exception:
        # `except Exception` (not a bare except) so that interrupting the kernel
        # during a long load is not swallowed.
        traceback.print_exc()
        Logger.instance().info(f'PROBABLY EXPECTED ERROR: No "all-times" file for {lang}')

    # Collect the partial frames and concatenate once at the end; concatenating
    # inside the loop re-copies all previously loaded rows for every new file.
    frames = []
    for year in years:
        start = time.time()
        if lang == 'en':
            try:
                for month in range(1, 13):  # a missing month raises and ends this year
                    frames.append(_read_dump(f'{lang}-{year}-{month:02d}'))
                    Logger.instance().info(f'Loaded {lang}-{year}-{month:02d} in {time.time() - start}')
            except Exception as e:
                # Months loaded before the failure are kept; include the cause in
                # the log so unexpected errors are not mistaken for a missing file.
                Logger.instance().info(f'Error when processing {lang}-{year}-{month}: {e!r}')
        else:
            try:
                frames.append(_read_dump(f'{lang}-{year}'))
                Logger.instance().info(f'Loaded {lang}-{year} in {time.time() - start}')
            except Exception:
                traceback.print_exc()
                Logger.instance().info(f'Error when processing {lang}-{year}')

    if not frames:
        # pd.concat([]) raises; preserve the original "empty frame" result instead.
        return pd.DataFrame()
    return pd.concat(frames)

In [6]:
import swifter
import pendulum
from mw.lib import title as mw_t
import time
import traceback
from pathlib import Path

In [7]:
# Per-wiki configuration: the CSV maps each language code to a timezone name.
path_lang_codes = f'{DATA_PATH}/lang_code.csv'

lang_table = pd.read_csv(path_lang_codes)
# code -> timezone name (plain strings, used later for tz_convert)
tz_ = dict(lang_table.set_index("code")["timezone"])
# code -> pendulum timezone object
tz = dict(zip(tz_, map(pendulum.timezone, tz_.values())))
# The language codes processed by every loop below, in file order.
codes = list(tz)
codes

['en',
 'sv',
 'de',
 'fr',
 'nl',
 'it',
 'ja',
 'ca',
 'sr',
 'no',
 'ko',
 'fi',
 'da',
 'tr']

In [8]:
# TEST PARAMS! (please ignore)
#PRE_PATH = '../../data/coronawiki/testrun'
#DUMPS_PATH = '../../data/coronawiki/mw-history'

In [None]:
# Load the history of every configured wiki, keyed by language code.
# All columns are read as strings (dtypes=str).
df_dict = {}
for code in codes:
    df_dict[code] = process_lang_history(
        lang=code,
        column_names=column_names,
        dtypes=str,
        path=DUMPS_PATH,
    )

In [None]:
Path(f'{PRE_PATH}').mkdir(parents=True, exist_ok=True)

# Enrich every loaded history frame IN PLACE with derived columns:
#   user_kind, event_timestamp_t, date_utc, date, page_title_norm, page_title_historical_norm.
# A failure for one wiki is logged and the loop continues with the next one.
# The logged durations are cumulative since the start of that wiki's processing.
for code in codes:
    start = time.time()
    try:
        # Alias only: column assignments below modify df_dict[code] itself.
        df_code = df_dict[code]

        # === User kind
        # 'anonymous' when there is no user id, otherwise 'bot' when the user carries
        # a bot marker, otherwise 'account'. Computed column-wise with np.where; this
        # yields the same labels as a row-by-row apply but without calling a Python
        # function once per row.
        df_code['user_kind'] = np.where(
            df_code['event_user_id'].isna(),
            'anonymous',
            np.where(df_code['event_user_is_bot_by'].notna(), 'bot', 'account'),
        )
        Logger.instance('pipeline').info(f'User-Kind assignment for {code} in {time.time() - start}')

        # === Convert timestamp
        # Parsed element-wise (via swifter) rather than with one pd.to_datetime call on the column.
        df_code['event_timestamp_t'] = df_code.event_timestamp.swifter.apply(pd.to_datetime)
        # Interpret the naive timestamps as UTC and derive the UTC calendar day as a YYYYMMDD integer
        df_code["event_timestamp_t"] = df_code.event_timestamp_t.dt.tz_localize("UTC", ambiguous='NaT', nonexistent='NaT')
        df_code["date_utc"] = df_code.event_timestamp_t.dt.strftime("%Y%m%d").astype(int)
        # Convert to the wiki's configured timezone and derive the local calendar day (YYYYMMDD integer)
        df_code["event_timestamp_t"] = df_code.event_timestamp_t.dt.tz_convert(tz_[code])
        df_code["date"] = df_code.event_timestamp_t.dt.strftime("%Y%m%d").astype(int)
        Logger.instance('pipeline').info(f'Finished date assignment for {code} in {time.time() - start}')

        # === Normalize titles (missing titles stay NaN)
        df_code['page_title_norm'] = df_code.page_title.swifter.apply(lambda title: mw_t.normalize(str(title)) if not pd.isna(title) else np.nan)
        df_code['page_title_historical_norm'] = df_code.page_title_historical.swifter.apply(lambda title: mw_t.normalize(str(title)) if not pd.isna(title) else np.nan)
        Logger.instance('pipeline').info(f'Finished name normalization for {code} in {time.time() - start}')
    except Exception as e:
        traceback.print_exc()
        Logger.instance('pipeline').info(f'Error for {code}: {str(e)}')

In [None]:
# OPTIONAL checkpoint: write each preprocessed history frame to a gzip-compressed TSV
# under PRE_PATH. The original author notes that this step takes up lots of memory.
# A failure for one wiki is logged and does not stop the remaining dumps.
for code in codes:
    try:
        start = time.time()
        target = f'{PRE_PATH}/{code}_mwh_processed.tsv.gz'
        df_dict[code].to_csv(target, sep="\t", header=True, index=False, compression='gzip')
        elapsed = time.time() - start
        Logger.instance('pipeline').info(f'Dumping {code} done in {elapsed}')
    except Exception as e:
        traceback.print_exc()
        Logger.instance('pipeline').info(f'Error when saving {code}: {str(e)}')

### Group by date to count newcomers (self-created registrations, 1st and 5th edits)

In [12]:
from helpers.files import save_to_pickle

In [13]:
# Per wiki and local day: number of self-created registrations and number of users
# making their 1st / 5th revision. One frame per language code is stored in
# dict_creation and the whole dict is pickled afterwards.
dict_creation = {}
for code in codes:
    start = time.time()
    try:
        df_code = df_dict[code]

        # --- row filters (all history columns are strings, hence the 'true' / '1' comparisons)
        is_create = df_code.event_type == 'create'
        create_event_mask = (df_code.event_entity == 'user') & is_create
        create_revision_mask = (df_code.event_entity == 'revision') & is_create
        # NOTE(review): a row passes when EITHER bot column is empty; if the intent is
        # "flagged as bot in neither column" this should probably be `&` - confirm.
        no_bot_mask = df_code['event_user_is_bot_by'].isna() | df_code['event_user_is_bot_by_historical'].isna()
        self_creation_mask = df_code['event_user_is_created_by_self'] == 'true'
        no_anon_mask = df_code['event_user_is_anonymous'] != 'true'

        # --- users by registration
        registration_mask = create_event_mask & no_anon_mask & no_bot_mask & self_creation_mask
        group_creation = df_code[registration_mask].groupby(['date'])['event_user_id'].size()

        # --- users by n-th edit
        human_revision_mask = create_revision_mask & no_anon_mask & no_bot_mask
        # n=1
        edit_count_mask = df_code.event_user_revision_count == '1'
        first_edits = df_code[human_revision_mask & edit_count_mask]
        group_edit1 = first_edits.groupby(['date'])['event_user_id'].size().rename('edit_1')
        # n=5
        edit_count_mask = df_code.event_user_revision_count == '5'
        fifth_edits = df_code[human_revision_mask & edit_count_mask]
        group_edit5 = fifth_edits.groupby(['date'])['event_user_id'].size().rename('edit_5')

        # Align the three daily series on date; days missing from a series count as 0.
        daily_counts = pd.concat([group_creation, group_edit1, group_edit5], axis=1)
        dict_creation[code] = daily_counts.fillna(0)
        Logger.instance('pipeline').info(f'User creation computation for {code} done in {time.time() - start}')
    except Exception as e:
        traceback.print_exc()
        Logger.instance('pipeline').info(f'Error for {code}: {str(e)}')

save_to_pickle(f'{PRE_PATH}/dict_newcomers_selfcreated.pkl', dict_creation)
Logger.instance('pipeline').info(f'Finished newcomers')

02-19 07:37 : INFO : User creation computation for en done in 912.7182765007019
INFO:pipeline:User creation computation for en done in 912.7182765007019
02-19 07:37 : INFO : User creation computation for sv done in 32.329793214797974
INFO:pipeline:User creation computation for sv done in 32.329793214797974
02-19 07:40 : INFO : User creation computation for de done in 136.2126841545105
INFO:pipeline:User creation computation for de done in 136.2126841545105
02-19 07:43 : INFO : User creation computation for fr done in 173.75721073150635
INFO:pipeline:User creation computation for fr done in 173.75721073150635
02-19 07:43 : INFO : User creation computation for nl done in 33.34605574607849
INFO:pipeline:User creation computation for nl done in 33.34605574607849
02-19 07:45 : INFO : User creation computation for it done in 106.61678528785706
INFO:pipeline:User creation computation for it done in 106.61678528785706
02-19 07:46 : INFO : User creation computation for ja done in 60.48223471641

### Group By Date and Page and user_kind to get edits per day

In [14]:
# Daily edit statistics for main-namespace revisions, aggregated two ways:
#   dict_edits_byid    - per (date, page_id, user_kind)
#   dict_edits_bytitle - per (date, page_title_norm, user_kind)
# 'event_user_id' is aggregated with 'size', i.e. it ends up holding the number of
# rows in the group, not user ids.


def _as_set(values):
    """Collect the distinct values of a group into a set."""
    return set(values)


agg_by_id = {
    'event_user_id': 'size',
    'revision_text_bytes_diff': 'sum',
    'page_title': 'last',
    'page_title_norm': 'last',
    'page_title_historical_norm': 'last',
}
agg_by_title = {
    'event_user_id': 'size',
    'revision_text_bytes_diff': 'sum',
    'page_id': _as_set,
    'page_title': _as_set,
    'page_title_historical_norm': _as_set,
}

dict_edits_byid = {}  # grouped by id
dict_edits_bytitle = {}  # grouped by title
for code in codes:
    try:
        start = time.time()
        df_code = df_dict[code]
        create_revision_mask = (df_code.event_entity == 'revision') & (df_code.event_type == 'create')
        ns_mask = df_code.page_namespace == '0'

        # Byte diffs were read as strings: convert in place, unparsable/missing values become 0.
        bytes_diff = pd.to_numeric(df_code['revision_text_bytes_diff'], errors='coerce')
        df_code['revision_text_bytes_diff'] = bytes_diff.fillna(0)
        df_code_masked = df_code[create_revision_mask & ns_mask]

        by_id = df_code_masked.groupby(['date', 'page_id', 'user_kind'])
        dict_edits_byid[code] = by_id.agg(agg_by_id)
        by_title = df_code_masked.groupby(['date', 'page_title_norm', 'user_kind'])
        dict_edits_bytitle[code] = by_title.agg(agg_by_title)

        Logger.instance('pipeline').info(f'Grouped by user/user_kind/pageid for {code} done in {time.time() - start}')
    except Exception as e:
        traceback.print_exc()
        Logger.instance('pipeline').info(f'Error for {code}: {str(e)}')

save_to_pickle(f'{PRE_PATH}/dict_edits_byid.pkl', dict_edits_byid)
save_to_pickle(f'{PRE_PATH}/dict_edits_bytitle.pkl', dict_edits_bytitle)
Logger.instance('pipeline').info(f'Finished edits')

02-19 09:02 : INFO : Grouped by user/user_kind/pageid for en done in 4440.065267801285
INFO:pipeline:Grouped by user/user_kind/pageid for en done in 4440.065267801285
02-19 09:10 : INFO : Grouped by user/user_kind/pageid for sv done in 468.8935797214508
INFO:pipeline:Grouped by user/user_kind/pageid for sv done in 468.8935797214508
02-19 09:22 : INFO : Grouped by user/user_kind/pageid for de done in 723.7860300540924
INFO:pipeline:Grouped by user/user_kind/pageid for de done in 723.7860300540924
02-19 09:37 : INFO : Grouped by user/user_kind/pageid for fr done in 893.5918982028961
INFO:pipeline:Grouped by user/user_kind/pageid for fr done in 893.5918982028961
02-19 09:40 : INFO : Grouped by user/user_kind/pageid for nl done in 185.4799027442932
INFO:pipeline:Grouped by user/user_kind/pageid for nl done in 185.4799027442932
02-19 09:52 : INFO : Grouped by user/user_kind/pageid for it done in 763.2101054191589
INFO:pipeline:Grouped by user/user_kind/pageid for it done in 763.210105419158

### Group By Date and user_kind to get identity reverts per day (see above)

In [15]:
# Per wiki: daily counts, by user kind, of main-namespace revisions that were
# identity-reverted and of revisions that are themselves identity reverts.
dict_reverts = {}
index_names = ['date', 'user_kind']
for code in codes:
    try:
        start = time.time()
        df_code = df_dict[code]
        create_revision_mask = (df_code.event_entity == 'revision') & (df_code.event_type == 'create')
        ns_mask = df_code.page_namespace == '0'
        article_revision_mask = create_revision_mask & ns_mask

        # Count flagged revisions per (date, user_kind); the flags are the string 'true'.
        was_reverted = df_code.revision_is_identity_reverted == 'true'
        df_reverted = (
            df_code[article_revision_mask & was_reverted]
            .groupby(index_names)['revision_is_identity_reverted']
            .size()
        )
        is_revert = df_code.revision_is_identity_revert == 'true'
        df_reverts = (
            df_code[article_revision_mask & is_revert]
            .groupby(index_names)['revision_is_identity_revert']
            .size()
        )

        # Reindex against every date present in the history (for the user kinds that
        # occur in each series) so that days without any revert show up as 0.
        full_reverted_index = pd.MultiIndex.from_product(
            [df_code.date.unique(), df_reverted.index.levels[1]], names=index_names)
        df_reverted = df_reverted.reindex(full_reverted_index, fill_value=0)
        full_reverts_index = pd.MultiIndex.from_product(
            [df_code.date.unique(), df_reverts.index.levels[1]], names=index_names)
        df_reverts = df_reverts.reindex(full_reverts_index, fill_value=0)

        # Side by side; combinations present in only one of the two series get 0.
        combined = pd.concat([df_reverted, df_reverts], axis=1)
        dict_reverts[code] = combined.fillna(0)
        Logger.instance('pipeline').info(f'Computed reverts by {code} done in {time.time() - start}')
    except Exception as e:
        traceback.print_exc()
        Logger.instance('pipeline').info(f'Error for {code}: {str(e)}')

save_to_pickle(f'{PRE_PATH}/dict_reverts.pkl', dict_reverts)
Logger.instance('pipeline').info(f'Finished identity reverts')

02-19 10:43 : INFO : Computed reverts by en done in 383.1931080818176
INFO:pipeline:Computed reverts by en done in 383.1931080818176
02-19 10:43 : INFO : Computed reverts by sv done in 8.665184497833252
INFO:pipeline:Computed reverts by sv done in 8.665184497833252
02-19 10:44 : INFO : Computed reverts by de done in 51.96122360229492
INFO:pipeline:Computed reverts by de done in 51.96122360229492
02-19 10:45 : INFO : Computed reverts by fr done in 52.19099450111389
INFO:pipeline:Computed reverts by fr done in 52.19099450111389
02-19 10:45 : INFO : Computed reverts by nl done in 13.434366464614868
INFO:pipeline:Computed reverts by nl done in 13.434366464614868
02-19 10:46 : INFO : Computed reverts by it done in 41.403966426849365
INFO:pipeline:Computed reverts by it done in 41.403966426849365
02-19 10:46 : INFO : Computed reverts by ja done in 27.565186738967896
INFO:pipeline:Computed reverts by ja done in 27.565186738967896
Traceback (most recent call last):
  File "<ipython-input-15-13

## Combine edit dictionary with covid info

In [16]:
# List of COVID-related articles. The feather file was pre-generated from the
# .json list of COVID articles published at:
# https://covid-data.wmflabs.org/
path_covid = f'{DATA_PATH}/covid_linked.f'


def _normalize_covid_title(raw_title):
    """Normalize a title the same way as the page_title_norm column of the history frames."""
    return mw_t.normalize(str(raw_title))


# Every listed article is flagged covid=True; the 'index' column holds the article titles.
df_covid = pd.read_feather(path_covid).assign(covid=True)
df_covid['index'] = df_covid['index'].apply(_normalize_covid_title)

In [17]:
# Attach a boolean `covid` flag to the per-title daily edit statistics of every wiki:
# rows whose normalized title appears in that wiki's COVID article list get True,
# all other rows get False.
df_edits_covid = {}
for code, df_code in dict_edits_bytitle.items():
    covid_pages = df_covid[df_covid.site == f'{code}wiki']
    merged = df_code.reset_index().merge(
        covid_pages,
        how='left',
        left_on=['page_title_norm'],
        right_on=['index'],
    )
    # Unmatched rows have no covid value after the left join.
    merged = merged.fillna({'covid': False})
    # Drop the helper columns that came from the COVID list.
    df_edits_covid[code] = merged.drop(columns=['index', 'site', 'qid'])

save_to_pickle(f'{PRE_PATH}/dict_edits_bytitle_covid.pkl', df_edits_covid)

# Generate Final Aggregation

In [18]:
from helpers.preprocessing import aggregate_preprocess_results

In [19]:
# Combine the per-wiki edit (with covid flag), newcomer and revert dictionaries via the
# project helper. NOTE(review): judging by the captured log output, wikis missing from an
# input dict are logged as errors and appear to be left out of the result - confirm in
# helpers/preprocessing.py.
final_agg = aggregate_preprocess_results(codes, df_edits_covid, dict_creation, dict_reverts)
# Display the aggregated table as the cell output.
final_agg

reading and making sure all dates are filled


02-19 11:21 : INFO : Processing en took 400.5793981552124
INFO:pipeline:Processing en took 400.5793981552124


reading and making sure all dates are filled


02-19 11:21 : INFO : Processing sv took 25.778750896453857
INFO:pipeline:Processing sv took 25.778750896453857


reading and making sure all dates are filled


02-19 11:23 : INFO : Processing de took 74.87158846855164
INFO:pipeline:Processing de took 74.87158846855164


reading and making sure all dates are filled


02-19 11:24 : INFO : Processing fr took 74.31901431083679
INFO:pipeline:Processing fr took 74.31901431083679


reading and making sure all dates are filled


02-19 11:24 : INFO : Processing nl took 17.01649022102356
INFO:pipeline:Processing nl took 17.01649022102356


reading and making sure all dates are filled


02-19 11:25 : INFO : Processing it took 57.786693811416626
INFO:pipeline:Processing it took 57.786693811416626


reading and making sure all dates are filled


02-19 11:26 : INFO : Processing ja took 35.69470715522766
INFO:pipeline:Processing ja took 35.69470715522766
Traceback (most recent call last):
  File "/home/jovyan/work/helpers/preprocessing.py", line 42, in aggregate_preprocess_results
    df_gb = process_edits(dict_edits, code)
  File "/home/jovyan/work/helpers/preprocessing.py", line 10, in process_edits
    df_code = dict_date[code].reset_index()
KeyError: 'ca'
02-19 11:26 : INFO : Error for ca: 'ca'
INFO:pipeline:Error for ca: 'ca'
02-19 11:26 : INFO : Processing ca took 0.002092599868774414
INFO:pipeline:Processing ca took 0.002092599868774414


reading and making sure all dates are filled
reading and making sure all dates are filled


02-19 11:26 : INFO : Processing sr took 36.63195300102234
INFO:pipeline:Processing sr took 36.63195300102234


reading and making sure all dates are filled


02-19 11:26 : INFO : Processing no took 6.5973920822143555
INFO:pipeline:Processing no took 6.5973920822143555


reading and making sure all dates are filled


02-19 11:27 : INFO : Processing ko took 16.5759117603302
INFO:pipeline:Processing ko took 16.5759117603302


reading and making sure all dates are filled


02-19 11:27 : INFO : Processing fi took 5.3210015296936035
INFO:pipeline:Processing fi took 5.3210015296936035


reading and making sure all dates are filled


02-19 11:27 : INFO : Processing da took 2.4243338108062744
INFO:pipeline:Processing da took 2.4243338108062744
Traceback (most recent call last):
  File "/home/jovyan/work/helpers/preprocessing.py", line 42, in aggregate_preprocess_results
    df_gb = process_edits(dict_edits, code)
  File "/home/jovyan/work/helpers/preprocessing.py", line 10, in process_edits
    df_code = dict_date[code].reset_index()
KeyError: 'tr'
02-19 11:27 : INFO : Error for tr: 'tr'
INFO:pipeline:Error for tr: 'tr'
02-19 11:27 : INFO : Processing tr took 0.0016481876373291016
INFO:pipeline:Processing tr took 0.0016481876373291016


reading and making sure all dates are filled


Unnamed: 0,date,covid,user_kind,count,rev_len_sum,actor_user,edit_1,edit_5,revision_is_identity_reverted,revision_is_identity_revert,code
0,2018-01-01,False,account,71799,1.929916e+09,4178.0,1297.0,348.0,6011,6683,en
1,2018-01-01,False,anonymous,23740,7.026144e+08,0.0,0.0,0.0,7155,1083,en
2,2018-01-01,False,bot,7196,1.487072e+08,0.0,0.0,0.0,257,627,en
3,2018-01-01,True,account,4,1.942120e+05,4178.0,1297.0,348.0,6011,6683,en
4,2018-01-02,False,account,81001,1.992566e+09,5155.0,1905.0,485.0,6355,7728,en
...,...,...,...,...,...,...,...,...,...,...,...
3423,2020-11-30,True,account,1,1.020430e+05,16.0,12.0,2.0,5,29,da
3424,2020-12-01,False,account,307,3.912818e+06,10.0,12.0,2.0,19,35,da
3425,2020-12-01,False,anonymous,134,1.528426e+06,0.0,0.0,0.0,44,11,da
3426,2020-12-01,False,bot,2,1.804700e+04,0.0,0.0,0.0,0,0,da


In [20]:
# Persist the final aggregation as a gzip-compressed TSV, creating RES_PATH if needed.
results_dir = Path(f'{RES_PATH}')
results_dir.mkdir(parents=True, exist_ok=True)
final_agg.to_csv(
    f'{RES_PATH}/aggregated.tsv.gz',
    sep="\t",
    index=False,
    compression="gzip",
)