# Prepare Data

In [None]:
# Import packages
import pandas as pd

# Choose which numbered revisions file to process and derive its file name
file_index = 19
import_file = 'revisions' + str(file_index) + '_complete.csv'

# Read the CSV and renumber the rows 0..n-1, discarding the loaded index
revisions_df = pd.read_csv(import_file).reset_index(drop=True)
revisions_df

In [None]:
# Tokenize the words and find the difference between new and old texts 
import re
from tqdm import tqdm
import copy
import numpy as np

from collections import Counter

diff_lst = []
new_text_lst = []
old_text_lst = []
index_lst = []
i = 0


def _tokenize(text):
    """Return the lower-cased word tokens of ``text``.

    The same cleaning steps are applied to both the new and the old revision
    text: links, non-alphanumeric characters, words containing digits,
    stand-alone ``n`` and stand-alone ``s`` are replaced by spaces, runs of
    spaces are collapsed, and the result is split on single spaces. A text
    that cleans down to nothing yields ``['']``.
    """
    # remove links, then take the repr of a one-element list.
    # NOTE(review): the list wrapper makes str() escape control characters
    # (a newline becomes backslash + "n"); presumably the "line break
    # marker" step below exists to clean that up. Kept as-is so the tokens
    # stay identical to previously parsed files.
    cleaned = str([re.sub(r'http\S+', '', text)])

    cleaning_patterns = (
        r'[\W_]',        # all non-alphanumeric characters
        r'\S*\d+\S*',    # all words with numbers
        r'\sn\s',        # line break markers
        r'\ss\s',        # hanging contractions or possessives
        ' +',            # excess spaces
    )
    for pattern in cleaning_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    return cleaned.lower().strip().split(' ')


def _multiset_difference(tokens, other):
    """Return the items of ``tokens`` not matched one-for-one in ``other``.

    Order follows ``tokens``; each item of ``other`` can cancel at most one
    occurrence. Neither input list is modified.
    """
    unmatched = Counter(other)
    leftover = []
    for token in tokens:
        if unmatched[token] > 0:
            unmatched[token] -= 1
        else:
            leftover.append(token)
    return leftover


# Rows without new text cannot be tokenized. Drop them in a single pass
# rather than re-copying the dataframe once per missing row inside the loop.
revisions_df = revisions_df.dropna(subset=['new_text']).copy()

rows = zip(
    revisions_df.index,
    revisions_df['new_text'],
    revisions_df['old_text'],
    revisions_df['rev_id_prev'],
)
for index, new_text, old_text, rev_id_prev in tqdm(rows, total=len(revisions_df)):
    text_new = _tokenize(new_text)
    new_text_lst.append(text_new)
    index_lst.append(index)

    # repeat for old text
    if rev_id_prev == 0 or not isinstance(old_text, str):
        # No previous revision: store the same placeholder as before, but
        # diff against an empty token list so every new word counts as added
        # (previously text_old was undefined or left over from the prior row).
        old_text_lst.append([''])
        text_old = []
    else:
        text_old = _tokenize(old_text)
        old_text_lst.append(text_old)

    # Find the difference between the two texts: words added by the edit
    # followed by words removed by it. Both directions are computed from the
    # untouched token lists, so repeated words are counted correctly.
    added = _multiset_difference(text_new, text_old)
    removed = _multiset_difference(text_old, text_new)
    diff_lst.append(added + removed)

revisions_df['diff_text'] = diff_lst
revisions_df['new_text'] = new_text_lst
revisions_df['old_text'] = old_text_lst

revisions_df.to_csv('revisions' + str(file_index) + '_parsed.csv', index=False)

# Edits Model

In [None]:
# Merge all edits that share the same page, country and org into one row by
# concatenating their diff token lists
edit_columns_to_discard = ['redirect', 'ns', 'rev_id', 'timestamp', 'ip', 'username', 'comment', 'byt', 'rev_id_prev']
edit_group_keys = ['page_id','page_title', 'country', 'org']

revisions_df_combined = (
    copy.deepcopy(revisions_df)
    .drop(edit_columns_to_discard, axis=1)
    .groupby(edit_group_keys)
    .agg({'diff_text':'sum'})
    .reset_index()
)

# Save the per-group edits for the edits model
edits_output_file = 'revisions' + str(file_index) + '_read4edits.csv'
revisions_df_combined.to_csv(edits_output_file, index=False)
revisions_df_combined

# Article Model

In [None]:
# Convert the timestamp so it can be analyzed
import datetime

def _epoch_seconds(time_string):
    """Convert a timestamp string to seconds since 1970-01-01 00:00:00.

    The fields are read from fixed character positions
    (``YYYY?MM?DD?HH?MM?SS``; separator characters are ignored). Values that
    cannot be parsed (missing, non-string, malformed or out-of-range) map to
    ``0.0``, so they sort before any real timestamp after 1970.
    """
    try:
        moment = datetime.datetime(
            int(time_string[0:4]),
            int(time_string[5:7]),
            int(time_string[8:10]),
            int(time_string[11:13]),
            int(time_string[14:16]),
            int(time_string[17:19]),
        )
    except (TypeError, ValueError):
        # TypeError: value is not sliceable (e.g. NaN);
        # ValueError: non-numeric field or impossible date/time.
        return 0.0
    # Time minus epoch, so that later edits get larger values and idxmax()
    # below really selects the most recent revision.
    return (moment - datetime.datetime(1970, 1, 1)).total_seconds()


# Keep only the columns needed for the article model (drop() returns a new
# dataframe, so revisions_df itself is left untouched)
new_df_combined = revisions_df.drop(['redirect', 'ns', 'rev_id', 'ip', 'username', 'comment', 'byt', 'rev_id_prev'], axis = 1)

# Convert the timestamp so it can be compared numerically
new_df_combined['timestamp'] = new_df_combined['timestamp'].map(_epoch_seconds).astype(float)

# For each page / country / org group, keep only the most recently edited
# version of the page
latest_rows = new_df_combined.groupby(['page_id','page_title', 'country', 'org'])['timestamp'].idxmax()
new_df_combined = new_df_combined.loc[latest_rows].reset_index(drop = True)

new_df_combined.to_csv('article' + str(file_index) + '_read4article.csv', index=False)
new_df_combined