## Detecting Domination in Wikipedia
Hypothesis: destructive domination occurs when updates to a Wikipedia page are performed by relatively few users, who (explicitly) exclude other users by reverting their good-quality edits.

In [None]:
import os
import argparse
import numpy as np
import pandas as pd
import sys


# Preprocessing the logfile
node_folder = 'wikidata/'
timestamp_col = 'revtime'


def dateparse(x):
    """Parse one 'YYYY-MM-DDTHH:MM:SSZ' timestamp string."""
    # pd.datetime no longer exists in current pandas; pd.to_datetime with an
    # explicit format is the supported way to do the same parse.
    return pd.to_datetime(x, format='%Y-%m-%dT%H:%M:%SZ')


def _load_multiuser_pages(csv_path):
    """Read one monthly CSV and keep only pages edited by more than one user."""
    monthly = pd.read_csv(csv_path)
    monthly[timestamp_col] = pd.to_datetime(monthly[timestamp_col])
    editors_per_page = monthly.groupby('pagetitle')['username'].transform('nunique')
    return monthly[editors_per_page > 1]


# (filename prefix, month numbers) of every monthly export to consolidate.
monthly_exports = [
    (node_folder + 'benign_2013_', range(1, 13)),
    (node_folder + 'benign_2014_', range(1, 8)),
]

# Read all the data into one data frame, removing all pages that have only
# single-user edits. The filtered frame is what gets collected: previously the
# filter result was computed and then discarded for every file but the first.
# NOTE(review): the filter is applied per monthly file, as the original code
# attempted; a page with a single editor in each month is therefore dropped
# even if the editors differ across months -- confirm this is intended.
monthly_frames = []
for nodefile, months in monthly_exports:
    for i in months:
        monthly_frames.append(_load_multiuser_pages(nodefile + str(i) + '.csv'))

# One concat instead of repeated DataFrame.append (removed in pandas 2.0);
# each file's original row index is preserved, as append did.
df_complete = pd.concat(monthly_frames)

df_complete.to_csv(node_folder + "wiki_consolidated.csv")
df_complete




In [None]:
import os
import argparse
import numpy as np
import pandas as pd
import sys
node_folder = 'wikidata/'
timestamp_col = 'revtime'

# The consolidated CSV was written together with its DataFrame index, so the
# first column is that saved index. Reading it back as the index keeps it from
# turning into a stray "Unnamed: 0" data column in every later frame/export.
df_complete = pd.read_csv(node_folder + "wiki_consolidated.csv", index_col=0)
# CSV stores timestamps as text; convert them back to datetimes.
df_complete[timestamp_col] = pd.to_datetime(df_complete[timestamp_col])

In [None]:
# DataFrame.drop returns a new frame; assign it back, otherwise the two
# columns are never actually removed from df_complete.
df_complete = df_complete.drop(['cluebotRevert', 'stiki_REP_USER'], axis=1)

def get_reverted_user(group):
    """Attach to each revision the username of the next-later revision.

    The rows of ``group`` are ordered newest-first by 'revtime' (stable
    sort), and a 'rev_username' column is added holding the 'username' of
    the row directly above, i.e. the revision that follows chronologically.
    Rows with no such value (the newest revision, or a missing username on
    the following revision) get the string 'NA'.

    NOTE(review): presumably the following editor is treated as the one who
    reverted the row's edit -- confirm against how isReverted is defined.

    Returns the sorted copy; the frame passed in is left unchanged.
    """
    newest_first = group.sort_values('revtime', ascending=False, kind='mergesort')
    following_editor = newest_first['username'].shift(1).fillna('NA')
    return newest_first.assign(rev_username=following_editor)


# Start from an unnamed, freshly numbered row index before grouping.
df_complete.index.name = None
df_complete.reset_index(drop=True, inplace=True)

# Run get_reverted_user over each page's revisions to add 'rev_username'.
revisions_by_page = df_complete.groupby('pagetitle', as_index=False)
df_complete = revisions_by_page.apply(get_reverted_user)
print(df_complete.count())

df_complete.index.name = None
df_complete.head(60)

In [None]:
# Show the number of non-null values in each column of df_complete.
df_complete.count()

In [None]:
# Save the frame that now carries 'rev_username' (written before the index
# is renumbered below, so the file keeps the index produced by the apply step).
df_complete.to_csv(node_folder + "wiki_processed_rev.csv")

# Distinct editors per page over all retained revisions.
df_total_users = df_complete.groupby('pagetitle')['username'].agg(['nunique'])
print(df_total_users.count())
df_total_users.reset_index(inplace=True)
df_complete.reset_index(drop=True, inplace=True)

# Reverted edits whose author differs from the 'rev_username' on the same row.
was_reverted = df_complete.isReverted == True
by_other_user = df_complete['username'] != df_complete['rev_username']
df_true = (
    df_complete[was_reverted & by_other_user]
    .reset_index(drop=True)
    .sort_values(['revtime'], ascending=True, kind='mergesort')
)
page_list = list(df_true['pagetitle'].unique())

df_true.to_csv(node_folder + "wiki_revertdata.csv")

# Per-page stiki_score statistics and distinct-author count of those edits.
reverted_by_page = df_true.groupby('pagetitle')
df_nusers = reverted_by_page['username'].agg(['nunique'])
df_min_max_score = (
    reverted_by_page['stiki_score']
    .agg(['mean', 'count', 'median'])
    .merge(df_nusers, on='pagetitle', how='left')
    .sort_values(['nunique'], ascending=False, kind='mergesort')
)
df_min_max_score.reset_index(inplace=True)

# Leave out pages whose title contains "Wikipedia".
title_has_wikipedia = df_min_max_score['pagetitle'].str.contains("Wikipedia")
# NOTE(review): the three rows with the most distinct authors are skipped
# here; the reason is not visible in this notebook -- confirm it is intended.
df_min_max_score = df_min_max_score[~title_has_wikipedia].iloc[3:]
df_min_max_score

In [None]:
# Non-reverted edits, restricted to the pages listed in page_list.
not_reverted = df_complete.isReverted == False
on_listed_page = df_complete.pagetitle.isin(page_list)
df_false = df_complete[not_reverted & on_listed_page]
print(df_false)
df_false.to_csv(node_folder + "wiki_non_revertdata.csv")

# Renumber the rows, then order them chronologically (stable sort).
df_false = (
    df_false
    .reset_index(drop=True)
    .sort_values(['revtime'], ascending=True, kind='mergesort')
)

# Per-page stiki_score statistics and distinct-author count of those edits.
non_reverted_by_page = df_false.groupby('pagetitle')
df_nfusers = non_reverted_by_page['username'].agg(['nunique'])
df_min_max_fscore = (
    non_reverted_by_page['stiki_score']
    .agg(['mean', 'max', 'count', 'median'])
    .merge(df_nfusers, on='pagetitle', how='left')
    .sort_values(['nunique'], ascending=False, kind='mergesort')
    .reset_index()
)

# Leave out pages whose title contains "Wikipedia", then prefix every
# statistic with 'no_revert_' so it stays distinguishable after merging.
title_has_wikipedia = df_min_max_fscore['pagetitle'].str.contains("Wikipedia")
df_min_max_fscore = df_min_max_fscore[~title_has_wikipedia].rename(
    columns={
        'mean': 'no_revert_mean',
        'max': 'no_revert_max',
        'count': 'no_revert_count',
        'median': 'no_revert_median',
        'nunique': 'no_revert_users',
    }
)

# Prefix the reverted-edit statistics with 'revert_'. That table holds
# 'mean', 'count', 'median' and 'nunique' (it has no 'max' column), so the
# median must be renamed via the 'median' key; mapping a non-existent 'max'
# key left 'median' unrenamed. The result is assigned back rather than
# renamed in place because df_min_max_score is a slice of a larger frame.
df_min_max_score = df_min_max_score.rename(
    columns={
        'mean': 'revert_mean',
        'median': 'revert_median',
        'count': 'revert_count',
        'nunique': 'revert_users',
    }
)

df_min_max_fscore

In [None]:
# Pool the non-reverted and reverted edits and take the median stiki_score
# per page. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0; row order and indices are the same as append produced.
df_ncomplete = pd.concat([df_false, df_true])
df_total_rscore = df_ncomplete.groupby('pagetitle')['stiki_score'].agg(['median'])

# Combine, per page: reverted-edit stats, non-reverted-edit stats, the pooled
# median score, and the total distinct-editor count.
df_final = df_min_max_score.merge(df_min_max_fscore, on='pagetitle', how='left')
df_final = df_final.merge(df_total_rscore, on='pagetitle', how='right')
df_final = df_final.merge(df_total_users, on='pagetitle', how='right')

# The right merges add pages that lack some of the statistics; drop every
# row that has a missing value in any column.
df_final.dropna(inplace=True)
df_final

In [None]:
# Write the merged per-page table to CSV inside node_folder.
df_final.to_csv(node_folder + "wiki_domination_h1_analysis.csv")