# Example Computation Pipeline
## for results presented in "On the Relation of Edit Behavior, Link Structure, and Article Quality on Wikipedia" by Thorsten Ruprechter, Tiago Santos, and Denis Helic

### Import statements and assignment of frequently used variables

In [None]:
import csv

import editbehavior as eb
import linkstructure as ls
import stats
import utils

# --- Edit-action labels -----------------------------------------------------
# The three edit labels used throughout the pipeline.
labels = ['Content', 'Format', 'WikiContext']

# --- Data locations ---------------------------------------------------------
# Root folder for input files, and the folder holding the article data.
data_dir = 'data'
article_dir = 'data/articles'

# Sub-folders of ``article_dir``; each one is handled as its own category below.
article_folders = [
    'a', 'b', 'c',
    'featured', 'good',
    'cw',
]

# Only the first element of the helper's result is needed in this example.
label_combinations = eb.get_label_combinations(labels)[0]

### Load Results of External Frameworks

In [None]:
# Edit label metrics require the features and labels produced by
# Yang et al.'s framework. This example loads them from a pickle file.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
features, y_labels = utils.load_from_file_pickle('<Filename for Features>', data_dir)

# Link metrics require the WikiLinkGraphs dataset.
# Optional postprocessing shown here: resolve Wikipedia redirects and drop the
# redirect pages themselves (nodes with a single outgoing link).
# If the current task needs no postprocessing, the WikiLinkGraphs dataset can
# be loaded directly instead.
# CSV files are used in this example, but any file type would do.
# NOTE(review): `escape` receives a csv *quoting* constant - confirm that this
# is what utils.read_csv expects.
wikilinkgraphs = ls.resolve_redirects(
    utils.read_csv(
        '<Filename for Wikipedia redirects>.csv',
        data_dir,
        escape=csv.QUOTE_ALL,
    ),
    utils.read_csv(
        '<Filename for WikiLinkGraphs>.csv',
        data_dir,
        delimiter='\t',
    ),
)


### Calculating Relative Frequencies and Transition Probabilities for Edit Actions

In [None]:
# Fit the random forest on the features/labels loaded above
random_forest = eb.train_random_forest(features, y_labels)

# In this example every category lives in its own folder below `article_dir`;
# the labeled articles are collected per category folder.
category_dict = {}
for cat_folder in article_folders:
    folder_path = utils.get_path(article_dir, cat_folder)
    # any other article list could be substituted here
    articles = utils.get_article_files_pickle(folder_path)
    category_dict[cat_folder] = eb.label_article_folder(
        random_forest, articles, cat_folder, article_dir)

# Transition probabilities and relative frequencies,
# each returned per article and per category
tp_articles, tp_categories = eb.calculate_transition_probabilities(category_dict, labels)
rf_articles, rf_categories = eb.calculate_relative_frequencies(category_dict)

# Optional: revision info (articles, revs/article, etc.) plus macro and micro
# results for the categories
cat_revcount, rf_cat_macro, rf_cat_micro = eb.get_micro_and_macro_arrays(
    rf_categories, label_combinations)

### Permutation Tests for Relative Frequencies and Transition Probabilities

In [None]:
# Run the permutation tests on the per-article results:
# first on the transition probabilities, then on the relative frequencies
# (the latter additionally takes the label combinations).
# NOTE(review): the shape of the returned results is defined in the `stats`
# module - check there before interpreting tp_sig / rf_sig.
tp_sig = stats.permutation_test(tp_articles)
rf_sig = stats.permutation_test_frequency(rf_articles, label_combinations)

### Link Analysis of WikiLinkGraphs

In [None]:
# Build graph
# NOTE(review): G is presumably a networkx DiGraph - confirm in `linkstructure`.
G = ls.build_graph_from_wikilinkgraphs(wikilinkgraphs)

# Specify which articles belong to which category - depends on current task.
# This is a placeholder: assign the category/article assignment for your task
# before running the helper below (results of the earlier steps can be reused).
# NOTE(review): the exact structure expected by ls.order_metric_by_category is
# not visible here - confirm in `linkstructure`.
articles_per_category = None

# Calculate metric (e.g., out-degree), keyed by utils.normalize_article(node).
# Wrapping the result in dict(...) keeps this working regardless of whether
# out_degree() returns a plain dict (networkx 1.x) or a degree view of
# (node, degree) pairs, which has no .items() method (networkx >= 2.0).
out_degree_dict = {
    utils.normalize_article(k): deg
    for k, deg in dict(G.out_degree()).items()
}

# Use helper to pick apart article results per category
# This step returns the outdegrees per category, in nested arrays.
categories, outdegree_per_cat, not_found = ls.order_metric_by_category(out_degree_dict, articles_per_category)
