In [None]:
# Mount Google Drive so the files under /content/drive used by the cells below
# are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import urllib
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import os

Mounted at /content/drive


In [None]:
# Load cumulative_word_count_past_month.tsv (tab-separated). It already contains
# all the confounders that will be used in the matching.
df = pd.read_csv('/content/drive/MyDrive/ExMachina/cumulative_word_count_past_month.tsv', delimiter='\t')

In [None]:
# Identify each thread by the pair (page_id, thread).
# Zipping the two columns builds the same tuples as a row-wise
# df.apply(..., axis=1) but avoids constructing a Series object for every row.
df['thread_id'] = list(zip(df['page_id'], df['thread']))

In [None]:
# One row per thread: page/thread identifiers, the number of non-null comments,
# the number of distinct user_text values, whether any comment is flagged as an
# attack, and the sum of the is_attack flags.
# The per-thread mean of cumulative_word_count_past_month is intentionally not
# aggregated here; it is derived separately from one row per (thread, user).
thread_stats = (
    df.groupby('thread_id')
    .agg(
        page_id=('page_id', 'first'),
        thread=('thread', 'first'),
        num_comments=('comment', 'count'),
        num_users=('user_text', 'nunique'),
        has_attack=('is_attack', lambda flags: any(flags == 1)),
        num_attack_comments=('is_attack', 'sum'),
    )
    .reset_index()
)
thread_stats

Unnamed: 0,thread_id,page_id,thread,num_comments,num_users,has_attack,num_attack_comments
0,"(692.0, 0)",692.0,0,3,1,False,0
1,"(692.0, 1)",692.0,1,2,1,False,0
2,"(722.0, 0)",722.0,0,2,2,False,0
3,"(722.0, 1)",722.0,1,3,2,False,0
4,"(722.0, 2)",722.0,2,7,3,False,0
...,...,...,...,...,...,...,...
149101,"(48896727.0, 1)",48896727.0,1,6,2,False,0
149102,"(48901009.0, 0)",48901009.0,0,8,2,False,0
149103,"(48901009.0, 1)",48901009.0,1,2,2,False,0
149104,"(48906777.0, 0)",48906777.0,0,2,1,False,0


In [None]:
# Per-thread mean of cumulative_word_count_past_month, counting each user once.
# Rows with a missing user_id are excluded; for every (thread_id, user_id) pair
# the first non-null value of each column is kept, and those per-user values are
# then averaged within the thread.
filtered_df = df.dropna(subset=['user_id'])
grouped = filtered_df.groupby(['thread_id', 'user_id']).first().reset_index()
mean_values = grouped.groupby('thread_id')['cumulative_word_count_past_month'].mean()
mean_values_df = mean_values.reset_index().rename(
    columns={'cumulative_word_count_past_month': 'mean_cumulative_word_count'}
)

In [None]:
# Attach the per-thread mean to the thread statistics. The left join keeps every
# thread; threads absent from mean_values_df get NaN in the new column.
# Note: re-running this cell merges again and produces suffixed duplicate columns.
thread_stats = pd.merge(thread_stats, mean_values_df, how='left', on='thread_id')

In [None]:
import math
import numpy as np
import scipy
from scipy.stats import binom, hypergeom
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [None]:
# Convert the boolean has_attack flag to 0/1 so it can serve as the classifier
# label and be compared with == 1 / == 0 when the two arms are split.
thread_stats['has_attack'] = thread_stats['has_attack'].astype(int)

In [None]:
# After the left merge, threads whose comments all lack a user_id have no
# per-thread mean and therefore hold NaN; treat them as zero prior activity.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that is chained in-place mutation, which pandas warns about
# and which does not update thread_stats when Copy-on-Write is enabled.
thread_stats['mean_cumulative_word_count'] = thread_stats['mean_cumulative_word_count'].fillna(0)

In [None]:
# Propensity model: logistic regression of has_attack on the three thread-level
# confounders. The fitted model scores the same rows it was trained on, and the
# second predict_proba column (the has_attack == 1 class for 0/1 labels) is
# stored as the thread's propensity score.
confounder_columns = ["num_comments", "num_users", "mean_cumulative_word_count"]
confounder_matrix = thread_stats[confounder_columns]
propensity = LogisticRegression()
propensity = propensity.fit(confounder_matrix, thread_stats['has_attack'])
pscore = propensity.predict_proba(confounder_matrix)[:, 1]
thread_stats['Propensity'] = pscore

In [None]:
# Split the threads into the treated arm (at least one attack comment) and the
# control arm (no attack comment), then report the size of each arm.
has_attack_mask = thread_stats['has_attack'] == 1
no_attack_mask = thread_stats['has_attack'] == 0
attack_df = thread_stats[has_attack_mask]
no_attack_df = thread_stats[no_attack_mask]
print(attack_df.shape[0])
print(no_attack_df.shape[0])

2281
146825


In [None]:
import pandas as pd


# Greedy 1:1 nearest-neighbour matching on the propensity score, restricted to
# threads of the same page and without replacement: each attack thread is paired
# with the not-yet-used no-attack thread of its page whose propensity is closest.
# Attack threads whose page has no remaining no-attack thread stay unmatched.
attack_df = thread_stats[thread_stats['has_attack'] == 1]
no_attack_df = thread_stats[thread_stats['has_attack'] == 0]

# Split both arms by page once, instead of re-filtering the full frames for
# every page in df. Only pages that contain an attack thread can yield a pair,
# so the control arm is narrowed to those pages before it is split.
attack_by_page = {
    page: rows for page, rows in attack_df.groupby('page_id', sort=False)
}
candidate_controls = no_attack_df[no_attack_df['page_id'].isin(list(attack_by_page))]
control_by_page = {
    page: rows for page, rows in candidate_controls.groupby('page_id', sort=False)
}

matched_pairs = []
matched_control_labels = []  # index labels of the controls consumed by a match

# Walk the pages in their order of appearance in df so the pair order is stable.
for page_id in df['page_id'].unique():
    if page_id not in attack_by_page or page_id not in control_by_page:
        continue
    attack_threads = attack_by_page[page_id]
    no_attack_threads = control_by_page[page_id]

    for _, attack_thread in attack_threads.iterrows():
        # The pool only shrinks, so once it is empty no later attack thread of
        # this page can be matched either.
        if no_attack_threads.empty:
            break

        # Find the closest propensity score among the remaining no-attack threads
        # (idxmin keeps the first candidate on ties).
        propensity_diff = no_attack_threads['Propensity'].sub(attack_thread['Propensity']).abs()
        closest_match_index = propensity_diff.idxmin()
        closest_match = no_attack_threads.loc[closest_match_index]

        matched_pairs.append([attack_thread['thread_id'], closest_match['thread_id']])

        # Remove the matched no-attack thread so it cannot be matched again.
        # thread_stats has one row per thread, so dropping by index label removes
        # exactly that thread.
        matched_control_labels.append(closest_match_index)
        no_attack_threads = no_attack_threads.drop(index=closest_match_index)

# Leave no_attack_df holding only the controls that were never matched.
no_attack_df = no_attack_df.drop(index=matched_control_labels)

matched_pairs_df = pd.DataFrame(matched_pairs, columns=['Thread_ID_1', 'Thread_ID_2'])


In [None]:
# Every thread id that takes part in a matched pair (treated or control),
# followed by how many distinct threads that is.
flat_matched_pairs = {matched_id for pair in matched_pairs for matched_id in pair}
len(flat_matched_pairs)

4492

In [None]:
# Keep only the comment rows that belong to a matched thread and save them as TSV.
# Note: filtered_df is rebound here; it no longer refers to the user_id-filtered
# frame built earlier in the notebook.
in_matched_thread = df['thread_id'].isin(flat_matched_pairs)
filtered_df = df[in_matched_thread]
filtered_df.to_csv(
    '/content/drive/MyDrive/ExMachina/matched_thread_2014_2015.tsv',
    sep='\t',
    index=False,
)

In [None]:
# Display the comment rows of the matched threads (pandas truncates the rendering).
filtered_df

Unnamed: 0,rev_id,comment,raw_comment,timestamp,page_id,page_title,user_id,user_text,admin,predicted_prob,is_attack,time_diff,page_total_comments,thread,cumulative_word_count_past_month,thread_id
10,665467489,"""NEWLINENEWLINE== See:also ==NEWLINENEWLINEWha...","""NEWLINENEWLINE== See:also ==NEWLINENEWLINEWha...",2015-06-04 13:19:22+00:00,722.0,Animal,,135.19.158.135,0.0,0.015985,0,87 days 22:24:03,20,2,0,"(722.0, 2)"
11,665756115,"""NEWLINE:::Nope. Two sumpsimians can be pompou...","""NEWLINE:::Nope. Two sumpsimians can be pompou...",2015-06-06 13:24:53+00:00,722.0,Animal,7764849.0,LlywelynII,0.0,0.003372,0,2 days 00:05:31,20,2,26566,"(722.0, 2)"
12,665756756,"""NEWLINE:Seems utterly appropriate to me. The ...","""NEWLINE:Seems utterly appropriate to me. The ...",2015-06-06 13:31:43+00:00,722.0,Animal,7764849.0,LlywelynII,0.0,0.005268,0,0 days 00:06:50,20,2,26654,"(722.0, 2)"
13,665756831,Thomas Henry |author-link=Thomas Henry Huxley,Thomas Henry |author-link=Thomas Henry Huxley,2015-06-06 13:32:34+00:00,722.0,Animal,7764849.0,LlywelynII,0.0,0.058566,0,0 days 00:00:51,20,2,26748,"(722.0, 2)"
14,665757216,author-link=Peter Chalmers Mitchell |,author-link=Peter Chalmers Mitchell |,2015-06-06 13:36:42+00:00,722.0,Animal,7764849.0,LlywelynII,0.0,0.029225,0,0 days 00:04:08,20,2,26753,"(722.0, 2)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171469,696045438,NEWLINE: Looks like it was still (just) Mandat...,NEWLINE: Looks like it was still (just) [[Mand...,2015-12-20 15:24:24+00:00,48866568.0,Mohamed Hadid,1450053.0,Edwardx,0.0,0.023928,0,0 days 03:13:21,21,0,1278,"(48866568.0, 0)"
1171470,696080618,NEWLINE::: Does this mean he's British?,NEWLINE::[[User:Edwardx]]: Does this mean he's...,2015-12-20 20:05:35+00:00,48866568.0,Mohamed Hadid,1673502.0,Zigzig20s,0.0,0.060824,0,0 days 04:41:11,21,0,14680,"(48866568.0, 0)"
1171471,696085737,NEWLINE:::I don't think the people living in t...,NEWLINE:::I don't think the people living in t...,2015-12-20 20:47:37+00:00,48866568.0,Mohamed Hadid,1450053.0,Edwardx,0.0,0.015987,0,0 days 00:42:02,21,0,1303,"(48866568.0, 0)"
1171472,696543469,NEWLINENEWLINEReally? He is a Palestinian. Sto...,NEWLINENEWLINEReally? He is a Palestinian. Sto...,2015-12-23 21:56:29+00:00,48866568.0,Mohamed Hadid,26178114.0,Dundun1984,0.0,0.855639,1,3 days 01:08:52,21,1,0,"(48866568.0, 1)"


In [None]:
# Save the matched (attack thread, no-attack thread) pairs as CSV. The tuple
# thread ids are written in their string form and must be parsed when read back.
matched_pairs_df.to_csv('/content/drive/MyDrive/ExMachina/matched_pairs_full_2014_2015.csv', index=False)

Evaluation of the matching using the standardized mean difference (SMD)

In [None]:
# Reload the matched pairs from the CSV written earlier; the thread-id columns
# come back as strings and are parsed into tuples in the next cell.
matched_pairs_df = pd.read_csv('/content/drive/MyDrive/ExMachina/matched_pairs_full_2014_2015.csv')

In [None]:
import ast
# Turn the thread ids read from CSV back from their string form into tuples so
# they can be joined against thread_stats['thread_id'].
# Run this once per reload: literal_eval rejects values that are already tuples.
for pair_column in ('Thread_ID_1', 'Thread_ID_2'):
    matched_pairs_df[pair_column] = matched_pairs_df[pair_column].apply(ast.literal_eval)

In [None]:
# Thread-level statistics of the attack thread (Thread_ID_1) of every matched pair.
treatment_df = matched_pairs_df.merge(
    thread_stats,
    how='inner',
    left_on='Thread_ID_1',
    right_on='thread_id',
)

In [None]:
# Thread-level statistics of the no-attack thread (Thread_ID_2) of every matched pair.
control_df = matched_pairs_df.merge(
    thread_stats,
    how='inner',
    left_on='Thread_ID_2',
    right_on='thread_id',
)

In [None]:
def calculate_smd(treatment_df, control_df, column):
    """Return the standardized mean difference (SMD) of one covariate.

    The SMD is the treated mean minus the control mean, divided by the square
    root of the average of the two sample variances.

    Args:
        treatment_df: DataFrame holding the treated units.
        control_df: DataFrame holding the control units.
        column: Name of the covariate column present in both frames.

    Returns:
        The signed SMD; positive when the treated mean is the larger one.
    """
    treated_values = treatment_df[column]
    control_values = control_df[column]
    mean_gap = treated_values.mean() - control_values.mean()
    # Pooled SD: square root of the mean of the two sample variances.
    average_variance = (treated_values.std() ** 2 + control_values.std() ** 2) / 2
    return mean_gap / average_variance ** 0.5

# Balance check: SMD of each matching covariate between the matched attack
# threads and their matched no-attack threads.
smd_num_users, smd_num_comments, smd_mean_cumulative_word_count = (
    calculate_smd(treatment_df, control_df, covariate)
    for covariate in ('num_users', 'num_comments', 'mean_cumulative_word_count')
)

print(f"SMD for num_users: {smd_num_users}")
print(f"SMD for num_comments: {smd_num_comments}")
print(f"SMD for mean_cumulative_word_count: {smd_mean_cumulative_word_count}")

SMD for num_users: 0.24368952993433338
SMD for num_comments: 0.17093127981192824
SMD for mean_cumulative_word_count: 0.04341543900244072
