In [1]:
import json
import pprint

In [2]:
tweets =[]
with open('../Gamergate_Dataset/Gamergate.json','r') as f:
    for line in f.readlines():
        tweets.append(json.loads(line))

In [3]:
with open('../Gamergate_Dataset/Labels_for_7320_users/gamergate_opposer_feminist_usernames', 'r') as f:
    opposer =f.read().splitlines()
with open('../Gamergate_Dataset/Labels_for_7320_users/gamergate_supporter_misogynist_usernames','r') as f:
    supporter = f.read().splitlines()
with open('../Gamergate_Dataset/Labels_for_7320_users/unaffiliated_usernames','r') as f:
    unaffiliated = f.read().splitlines()

In [4]:
allusers = opposer + unaffiliated + supporter

In [5]:
import networkx as nx

In [6]:
G = nx.Graph()
G.add_nodes_from(allusers)

In [7]:
for tweet in tweets:
    username = tweet['user']['screen_name']
    if username not in allusers:
        continue
    if 'retweeted_status' not in tweet:
        continue
    retweeteduser = tweet['retweeted_status']['user']['screen_name']
    if retweeteduser not in allusers:
        continue
    G.add_edge(username,retweeteduser)
G = G.to_undirected()

# Louvain

In [8]:
import community as community_louvain

In [69]:
partition = community_louvain.best_partition(G)

In [70]:
len(partition)

7320

# Get Communities and Post-Processing

In [71]:
import numpy as np

In [108]:
tmp = np.unique(list(partition.values()), return_counts=True)
tmp = list(zip(*tmp))
major_communities = sorted(tmp, key=lambda x: x[1], reverse=True)[:2]
# sum([i[1] for i in tmp])/7320*100
major_communities = dict(major_communities)
major_communities

{0: 2701, 2: 1206}

In [109]:
communities = {k: [] for k in major_communities}
for user, community in partition.items():
    if community in major_communities:
        communities[community].append(user)

In [110]:
majorities = {}
for community, users in communities.items():
    o = 0
    s = 0
    u = 0
    for sn in users:
        if sn in opposer:
            o += 1
        elif sn in supporter:
            s += 1
        else:
            u += 1
    if max(s, o, u) == s:
        majorities[community] = 1
    elif max(s, o, u) == o:
        majorities[community] = 0
    else:
        majorities[community] = 2

In [111]:
user_pred_labels = {k: [] for k in allusers}
for community, users in communities.items():
    for user in users:
        user_pred_labels[user].append(majorities[community])

In [112]:
for _, i in user_pred_labels.items():
    assert len(i)<=1

In [113]:
for user, labels in user_pred_labels.items():
    if len(labels) == 0:
        user_pred_labels[user] = -1
        continue
    user_pred_labels[user] = sum(user_pred_labels[user])*2//len(user_pred_labels[user])
    if user_pred_labels[user] > 1:
        user_pred_labels[user] = 1

In [114]:
accuracy_user_pred = {k: v if v >= 0 else 2 for k, v in user_pred_labels.items()}

In [115]:
accuracy_user_gt = {}
for user, label in user_pred_labels.items():
    if user in supporter:
        accuracy_user_gt[user] = 1
    elif user in opposer:
        accuracy_user_gt[user] = 0
    else:
        accuracy_user_gt[user] = 2

# Accuracy and Jaccard Scores

In [116]:
from sklearn.metrics import classification_report, jaccard_score
from sklearn.preprocessing import OneHotEncoder

In [117]:
print(classification_report(list(accuracy_user_gt.values()), list(accuracy_user_pred.values())))

              precision    recall  f1-score   support

           0       0.76      0.55      0.64      3778
           1       0.68      0.26      0.38      3117
           2       0.08      0.62      0.14       425

    accuracy                           0.43      7320
   macro avg       0.51      0.48      0.38      7320
weighted avg       0.69      0.43      0.50      7320



In [118]:
jaccard_user_pred = {k: v if v >= 0 else 2 for k, v in user_pred_labels.items()}

In [119]:
jaccard_user_gt = {}
for user, label in user_pred_labels.items():
    if label == -1:
        jaccard_user_gt[user] = 0
    elif user in supporter:
        jaccard_user_gt[user] = 1
    elif user in opposer:
        jaccard_user_gt[user] = 0
    else:
        jaccard_user_gt[user] = 2

In [120]:
onehot = OneHotEncoder(handle_unknown='ignore')
jaccard_gt = onehot.fit_transform([[i] for i in list(jaccard_user_gt.values())]).toarray()
jaccard_pred = onehot.transform([[i] for i in list(jaccard_user_pred.values())]).toarray()

In [121]:
print(jaccard_score(jaccard_gt, jaccard_pred, average='samples'))

0.3941256830601093


# Discarded Users

In [122]:
discarded_users = []
for user, cid in partition.items():
    if cid not in major_communities:
        discarded_users.append(user)

In [123]:
len(discarded_users)/len(allusers)*100

46.62568306010929

# Discarded Hashtags

In [124]:
discarded_ht = []
for tweet in tweets:
    if tweet['user']['screen_name'] in discarded_users:
        discarded_ht.extend([ht['text'].lower() for ht in tweet['entities']['hashtags']])
discarded_ht = set(discarded_ht)

all_ht = []
for tweet in tweets:
    if tweet['user']['screen_name'] in allusers:
        all_ht.extend([ht['text'].lower() for ht in tweet['entities']['hashtags']])
all_ht = set(all_ht)

In [125]:
len(discarded_ht)/len(all_ht)*100

64.9763353617309