In [1]:
import json
import pprint

In [2]:
# Load the raw dataset: the file holds one JSON-encoded tweet per line.
tweets = []
with open('../Gamergate_Dataset/Gamergate.json', 'r') as f:
    # Iterate the file lazily instead of materialising every line with
    # readlines(), and skip blank lines (e.g. a trailing newline) because
    # json.loads() raises on empty input.
    for line in f:
        if line.strip():
            tweets.append(json.loads(line))

In [3]:
def _read_usernames(path):
    """Return the lines of the text file at ``path`` with line endings removed."""
    with open(path, 'r') as handle:
        return handle.read().splitlines()


# One list of screen names per label file.
opposer = _read_usernames('../Gamergate_Dataset/Labels_for_7320_users/gamergate_opposer_feminist_usernames')
supporter = _read_usernames('../Gamergate_Dataset/Labels_for_7320_users/gamergate_supporter_misogynist_usernames')
unaffiliated = _read_usernames('../Gamergate_Dataset/Labels_for_7320_users/unaffiliated_usernames')

In [4]:
# Every labelled screen name, in the order: opposers, unaffiliated, supporters.
allusers = [*opposer, *unaffiliated, *supporter]

In [5]:
import networkx as nx

# Undirected retweet graph over the labelled users: an edge joins two labelled
# users whenever one of them retweeted the other.
G = nx.Graph()
# Add every labelled user up front so that users without any retweet edge
# still appear as isolated nodes.
G.add_nodes_from(allusers)

# Set copy for O(1) membership tests; checking against the list would rescan
# all labelled users for every tweet.
labelled_users = set(allusers)
for tweet in tweets:
    username = tweet['user']['screen_name']
    if username not in labelled_users:
        continue
    if 'retweeted_status' not in tweet:
        continue
    retweeteduser = tweet['retweeted_status']['user']['screen_name']
    if retweeteduser not in labelled_users:
        continue
    G.add_edge(username, retweeteduser)
# nx.Graph is already undirected, so no G.to_undirected() call is needed; on a
# Graph it only produces a deep copy with the same nodes and edges.

In [6]:

# Map each node (screen name) to a dense integer id that follows the graph's
# node iteration order.
user_number = {name: index for index, name in enumerate(G.nodes.keys())}


## Edges file

In [8]:
# Write the edge list: one line per edge holding the two integer node ids
# separated by a tab.
with open('gamergate.edges', 'w') as f:
    for left, right in G.edges():
        endpoints = [str(user_number[left]), str(user_number[right])]
        f.write('\t'.join(endpoints) + '\n')

## Features file

In [8]:
# Text of every loaded tweet, in load order.
tweets_text = [status['text'] for status in tweets]

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Build a CountVectorizer with default settings and learn its vocabulary from
# tweets_text. The bare fit() call is the cell's last expression, so the
# fitted estimator's repr is displayed as the cell output.
vectorizer = CountVectorizer()
vectorizer.fit(tweets_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [53]:
# Concatenate each graph user's own (non-retweet) tweets into one string.
# Every appended tweet is preceded by a single space; users with no such
# tweets keep the empty string.
user_texts = dict.fromkeys(user_number, "")
for tweet in tweets:
    if 'retweeted_status' not in tweet:
        author = tweet['user']['screen_name']
        if author in user_texts:
            user_texts[author] = user_texts[author] + ' ' + tweet['text']

In [88]:
# Write the node-feature file: one "node_id<TAB>feature_id" line for every
# vocabulary term whose count is non-zero in a user's concatenated tweets.
with open("gamergate.nodefeat", 'w') as f:
    for screen_name, text in user_texts.items():
        node_id = user_number[screen_name]
        # transform() returns a sparse matrix with one row for this text.
        # Read the non-zero column indices directly from it instead of
        # densifying the row and scanning every vocabulary entry in Python.
        counts = vectorizer.transform([text])
        # Sort so feature ids are emitted in ascending order, as before.
        for feature_id in sorted(counts.nonzero()[1].tolist()):
            f.write(str(node_id) + '\t' + str(feature_id) + '\n')

## Feature Names file

In [16]:
# Write the feature-name file: "feature_id<TAB>term" per line, with the
# (term, id) pairs sorted by term.
with open('gamergate.nodefeatnames', 'w') as f:
    vocabulary = sorted(vectorizer.vocabulary_.items())
    f.writelines(str(feat_id) + '\t' + name + '\n' for name, feat_id in vocabulary)

# Get Communities and Post-Processing

In [19]:
# Number of community lines read from the community-detection output file.
N_COMMUNITIES = 100

# communities[i] is the list of integer node ids on the i-th line of the file.
communities = {}
with open('gamergatecmtyvv.txt', 'r') as f:
    for i in range(N_COMMUNITIES):
        # split() with no argument splits on any whitespace and ignores the
        # trailing tab/newline. The previous split('\t')[:-1] silently dropped
        # the last member whenever a line did not end with a trailing tab.
        # A missing or blank line still yields an empty community.
        communities[i] = [int(node) for node in f.readline().split()]

In [20]:
# Inverse lookup: integer node id -> screen name.
user_number_reversed = dict(zip(user_number.values(), user_number.keys()))

In [21]:
# Set of every node id that appears in at least one community; the cell
# displays how many there are.
predicted_users = set().union(*communities.values())
len(predicted_users)

2259

In [22]:
# Majority ground-truth class of each community, encoded as
# 1 = supporter, 0 = opposer, 2 = neither list (unaffiliated).
# Set copies give O(1) membership tests instead of scanning the label lists
# for every community member.
opposer_lookup = set(opposer)
supporter_lookup = set(supporter)

majorities = {}
for community, users in communities.items():
    o = 0  # members found in the opposer list
    s = 0  # members found in the supporter list
    u = 0  # members found in neither list
    for node in users:
        name = user_number_reversed[node]
        if name in opposer_lookup:
            o += 1
        elif name in supporter_lookup:
            s += 1
        else:
            u += 1
    largest = max(s, o, u)
    # Ties resolve in the order supporter, opposer, unaffiliated, so an empty
    # community (all counts zero) is labelled 1.
    if largest == s:
        majorities[community] = 1
    elif largest == o:
        majorities[community] = 0
    else:
        majorities[community] = 2

In [23]:
# For every labelled user, collect the majority label of each community that
# contains them; users in no community keep an empty list.
user_pred_labels = {name: [] for name in allusers}
for community, members in communities.items():
    community_label = majorities[community]
    for node in members:
        user_pred_labels[user_number_reversed[node]].append(community_label)

In [24]:
# Collapse each user's list of community labels into one integer, in place:
#   -1 when the user belongs to no community, otherwise
#   floor(2 * sum(labels) / count) capped at 1.
# Only values are replaced (no keys are added or removed), so iterating over
# items() while assigning is safe.
for user, votes in user_pred_labels.items():
    if len(votes) == 0:
        user_pred_labels[user] = -1
    else:
        user_pred_labels[user] = min(sum(votes) * 2 // len(votes), 1)

In [111]:
# Predictions used for the classification report: users without a community
# (-1) are reported as class 2.
accuracy_user_pred = {
    user: (2 if label < 0 else label)
    for user, label in user_pred_labels.items()
}

In [112]:
# Ground-truth class per user, in the same key order as user_pred_labels:
# 1 if listed as supporter, else 0 if listed as opposer, else 2.
accuracy_user_gt = {
    user: 1 if user in supporter else (0 if user in opposer else 2)
    for user in user_pred_labels
}

# Accuracy and Jaccard Scores

In [152]:
from sklearn.metrics import classification_report, jaccard_score
from sklearn.preprocessing import OneHotEncoder

In [114]:
# Both dicts were built by iterating user_pred_labels, so their values line up
# user-for-user.
y_true = list(accuracy_user_gt.values())
y_pred = list(accuracy_user_pred.values())
print(classification_report(y_true, y_pred))

precision    recall  f1-score   support

           0       0.76      0.18      0.29      3778
           1       0.70      0.31      0.43      3117
           2       0.08      0.90      0.14       425

    accuracy                           0.28      7320
   macro avg       0.51      0.46      0.29      7320
weighted avg       0.70      0.28      0.34      7320



In [143]:
# Predictions used for the Jaccard score: users without a community (-1) are
# mapped to class 2.
jaccard_user_pred = {
    user: (2 if label < 0 else label)
    for user, label in user_pred_labels.items()
}

In [144]:
# Ground truth used for the Jaccard score, keyed in user_pred_labels order.
# NOTE(review): users without a community (label -1) get ground truth 0 here
# while their prediction is mapped to 2 -- confirm this is intended.
jaccard_user_gt = {}
for user, label in user_pred_labels.items():
    if label == -1:
        truth = 0
    else:
        truth = 1 if user in supporter else (0 if user in opposer else 2)
    jaccard_user_gt[user] = truth

In [176]:
# One-hot encode both label sequences; the encoder is fitted on the ground
# truth and unknown categories in the predictions are ignored.
onehot = OneHotEncoder(handle_unknown='ignore')
gt_column = [[label] for label in jaccard_user_gt.values()]
pred_column = [[label] for label in jaccard_user_pred.values()]
jaccard_gt = onehot.fit_transform(gt_column).toarray()
jaccard_pred = onehot.transform(pred_column).toarray()

In [180]:
# Jaccard score between the one-hot rows, averaged over samples.
samples_jaccard = jaccard_score(jaccard_gt, jaccard_pred, average='samples')
print(samples_jaccard)

0.2244535519125683


# Discarded Users

In [29]:
discarded_users = [user for user in user_number if user not in accuracy_user_gt]

In [30]:
len(discarded_users)/(len(discarded_users) + len(accuracy_user_gt))*100

69.13934426229508

# Discarded Hashtags

In [31]:
# Lower-cased hashtags used by discarded users, and by any labelled user.
# Set copies of the user lists give O(1) membership tests per tweet instead of
# a linear scan, and the hashtag sets are built directly rather than through
# intermediate lists.
discarded_lookup = set(discarded_users)
discarded_ht = {
    ht['text'].lower()
    for tweet in tweets
    if tweet['user']['screen_name'] in discarded_lookup
    for ht in tweet['entities']['hashtags']
}

labelled_lookup = set(allusers)
all_ht = {
    ht['text'].lower()
    for tweet in tweets
    if tweet['user']['screen_name'] in labelled_lookup
    for ht in tweet['entities']['hashtags']
}

In [33]:
# Percentage of distinct hashtags (among labelled users' tweets) that were
# used by discarded users.
len(discarded_ht)/(len(all_ht))*100

39.3106278210915