In [1]:
import sys, os, io, random, json, re
import pandas as pd
from collections import Counter

In [2]:
def find_exact_string(w):
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search

In [3]:
def load_json(fn):
    ret = None
    with io.open(fn, "r", encoding="utf-8") as f:
        ret = json.load(f)
    return ret

In [4]:
stopwords = load_json("config/stopwords.json")
stopwords = stopwords["en"]

def tokenize_sentence(text):
    words = re.split(r'(\s+)', text)
    if len(words) < 1:
        return
    tokens = []
    for w in words:
        if w is not None:
            w = w.strip()
            w = w.lower()
            if w.isspace() or w == "\n" or w == "\r":
                w = None
            if w is not None and "http" in w:
                w = None
            if w is not None and len(w) < 1:
                w = None
            if w is not None and u"…" in w:
                w = None
            if w is not None:
                tokens.append(w)
    if len(tokens) < 1:
        return []
# Remove stopwords and other undesirable tokens
    cleaned = []
    for token in tokens:
        if len(token) > 0:
            if stopwords is not None:
                if token in stopwords:
                    token = None
            if token is not None:
                if re.search(".+…$", token):
                    token = None
            if token is not None:
                if token == "#":
                    token = None
            if token is not None:
                cleaned.append(token)
    if len(cleaned) < 1:
        return []
    return cleaned

In [5]:
def save_csv(inter, fn):
    with io.open(fn, "w", encoding="utf-8") as f:
        f.write("Source,Target,Weight\n")
        for source, targets in inter.items():
            for target, count in targets.items():
                f.write(source + "," + target + "," + str(count) + "\n")

In [17]:
with io.open("data/raw.json", "r", encoding="utf-8") as f:
    for line in f:
        d = json.loads(line)
        print(json.dumps(d, indent=4))
        break

{
    "lang": "en",
    "quote_count": 0,
    "interactions": [
        "citysamuel"
    ],
    "text": "The EU's reckless desire for revenge on the UK overrides their duty to protect the people of Europe. The #Brexit ne\u2026 https://t.co/QeodZLnAtz",
    "created_at": "Mon Dec 03 12:07:15 +0000 2018",
    "hashtags": [
        "brexit"
    ],
    "retweeted": null,
    "is_quote_status": true,
    "in_reply_to_status_id": null,
    "reply_count": 0,
    "in_reply_to_screen_name": null,
    "id_str": "1069563714406412294",
    "urls": [
        "https://twitter.com/i/web/status/1069563714406412294"
    ],
    "retweet_count": 0,
    "user": {
        "default_profile": true,
        "statuses_count": 22756,
        "description": "Old friend of Charlie (of Charlie's Law). Blocked by the Guardian. Blocked by Strictly.",
        "verified": false,
        "followers_count": 102,
        "screen_name": "LawCharlies",
        "id_str": "877458106850127872",
        "default_profile_image"

In [16]:
highly_retweeted = []
highly_retweeted_ids = []
highly_liked = []
highly_liked_ids = []
highly_replied = []
highly_replied_ids = []
interactions = {}
counters = {}
magas = []
counter_names = ["users", "influencers", "amplifiers", "hashtags"]
for n in counter_names:
    counters[n] = Counter()
min_retweet_count = 1000
min_like_count = 10
min_replied_count = 100
count = 0
with io.open("data/raw.json", "r", encoding="utf-8") as f:
    for line in f:
        count += 1
        if count % 10000 == 0:
            print("Count: " + str(count))
        d = json.loads(line)
        twid = d["id_str"]
        sn = d["user"]["screen_name"]
        counters["users"][sn] += 1
        if "description" in d:
            desc = d["description"]
            if find_exact_string("maga")(desc):
                magas.append(sn)
        if "hashtags" in d:
            ht = d["hashtags"]
            for h in ht:
                counters["hashtags"][h] += 1
        if "interactions" in d:
            counters["amplifiers"][sn] += 1
            inter = d["interactions"]
            if sn not in interactions:
                interactions[sn] = {}
            for i in inter:
                counters["influencers"][i] += 1
                if i not in interactions[sn]:
                    interactions[sn][i] = 1
                else:
                    interactions[sn][i] += 1
        if "reply_count" in d:
            if d["reply_count"] is not None and d["reply_count"] > min_replied_count:
                if twid not in highly_replied_ids:
                    highly_replied_ids.append(twid)
                    highly_replied.append(d)
        if "retweet_count" in d:
            if d["retweet_count"] is not None and d["retweet_count"] > min_retweet_count:
                if twid not in highly_retweeted_ids:
                    highly_retweeted_ids.append(twid)
                    highly_retweeted.append(d)
        if "favorite_count" in d:
            if d["favorite_count"] is not None and d["favorite_count"] > min_like_count:
                if twid not in highly_liked_ids:
                    highly_liked_ids.append(twid)
                    highly_liked.append(d)
        if "retweeted_status" in d:
            rtwid = d["retweeted_status"]["id_str"]
            s = d["retweeted_status"]
            if "retweet_count" in s:
                if s["retweet_count"] is not None and s["retweet_count"] > min_retweet_count:
                    if rtwid not in highly_retweeted_ids:
                        highly_retweeted_ids.append(rtwid)
                        highly_retweeted.append(s)
            if "favorite_count" in s:
                if s["favorite_count"] is not None and s["favorite_count"] > min_like_count:
                    if rtwid not in highly_liked_ids:
                        highly_liked_ids.append(rtwid)
                        highly_liked.append(s)
            if "reply_count" in s:
                if s["reply_count"] is not None and s["reply_count"] > min_replied_count:
                    if twid not in highly_replied_ids:
                        highly_replied_ids.append(twid)
                        highly_replied.append(s)
print("Highly retweeted: " + str(len(highly_retweeted)))
print("Highly liked: " + str(len(highly_liked)))
print("Highly replied to: " + str(len(highly_replied)))
for n in counter_names:
    print("")
    print(n)
    print("---------")
    for i, c in counters[n].most_common(20):
        print(str(c) + "\t" + i)
save_csv(interactions, "data/interactions.csv")
print("")
print("Found " + str(len(magas)) + " magas.")

Highly retweeted: 63
Highly liked: 857
Highly replied to: 648

users
---------
20	GreenOakFraming
18	myerschrismyer1
16	jan_sabin
15	BlueNicht
15	TegoArcanaDei
14	RayUngerer
14	suenay682
14	dbkell
13	guse_guse
13	Vampire91399115
13	RejoinTheWorld
13	FParsonage
13	VincentKevinHo4
12	terencehooson
12	zidanesboots
12	wisteriawitch
12	GamerHazelnut
11	SpanishDan1
11	LibDemsNow
10	Ladycorvia

influencers
---------
218	femi_sorry
202	theresa_may
182	doctor_oxford
138	brexitcentral
109	borisjohnson
106	shehabkhan
100	leaveeuofficial
68	conservatives
63	rachael_swindon
55	peston
52	chukaumunna
51	jamiectaylor1
51	victorialive
50	peoplesvote_uk
49	mrjamesob
49	carolecadwalla
49	peterstefanovi2
48	thismorning
48	wcullmac
48	nigel_farage

amplifiers
---------
20	GreenOakFraming
18	myerschrismyer1
16	jan_sabin
15	BlueNicht
15	TegoArcanaDei
14	RayUngerer
14	suenay682
14	dbkell
13	guse_guse
13	Vampire91399115
13	RejoinTheWorld
13	FParsonage
13	VincentKevinHo4
12	terencehooson
12	zidanesboots
12	wist