In [None]:
from gather_analysis_helper import *
from twitter_no_rl_tool import *
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

%matplotlib inline

In [None]:
def print_tweets_from_counter(cnt, num, offset=0):
    """Display one randomly chosen tweet for each top-ranked key of a counter.

    Args:
        cnt: Counter whose keys are looked up in the notebook-level ``sn_twid``
            mapping; only its ``most_common`` ordering is used.
        num: Number of keys to show.
        offset: Number of top-ranked keys to skip before showing ``num`` keys.

    Relies on the notebook globals ``sn_twid`` and ``twid_url`` having been
    populated by an earlier cell. Keys without any usable tweet are skipped.
    """
    max_tries = 5
    top_names = [name for name, _ in cnt.most_common(num + offset)][offset:]
    for name in top_names:
        if name not in sn_twid:
            continue
        # Materialise the candidates once instead of on every retry.
        candidates = list(sn_twid[name])
        if not candidates:
            # random.choice raises IndexError on an empty sequence.
            continue
        tw = None
        for _ in range(max_tries):
            twid = random.choice(candidates)
            if twid not in twid_url:
                # No URL recorded for this tweet id; try another one.
                continue
            tw = Tweet(twid_url[twid])
            if tw is not None:
                break
        if tw is not None:
            display(tw)

In [None]:
# Notebook-wide settings read by later cells.
# When True, the per-cluster analysis loop also writes id/detail files to disk.
save_data = False
# Sizes the "top domains" and "top hashtags" amplifier listings.
num_plots = 5
# Length of the analysis window; later multiplied by 3600, i.e. given in hours.
plot_timespan = 7 * 24
# Passed to print_tweet_texts as its last argument.
num_counters = 20

In [None]:
# Directory that receives every file this notebook writes.
dirname = "analysis_live"
# exist_ok avoids the check-then-create race and keeps the cell idempotent on re-run.
os.makedirs(dirname, exist_ok=True)

In [None]:
# Analysis window: from `plot_timespan` hours before now up to three hours
# after now, both converted to readable timestamps by the helper.
analysis_span = plot_timespan
current_unix = get_utc_unix_time()
window_seconds = analysis_span * 3600
margin_seconds = 3 * 3600
start_time = unix_time_to_readable(current_unix - window_seconds)
end_time = unix_time_to_readable(current_unix + margin_seconds)

# To analyse a fixed period instead, override the window by hand:
#start_time = "2017-01-01 00:00:00"
#end_time = "2022-01-01 00:00:00"

for caption, stamp in (("Start time: ", start_time), ("End time: ", end_time)):
    print(caption + stamp)


In [None]:
# Build an iterator over data/raw.json for the chosen window and aggregate it
# into the `full` dictionary that the rest of the notebook reads from.
# (The exact semantics of both helpers live in gather_analysis_helper.)
# Pass the path components separately so os.path.join actually joins them.
raw_path = os.path.join("data", "raw.json")
raw = make_file_iterator(start_time, end_time, raw_path)
full = get_counters_and_interactions2(raw)

In [None]:
# Unpack the aggregated structures returned by get_counters_and_interactions2
# into notebook-level names so that later cells can refer to them directly.
# NOTE(review): the names appear to follow a "<key>_<value>" pattern with
# sn = screen name, rsn = retweeted screen name, rep = reply, men = mention,
# quo = quote, twid = tweet id — confirm against gather_analysis_helper.
uf = full["user_fields"]
counters = full["counters"]
users = counters["users"]
# Interaction maps, each stored in both directions.
sn_rsn = full["sn_rsn"]
rsn_sn = full["rsn_sn"]
sn_rep = full["sn_rep"]
rep_sn = full["rep_sn"]
sn_men = full["sn_men"]
men_sn = full["men_sn"]
sn_quo = full["sn_quo"]
quo_sn = full["quo_sn"]
# Per-tweet lookups keyed by tweet id.
rsn_twid = full["rsn_twid"]
twid_count = full["twid_count"]
twid_rt_count = full["twid_rt_count"]
twid_text = full["twid_text"]
twid_url = full["twid_url"]
twid_sn = full["twid_sn"]
sn_twid = full["sn_twid"]
sn_details = full["sn_details"]
# Hashtag lookups.
sn_hashtag = full["sn_hashtag"]
hashtag_sn = full["hashtag_sn"]
hashtag_twid = full["hashtag_twid"]
# Collections of tweet ids grouped by how the tweet was seen.
orig_twids = full["orig_twids"]
replied_twids = full["replied_twids"]
quoted_twids = full["quoted_twids"]
retweeted_twids = full["retweeted_twids"]

In [None]:
# Show language distribution: the `n` most frequent languages get a wedge of
# their own, everything else is folded into a single "Other" wedge.
cluster_hts = counters["lang"]

n = 10
ranked = cluster_hts.most_common()
# Slicing keeps exactly n entries (the former `len(labels) <= n` test let n + 1 through).
top, rest = ranked[:n], ranked[n:]
other = sum(c for _, c in rest)

plot_data = {
    "labels": [ht + " (" + str(c) + ")" for ht, c in top],
    "sizes": [c for _, c in top],
}
# Only add the catch-all wedge when something actually falls into it.
if other > 0:
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

if plot_data["sizes"]:
    fig = plt.figure(figsize=(12,7))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title("Languages")
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=30)
    plt.axis('equal')
    plt.show()
else:
    print("No language data to plot.")

In [None]:
# Report the period covered by the loaded data, using the "oldest" and
# "newest" values recorded in `full`.
collect_start = unix_time_to_readable(full["oldest"])
collect_end = unix_time_to_readable(full["newest"])
print("Collection started on " + collect_start + " and ended on " + collect_end)

# Convert the recorded timespan to hours and then to days.
timespan_s = full["timespan"]
timespan_h = timespan_s / 3600
timespan_d = timespan_h / 24
# Volume threshold used by later cells: 40 per day of collected data.
high_vol = 40 * timespan_d
print("Collection duration: " + "{:.2f}".format(timespan_d) + " days.")

In [None]:
# Horizontal bar chart of the time-series counts in full["ts_data"], trimmed
# to the `analysis_span` most recent entries (tweets per hour).
num_hours = analysis_span
plot_data = trim_plot_data(full["ts_data"], 0, num_hours)
# Figure height scales with the number of bars.
height = len(plot_data["count"]) / 3
sns.set(rc={'figure.figsize': (20, height)})
sns.set(style="whitegrid")
plt.figure()
ax = sns.barplot(y="time", x="count", palette="husl", data=plot_data)
# Write each value just past the end of its bar.
for row, value in enumerate(plot_data["count"]):
    ax.text(value + 1, row + 0.25, str(value), fontweight='bold')
ax.set_title("Activity over the last " + str(num_hours) + " hours.")

In [None]:
# Export interaction data into `dirname`. The CSV files are in a format that
# Gephi can import; the JSON files hold the retweet maps and counters.
csv_exports = [
    (sn_rsn, "retweet_interactions.csv"),
    (sn_rep, "reply_interactions.csv"),
    (sn_men, "mention_interactions.csv"),
    (sn_quo, "quote_interactions.csv"),
]
for mapping, filename in csv_exports:
    save_csv(mapping, os.path.join(dirname, filename))

json_exports = [
    (sn_rsn, "sn_rsn.json"),
    (rsn_sn, "rsn_sn.json"),
    (counters["retweeters"], "retweeters.json"),
    (counters["retweeted"], "retweeted.json"),
]
for payload, filename in json_exports:
    save_json(payload, os.path.join(dirname, filename))

In [None]:
# Print summary statistics about the dataset via the helper's print_counters.
# NOTE(review): the third argument (50) is presumably the number of entries
# listed per counter — confirm against gather_analysis_helper.
print_counters(counters, uf, 50)

In [None]:
# List entries of the "susp_users" counter.
# NOTE(review): 40 is presumably the number of accounts to print — confirm
# against print_sn_counter.
print_sn_counter(counters["susp_users"], 40)

In [None]:
# Dump every tweet id in full["susp_twids"] as a tab-separated line:
# id, text, URL.
for twid in full["susp_twids"]:
    print("\t".join((twid, twid_text[twid], twid_url[twid])))

In [None]:
# Render a random sample of at most ten tweets from full["susp_twids"].
stlen = len(full["susp_twids"])
if stlen > 0:
    sampled_twids = random.sample(list(full["susp_twids"]), min(10, stlen))
    for twid in sampled_twids:
        display(Tweet(twid_url[twid]))

In [None]:
# Display one randomly picked tweet for each of the top 20 accounts in the
# "susp_users" counter (offset 0, i.e. starting from the highest-ranked one).
print_tweets_from_counter(counters["susp_users"], 20, 0)

In [None]:
# Calculate CTM for hashtags
# For every sufficiently frequent hashtag compute
#   U: mean number of posts per user that used it,
#   R: retweet volume as a percentage of the hashtag's total count,
#   F: fraction of the total that came from its TOP_USER_COUNT most active users,
#   C: R/10 + F + U
# and flag the hashtag when C exceeds CTM_THRESHOLD.
MIN_HT_COUNT = 200     # hashtags seen fewer times than this are not scored
TOP_USER_COUNT = 50    # number of most active users that feed the F term
CTM_THRESHOLD = 12     # scores above this mark the hashtag as suspicious

susp_hashtags = []
print("Hashtag                         | Count | RTC   | U     | R     | F     | C  ")
print("================================================================================")
for hashtag, total_ht in counters["hashtags"].most_common():
    if total_ht < MIN_HT_COUNT:
        # most_common() yields descending counts, so no later hashtag qualifies.
        break
    # Per-user post counts for this hashtag; the hashtag may be absent from the map.
    user_counts = hashtag_sn[hashtag] if hashtag in hashtag_sn else None

    # 1. Calculate the average number of posts per user for an item (U)
    U = 0
    if user_counts:
        U = np.mean(list(user_counts.values()))

    # 2. Calculate retweets as a percentage of the hashtag's total count (R)
    R = 0
    orig_ht_count = 0
    rt_ht_count = 0
    if hashtag in hashtag_twid:
        for twid in hashtag_twid[hashtag]:
            if twid in retweeted_twids:
                rt_ht_count += twid_count[twid]
            else:
                orig_ht_count += twid_count[twid]
    if rt_ht_count > 0:
        R = rt_ht_count/total_ht * 100

    # 3. Calculate the fraction of posts from the top users (F)
    # Guarded like U above: a hashtag missing from hashtag_sn used to raise KeyError here.
    F = 0
    if user_counts is not None:
        num_top_posts = sum(c for sn, c in user_counts.most_common(TOP_USER_COUNT))
        F = num_top_posts/total_ht

    # 4. C = R/10 + F + U
    C = R/10 + F + U

    if C > CTM_THRESHOLD:
        susp_hashtags.append(hashtag)
        # Mark flagged rows in the printed table only; the stored name stays clean.
        hashtag = "(*) " + hashtag

    # One tab, plus one more for each 8-column tab stop the name falls short of.
    sep = "\t" * (1 + sum(len(hashtag) < width for width in (24, 16, 8)))
    msg = hashtag + sep + "|"
    msg += str(orig_ht_count) + "\t|"
    msg += str(rt_ht_count) + "\t|"
    msg += "%.2f"%U + "\t|"
    msg += "%.2f"%R + "\t|"
    msg += "%.2f"%F + "\t|"
    msg += "%.2f"%C + "\t"
    print(msg)

In [None]:
# Accumulate, over all flagged hashtags, the per-account counts ...
susp_ht_users = Counter()
for flagged in susp_hashtags:
    if flagged in hashtag_sn:
        # Counter.update adds the counts of a mapping to the existing totals.
        susp_ht_users.update(hashtag_sn[flagged])
print(len(susp_ht_users))

# ... and the per-tweet counts.
susp_ht_twids = Counter()
for flagged in susp_hashtags:
    if flagged in hashtag_twid:
        susp_ht_twids.update(hashtag_twid[flagged])
print(len(susp_ht_twids))

In [None]:
# Display one randomly picked tweet for each of the top 20 accounts in
# `susp_ht_users` (offset 0, i.e. starting from the highest-ranked one).
print_tweets_from_counter(susp_ht_users, 20, 0)

In [None]:
# Print the 20 highest-count tweets in `susp_ht_twids` as tab-separated
# lines: count, id, text, URL.
for twid, c in susp_ht_twids.most_common(20):
    print("\t".join([str(c), twid, twid_text[twid], twid_url[twid]]))

In [None]:
# Render up to ten randomly chosen tweets from `susp_ht_twids`.
if len(susp_ht_twids) > 0:
    sample_len = min(10, len(susp_ht_twids))
    for twid in random.sample(list(susp_ht_twids), sample_len):
        display(Tweet(twid_url[twid]))

In [None]:
# Account creation dates of every account in `sn_details`, bucketed as
# "<year>-<quarter>" and drawn as a horizontal bar chart.
all_acct_ages = Counter()
for details in sn_details.values():
    if "created_at" not in details:
        continue
    created = details["created_at"]
    # Characters 4-6 of the timestamp are mapped through `md`; `quarters`
    # groups the resulting month numbers. Both come from the helper module.
    month = md[created[4:7]]
    quarter = ""
    for qname, qvals in quarters.items():
        if int(month) in qvals:
            quarter = qname
    # The last four characters of the timestamp are used as the year.
    all_acct_ages[str(created[-4:]) + "-" + quarter] += 1

# Latest bucket first.
buckets = sorted(all_acct_ages.items(), reverse=True)
plot_data = {
    "labels": [label for label, _ in buckets],
    "counts": [count for _, count in buckets],
}
plot_data = trim_plot_data(plot_data, 0, 50)
height = len(plot_data["counts"]) / 3
sns.set(rc={'figure.figsize': (20, height)})
sns.set(style="whitegrid")
fig = plt.figure(figsize=(18, 10))
ax = sns.barplot(x="counts", y="labels", palette="husl", data=plot_data)
# Write each count next to its bar; the gap grows with the value, capped at 1.0.
for row, value in enumerate(plot_data["counts"]):
    ax.text(value + min(1.0, value / 100), row + 0.25, str(value), fontweight='bold')
ax.set_title("Account ages.")

In [None]:
# Show account age distribution of suspicious users: creation dates of the
# accounts in the "susp_users" counter, bucketed as "<year>-<quarter>".
susp_set = set(full["counters"]["susp_users"])
all_acct_ages = Counter()
for sn, d in sn_details.items():
    if sn not in susp_set or "created_at" not in d:
        continue
    ca = d["created_at"]
    # The last four characters are used as the year; characters 4-6 are
    # mapped to a month number through `md`.
    yr = ca[-4:]
    mon = md[ca[4:7]]
    q = ""
    for qname, qvals in quarters.items():
        if int(mon) in qvals:
            q = qname
    all_acct_ages[str(yr) + "-" + q] += 1

# Skip the chart when no suspicious account has a creation date, in line with
# the guard used by the suspicious-hashtag variant of this plot.
if len(all_acct_ages) > 0:
    plot_data = {}
    plot_data["labels"] = []
    plot_data["counts"] = []
    for label, count in sorted(all_acct_ages.items(), reverse=True):
        plot_data["labels"].append(label)
        plot_data["counts"].append(count)
    plot_data = trim_plot_data(plot_data, 0, 50)
    height = len(plot_data["counts"])/3
    sns.set(rc={'figure.figsize':(20,height)})
    sns.set(style="whitegrid")
    fig = plt.figure(figsize=(18,10))
    ax = sns.barplot(x="counts", y="labels", palette="husl", data=plot_data)
    # Write each count next to its bar; the gap grows with the value, capped at 1.0.
    for i, v in enumerate(plot_data["counts"]):
        pad = min(1.0, v/100)
        ax.text(v+pad, i+0.25, str(v), fontweight='bold')
    ax.set_title("Suspicious user account ages.")
else:
    print("No suspicious user account ages to plot.")

In [None]:
# Show account age distribution of the accounts in `susp_ht_users`, bucketed
# as "<year>-<quarter>". Nothing is drawn when that counter is empty.
if len(susp_ht_users) > 0:
    susp_set = set(susp_ht_users)
    all_acct_ages = Counter()
    for screen_name, details in sn_details.items():
        if screen_name not in susp_set:
            continue
        if "created_at" not in details:
            continue
        created = details["created_at"]
        # Characters 4-6 of the timestamp are mapped to a month number via `md`.
        month = md[created[4:7]]
        quarter = ""
        for qname, qvals in quarters.items():
            if int(month) in qvals:
                quarter = qname
        # The last four characters of the timestamp are used as the year.
        all_acct_ages[str(created[-4:]) + "-" + quarter] += 1

    # Latest bucket first.
    buckets = sorted(all_acct_ages.items(), reverse=True)
    plot_data = {
        "labels": [label for label, _ in buckets],
        "counts": [count for _, count in buckets],
    }
    plot_data = trim_plot_data(plot_data, 0, 50)
    height = len(plot_data["counts"]) / 3
    sns.set(rc={'figure.figsize': (20, height)})
    sns.set(style="whitegrid")
    fig = plt.figure(figsize=(18, 10))
    ax = sns.barplot(x="counts", y="labels", palette="husl", data=plot_data)
    # Write each count next to its bar; the gap grows with the value, capped at 1.0.
    for row, value in enumerate(plot_data["counts"]):
        ax.text(value + min(1.0, value / 100), row + 0.25, str(value), fontweight='bold')
    ax.set_title("Suspicious hashtag user account ages.")

In [None]:
# Show some hashtags: the `n` most used hashtags get a wedge of their own,
# everything else is folded into a single "Other" wedge.
cluster_hts = counters["hashtags"]

n = 25
ranked = cluster_hts.most_common()
# Slicing keeps exactly n entries (the former `len(labels) <= n` test let n + 1 through).
top, rest = ranked[:n], ranked[n:]
other = sum(c for _, c in rest)

plot_data = {
    "labels": ["#" + ht + " (" + str(c) + ")" for ht, c in top],
    "sizes": [c for _, c in top],
}
# Only add the catch-all wedge when something actually falls into it.
if other > 0:
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

if plot_data["sizes"]:
    fig = plt.figure(figsize=(18,10))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title("Hashtags")
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=0)
    plt.axis('equal')
    plt.show()
else:
    print("No hashtag data to plot.")

In [None]:
# Show the hashtags used by the accounts in `susp_ht_users`: sum each
# account's hashtag counts, then chart the `n` most used ones plus "Other".
susp_ht_all = Counter()
for sn in susp_ht_users:
    if sn in sn_hashtag:
        for x, c in sn_hashtag[sn].items():
            susp_ht_all[x] += c
cluster_hts = susp_ht_all

n = 25
ranked = cluster_hts.most_common()
# Slicing keeps exactly n entries (the former `len(labels) <= n` test let n + 1 through).
top, rest = ranked[:n], ranked[n:]
other = sum(c for _, c in rest)

plot_data = {
    "labels": ["#" + ht + " (" + str(c) + ")" for ht, c in top],
    "sizes": [c for _, c in top],
}
# Only add the catch-all wedge when something actually falls into it.
if other > 0:
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

# With no flagged accounts there is nothing to chart.
if plot_data["sizes"]:
    fig = plt.figure(figsize=(9,5))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title("Hashtags")
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=0)
    plt.axis('equal')
    plt.show()
else:
    print("No suspicious hashtag user data to plot.")

In [None]:
# Show the most retweeted accounts: the `n` highest entries of the
# "retweeted" counter get a wedge of their own, the rest become "Other".
cluster_hts = counters["retweeted"]

n = 25
ranked = cluster_hts.most_common()
# Slicing keeps exactly n entries (the former `len(labels) <= n` test let n + 1 through).
top, rest = ranked[:n], ranked[n:]
other = sum(c for _, c in rest)

plot_data = {
    "labels": [ht + " (" + str(c) + ")" for ht, c in top],
    "sizes": [c for _, c in top],
}
# Only add the catch-all wedge when something actually falls into it.
if other > 0:
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

if plot_data["sizes"]:
    fig = plt.figure(figsize=(18,10))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title("Retweeted")
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=90)
    plt.axis('equal')
    plt.show()
else:
    print("No retweet data to plot.")

In [None]:
# Show some url domains: the `n` most frequent entries of the "domains"
# counter get a wedge of their own, the rest become "Other".
# `domains` is kept as a notebook-level name; the amplifier cell below reads it.
domains = counters["domains"]
cluster_hts = domains

n = 40
ranked = cluster_hts.most_common()
# Slicing keeps exactly n entries (the former `len(labels) <= n` test let n + 1 through).
top, rest = ranked[:n], ranked[n:]
other = sum(c for _, c in rest)

plot_data = {
    "labels": [ht + " (" + str(c) + ")" for ht, c in top],
    "sizes": [c for _, c in top],
}
# Only add the catch-all wedge when something actually falls into it.
if other > 0:
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

if plot_data["sizes"]:
    fig = plt.figure(figsize=(18,10))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title("Domains")
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=90)
    plt.axis('equal')
    plt.show()
else:
    print("No domain data to plot.")

In [None]:
# For each of the `num_plots` most frequent domains, print a heading and let
# print_url_amplifiers list the matching accounts.
top_domains = [x for x, c in domains.most_common(num_plots)]
min_r = 1
dc = ""
for url_target in top_domains:
    heading = [
        "Users that used url containing: " + url_target,
        " that tweeted them at least " + str(min_r) + " times",
    ]
    # The creation-date clause is only mentioned when a cutoff is set.
    if len(dc) > 0:
        heading.append(" and whose account was created after " + dc)
    heading.append(".")
    msg = "".join(heading)
    print(msg)
    amps = print_url_amplifiers(url_target, full,  min_retweets=min_r, date_cutoff=dc)
    print("")

In [None]:
# Heatmap of retweet overlaps, computed by get_mapping_overlaps from `rsn_sn`.
mapping = rsn_sn
overlaps = get_mapping_overlaps(mapping, 0, 20)
# Rows and columns both follow the sorted label order; tick labels are cut
# to eight characters.
hm_items = [label for label, oc in sorted(overlaps.items())]
hm_labels = [label[:8] for label in hm_items]
hm_data = [
    [value for key, value in sorted(overlaps[item].items())]
    for item in hm_items
]
sns.set(rc={'figure.figsize': (20, 10)})
sns.set(style="whitegrid")
hm = np.array(hm_data)
plt.figure()
ax = sns.heatmap(hm, annot=True, fmt="d", cmap="YlGnBu", cbar=False, xticklabels=hm_labels, yticklabels=hm_labels)
ax.xaxis.set_ticks_position('top')
ax.set_title("Retweet overlaps (those who retweeted A also retweeted B)")

In [None]:
# Heatmap of hashtag overlaps, computed by get_mapping_overlaps from `hashtag_sn`.
mapping = hashtag_sn
overlaps = get_mapping_overlaps(mapping, 0, 20)
# Rows and columns both follow the sorted label order; tick labels are cut
# to eight characters.
hm_items = [label for label, oc in sorted(overlaps.items())]
hm_labels = [label[:8] for label in hm_items]
hm_data = [
    [value for key, value in sorted(overlaps[item].items())]
    for item in hm_items
]
sns.set(rc={'figure.figsize': (20, 10)})
sns.set(style="whitegrid")
hm = np.array(hm_data)
plt.figure()
ax = sns.heatmap(hm, annot=True, fmt="d", cmap="YlGnBu", cbar=False, xticklabels=hm_labels, yticklabels=hm_labels)
ax.xaxis.set_ticks_position('top')
ax.set_title("Hashtag overlaps (those who used hashtag A also used hashtag B)")

In [None]:
# Pie chart of the categories that categorize_users derives from the "users"
# counter and the collection length in days.
plot_data = categorize_users(counters["users"], timespan_d)
fig = plt.figure(figsize=(8, 5))
ax = fig.add_axes((0, 0, .5, 1))
ax.set_title('Users seen breakdown')
# `ax` is the current axes, so drawing through it targets the same plot.
ax.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=140)
ax.axis('equal')
plt.show()

In [None]:
# Pie chart of the categories that categorize_users derives from
# `susp_ht_users` and the collection length in days.
plot_data = categorize_users(susp_ht_users, timespan_d)
fig = plt.figure(figsize=(8, 5))
ax = fig.add_axes((0, 0, .5, 1))
ax.set_title('Suspicious hashtag users')
# `ax` is the current axes, so drawing through it targets the same plot.
ax.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=140)
ax.axis('equal')
plt.show()

In [None]:
# Pie chart of the categories that categorize_users derives from the
# "retweeters" counter and the collection length in days.
plot_data = categorize_users(counters["retweeters"], timespan_d)
fig = plt.figure(figsize=(8, 5))
ax = fig.add_axes((0, 0, .5, 1))
ax.set_title('Retweeters breakdown')
# `ax` is the current axes, so drawing through it targets the same plot.
ax.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=140)
ax.axis('equal')
plt.show()

In [None]:
# Pie chart of the categories that categorize_users derives from the
# "retweeted" counter and the collection length in days.
plot_data = categorize_users(counters["retweeted"], timespan_d)
fig = plt.figure(figsize=(8, 5))
ax = fig.add_axes((0, 0, .5, 1))
ax.set_title('Retweeted breakdown')
# `ax` is the current axes, so drawing through it targets the same plot.
ax.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=140)
ax.axis('equal')
plt.show()

In [None]:
# Print activity comparison for some hashtags: one line per hashtag (at most
# `max_lines`, taken in descending order of use), smoothed with a 7-sample
# rolling mean over the shared time axis.
offset = 0
max_lines = 8
all_x_labels = get_timestamp_range(collect_start, collect_end)
targets = [x for x, c in counters["hashtags"].most_common()][offset:]
all_plots = []
labels = []
for target in targets:
    if len(labels) >= max_lines:
        break
    if target not in full["hashtag_ts_data"]:
        continue
    labels.append(target)
    series = full["hashtag_ts_data"][target]
    # Re-index the series onto the shared time axis; missing slots count as 0.
    tc = dict(zip(series["time"], series["count"]))
    all_plots.append([tc.get(ts, 0) for ts in all_x_labels])
# Without at least one series there is nothing to put in the DataFrame.
if len(labels) > 0:
    sns.set(style="whitegrid")
    values = np.array(all_plots).T
    data = pd.DataFrame(values, index=all_x_labels, columns=labels)
    data = data.rolling(7).mean()
    # Wide-form input: seaborn derives hue and style from the column names.
    # Passing hue=/style= explicitly is rejected by current seaborn versions.
    ax = sns.lineplot(data=data, dashes=False, markers=True, linewidth=1.5)
    ax.xaxis.set_major_locator(plt.MaxNLocator(5))
    ax.set_title("Hashtag activity (7-sample rolling mean)")
else:
    print("No hashtag time series to plot.")

In [None]:
# Print activity comparison for suspicious hashtags: one line per flagged
# hashtag (at most `max_lines`), smoothed with a 7-sample rolling mean over
# the shared time axis.
offset = 0
max_lines = 8
all_x_labels = get_timestamp_range(collect_start, collect_end)
targets = susp_hashtags
all_plots = []
labels = []
for target in targets:
    if len(labels) >= max_lines:
        break
    if target not in full["hashtag_ts_data"]:
        continue
    labels.append(target)
    series = full["hashtag_ts_data"][target]
    # Re-index the series onto the shared time axis; missing slots count as 0.
    tc = dict(zip(series["time"], series["count"]))
    all_plots.append([tc.get(ts, 0) for ts in all_x_labels])
# Guard on the series actually collected, not on `targets`: flagged hashtags
# without time-series data would otherwise produce an empty DataFrame.
if len(labels) > 0:
    sns.set(style="whitegrid")
    values = np.array(all_plots).T
    data = pd.DataFrame(values, index=all_x_labels, columns=labels)
    data = data.rolling(7).mean()
    # Wide-form input: seaborn derives hue and style from the column names.
    # Passing hue=/style= explicitly is rejected by current seaborn versions.
    ax = sns.lineplot(data=data, dashes=False, markers=True, linewidth=1.5)
    ax.xaxis.set_major_locator(plt.MaxNLocator(5))
    ax.set_title("Suspicious hashtag activity (7-sample rolling mean)")

In [None]:
# Take the `num_plots * 2` most used hashtags, drop the first one, and for
# each remaining hashtag print a heading followed by the output of
# print_hashtag_amplifiers.
targets = [x for x, c in counters["hashtags"].most_common(num_plots*2)][1:]
min_r = 5
dc = ""
for ht_target in targets:
    heading = [
        "Users that amplified hashtag: #" + ht_target,
        " that used the hashtag at least " + str(min_r) + " times",
    ]
    # The creation-date clause is only mentioned when a cutoff is set.
    if len(dc) > 0:
        heading.append(" and whose account was created after " + dc)
    heading.append(".")
    msg = "".join(heading)
    print(msg)
    amps = print_hashtag_amplifiers(ht_target, full,  min_retweets=min_r, date_cutoff=dc)
    print("")

In [None]:
# For each flagged hashtag print a heading followed by the output of
# print_hashtag_amplifiers.
targets = susp_hashtags
min_r = 5
dc = ""
for ht_target in targets:
    heading = [
        "Users that amplified hashtag: #" + ht_target,
        " that used the hashtag at least " + str(min_r) + " times",
    ]
    # The creation-date clause is only mentioned when a cutoff is set.
    if len(dc) > 0:
        heading.append(" and whose account was created after " + dc)
    heading.append(".")
    msg = "".join(heading)
    print(msg)
    amps = print_hashtag_amplifiers(ht_target, full,  min_retweets=min_r, date_cutoff=dc)
    print("")

In [None]:
# Print activity comparison for some users: one line per account (at most
# `max_lines`, taken in descending order of the "users" counter), smoothed
# with a 7-sample rolling mean over the shared time axis.
offset = 0
max_lines = 8
all_x_labels = get_timestamp_range(collect_start, collect_end)
targets = [x for x, c in counters["users"].most_common()][offset:]
all_plots = []
labels = []
for target in targets:
    if len(labels) >= max_lines:
        break
    if target not in full["sn_ts_data"]:
        continue
    labels.append(target)
    series = full["sn_ts_data"][target]
    # Re-index the series onto the shared time axis; missing slots count as 0.
    tc = dict(zip(series["time"], series["count"]))
    all_plots.append([tc.get(ts, 0) for ts in all_x_labels])
if len(labels) > 0:
    sns.set(style="whitegrid")
    values = np.array(all_plots).T
    data = pd.DataFrame(values, index=all_x_labels, columns=labels)
    data = data.rolling(7).mean()
    # Wide-form input: seaborn derives hue and style from the column names.
    # Passing hue=/style= explicitly is rejected by current seaborn versions.
    ax = sns.lineplot(data=data, dashes=False, markers=True, linewidth=1.5)
    ax.xaxis.set_major_locator(plt.MaxNLocator(5))
    ax.set_title("User activity (7-sample rolling mean)")

In [None]:
# Print activity comparison for the accounts in `susp_ht_users`: one line per
# account (at most `max_lines`, in descending order of count), smoothed with a
# 7-sample rolling mean over the shared time axis.
offset = 0
max_lines = 8
all_x_labels = get_timestamp_range(collect_start, collect_end)
targets = [x for x, c in susp_ht_users.most_common()][offset:]
all_plots = []
labels = []
for target in targets:
    if len(labels) >= max_lines:
        break
    if target not in full["sn_ts_data"]:
        continue
    labels.append(target)
    series = full["sn_ts_data"][target]
    # Re-index the series onto the shared time axis; missing slots count as 0.
    tc = dict(zip(series["time"], series["count"]))
    all_plots.append([tc.get(ts, 0) for ts in all_x_labels])
if len(labels) > 0:
    sns.set(style="whitegrid")
    values = np.array(all_plots).T
    data = pd.DataFrame(values, index=all_x_labels, columns=labels)
    data = data.rolling(7).mean()
    # Wide-form input: seaborn derives hue and style from the column names.
    # Passing hue=/style= explicitly is rejected by current seaborn versions.
    ax = sns.lineplot(data=data, dashes=False, markers=True, linewidth=1.5)
    ax.xaxis.set_major_locator(plt.MaxNLocator(5))
    ax.set_title("Suspicious hashtag user activity (7-sample rolling mean)")

In [None]:
# Print retweet activity comparison for some accounts: one line per account
# (at most `max_lines`, in descending order of the "retweeted" counter),
# smoothed with a 7-sample rolling mean over the shared time axis.
offset = 0
max_lines = 8
all_x_labels = get_timestamp_range(collect_start, collect_end)
targets = [x for x, c in counters["retweeted"].most_common()][offset:]
all_plots = []
labels = []
for target in targets:
    if len(labels) >= max_lines:
        break
    if target not in full["rsn_ts_data"]:
        continue
    labels.append(target)
    series = full["rsn_ts_data"][target]
    # Re-index the series onto the shared time axis; missing slots count as 0.
    tc = dict(zip(series["time"], series["count"]))
    all_plots.append([tc.get(ts, 0) for ts in all_x_labels])
# Without at least one series there is nothing to put in the DataFrame.
if len(labels) > 0:
    sns.set(style="whitegrid")
    values = np.array(all_plots).T
    data = pd.DataFrame(values, index=all_x_labels, columns=labels)
    data = data.rolling(7).mean()
    # Wide-form input: seaborn derives hue and style from the column names.
    # Passing hue=/style= explicitly is rejected by current seaborn versions.
    ax = sns.lineplot(data=data, dashes=False, markers=True, linewidth=1.5)
    ax.xaxis.set_major_locator(plt.MaxNLocator(5))
    ax.set_title("Retweet activity per retweeted account (7-sample rolling mean)")
else:
    print("No retweet time series to plot.")

In [None]:
# Print activity comparison for some tweets: one line per tweet (at most
# `max_lines`, in descending order of `twid_count`), smoothed with a 7-sample
# rolling mean over the shared time axis.
offset = 0
max_lines = 8
all_x_labels = get_timestamp_range(collect_start, collect_end)
targets = [x for x, c in twid_count.most_common()][offset:]
all_plots = []
labels = []
for target in targets:
    if len(labels) >= max_lines:
        break
    if target not in full["rtwid_ts_data"]:
        continue
    # Legend entry: the tweet id plus the first 50 characters of its text,
    # flattened onto one line.
    tweet_text = twid_text[target].replace("\n", " ").replace("\r", " ")[:50]
    title = target + "\n" + tweet_text
    labels.append(title)
    series = full["rtwid_ts_data"][target]
    # Re-index the series onto the shared time axis; missing slots count as 0.
    tc = dict(zip(series["time"], series["count"]))
    all_plots.append([tc.get(ts, 0) for ts in all_x_labels])
# Without at least one series there is nothing to put in the DataFrame.
if len(labels) > 0:
    sns.set(style="whitegrid")
    values = np.array(all_plots).T
    data = pd.DataFrame(values, index=all_x_labels, columns=labels)
    data = data.rolling(7).mean()
    # Wide-form input: seaborn derives hue and style from the column names.
    # Passing hue=/style= explicitly is rejected by current seaborn versions.
    ax = sns.lineplot(data=data, dashes=False, markers=True, linewidth=1.5)
    ax.xaxis.set_major_locator(plt.MaxNLocator(5))
    ax.set_title("Tweet activity (7-sample rolling mean)")
else:
    print("No tweet time series to plot.")

In [None]:
# This prints the top n tweets seen in the data set (by the number of times we saw them shared).
# Here n is `num_counters` from the settings cell.
print_tweet_texts(twid_count, twid_text, twid_url, num_counters)

In [None]:
# Render the 20 highest-count tweets in `twid_count`.
for twid, count in twid_count.most_common(20):
    display(Tweet(twid_url[twid]))

In [None]:
# NOTE(review): `high_vol` (40 x collection days) is presumably the minimum
# volume for an account to be listed — confirm against print_most_amplified.
print("Most amplified accounts in the dataset.")
amps = print_most_amplified(full, high_vol, include_verified=True)

**Clustering**

In [None]:
# Partition the accounts into communities based on the retweet interaction
# map. This step requires python-igraph.
# sn_rep or sn_men can be passed instead of sn_rsn, depending on which kind
# of interaction you want the communities to be built from.
# Later cells iterate `clusters` as a mapping of cluster index -> members.
clusters = get_communities(sn_rsn)

In [None]:
# Print some of the most prominent accounts in each cluster.
# First pass: per cluster, collect its most retweeted members and the total
# number of retweets its members received. Second pass: print every cluster
# with more than 10 members and remember accounts above `threshold`.
threshold = high_vol
mon_in_cluster = {}
cluster_len = {}
cluster_retweets = {}
# Rank the retweeted accounts once; the ranking is identical for every
# cluster, so re-sorting the counter inside the loop was wasted work.
ranked_retweeted = [x for x, c in counters["retweeted"].most_common() if c > 0]
for index, names in clusters.items():
    if len(names) > 10:
        cluster_len[index] = len(names)
    # Set membership keeps the scan below cheap even if `names` is a list.
    member_set = set(names)
    top = set()
    for x in ranked_retweeted:
        if x in member_set:
            top.add(x)
        # Stops once more than 10 accounts were collected (i.e. at 11).
        if len(top) > 10:
            break
    rtc = 0
    for member in names:
        if member in rsn_sn:
            rtc += sum(rsn_sn[member].values())
    cluster_retweets[index] = rtc
    if len(top) > 0:
        mon_in_cluster[index] = top

summary = Counter()
cluster_names = {}
for index, count in cluster_len.items():
    names = mon_in_cluster[index] if index in mon_in_cluster else []
    print("")
    msg = "Cluster: " + str(index)
    msg += " [Members: " + str(count) + ", Retweet count: " + str(cluster_retweets[index]) + "]"
    print(msg)
    members = Counter()
    for member in names:
        rtc = sum(rsn_sn[member].values()) if member in rsn_sn else 0
        # Accounts above the volume threshold feed the influencer summary.
        if rtc > threshold:
            summary[member] = rtc
        print("https://twitter.com/" + member + "\t(" + str(rtc) + ")")
        members[member] = rtc
    # The most retweeted member gives the cluster its display name.
    if len(members) > 0:
        top_member, top_count = members.most_common(1)[0]
        cluster_names[index] = top_member

In [None]:
# A pie chart of the cluster sizes. Clusters that were given a name in the
# previous step (their most retweeted account) get a wedge of their own; all
# remaining clusters are merged into one wedge.
labels = []
sizes = []
node_count = sum(len(members) for members in clusters.values())
num_clusters = len(clusters)
other = other_c = other_rtc = 0
named_clusters = set()
named_cluster_c = Counter()
for cid, members in sorted(clusters.items()):
    size = len(members)
    if cid not in cluster_names:
        # Unnamed cluster: only contributes to the merged totals.
        other_c += 1
        other_rtc += cluster_retweets[cid]
        other += size
        continue
    rt_total = cluster_retweets[cid]
    named_clusters.add(cid)
    named_cluster_c[cid] = rt_total
    # Label format: "@name (members / retweets)".
    labels.append("@"+cluster_names[cid] + " (" + str(size) + " / " + str(rt_total) + ")")
    sizes.append(size)
if other_c > 0:
    labels.append( str(other_c) + " other clusters (" + str(other) + " / " + str(other_rtc) + ")")
    sizes.append(other)
fig = plt.figure(figsize=(18, 10))
ax = fig.add_axes((0, 0, .5, 1))
ax.set_title('Cluster distributions')
# `ax` is the current axes, so drawing through it targets the same plot.
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=0)
ax.axis('equal')
plt.show()

In [None]:
# Heatmap of overlaps between the 20 named clusters with the highest retweet
# counts, as computed by get_cluster_overlaps_partial.
top_named_clusters = set(x for x, c in named_cluster_c.most_common(20))
overlaps = get_cluster_overlaps_partial(clusters, sn_rsn, top_named_clusters)
# Rows and columns both follow the sorted cluster index order; tick labels
# are the cluster names cut to eight characters.
hm_indices = [label for label, oc in sorted(overlaps.items())]
hm_labels = [cluster_names[index][:8] for index in hm_indices]
hm_data = [
    [value for key, value in sorted(overlaps[index].items())]
    for index in hm_indices
]
sns.set(rc={'figure.figsize': (20, 10)})
sns.set(style="whitegrid")
hm = np.array(hm_data)
plt.figure()
ax = sns.heatmap(hm, annot=True, fmt="d", cmap="YlGnBu", cbar=False, xticklabels=hm_labels, yticklabels=hm_labels)
ax.xaxis.set_ticks_position('top')
ax.set_title("Cluster overlaps (top 20 clusters)")

In [None]:
# List every account that the cluster summary recorded in `summary`.
influencer_total = len(summary)
print("Found " + str(influencer_total) + " influencers from clustering.")
print_sn_counter(summary, influencer_total)

In [None]:
top_summary = [x for x, c in summary.most_common()]
for target in top_summary[:5]:
    print("")
    print("Full analysis for account: " + target)
    print("=====================================")
    print("")
    # Find the cluster that includes _target_ account
    selected_cluster = get_cluster_for_sn(target, clusters)
    print("There were " + str(len(selected_cluster)) + " accounts in the cluster with " + target)

    # Get details of all accounts in that cluster
    cluster_details = []
    for sn in selected_cluster:
        if sn in sn_details:
            cluster_details.append(sn_details[sn])
    print("Found details for " + str(len(cluster_details)) + " accounts.")

    # Save those userids and details
    if save_data == True:
        # Get the userids of those users if the accounts are
        # not verified or protected
        cluster_ids = set()
        for d in cluster_details:
            valid = True
            if d["protected"] == True:
                valid = False
            if d["verified"] == True:
                valid = False
            if valid == True:
                cluster_ids.add(d["id_str"])
        print("Retrieved " + str(len(cluster_ids)) + " IDs from data.")
        print("")
        with open(os.path.join(dirname, "ids_cluster_" + target + ".txt"), "w") as f:
            for id_str in cluster_ids:
                f.write(id_str+"\n")
        save_json(list(cluster_ids), os.path.join(dirname, "cluster_" + target + ".json"))
        save_json(cluster_details, os.path.join(dirname, "details_cluster_" + target + ".json"))
    
    # Show account age distribution
    cluster_acct_ages = Counter()
    for d in cluster_details:
        if "created_at" in d:
            ca = d["created_at"]
            yr = ca[-4:]
            mon = md[ca[4:7]]
            q = ""
            for qname, qvals in quarters.items():
                if int(mon) in qvals:
                    q = qname
            day = ca[8:10]
            ds = str(yr) + "-" + q
            cluster_acct_ages[ds] += 1
    plot_data = {}
    plot_data["labels"] = []
    plot_data["counts"] = []
    for label, count in sorted(cluster_acct_ages.items(), reverse=True):
        plot_data["labels"].append(label)
        plot_data["counts"].append(count)
    plot_data = trim_plot_data(plot_data, 0, 50)
    height = len(plot_data["counts"])/3
    sns.set(rc={'figure.figsize':(20,height)})
    sns.set(style="whitegrid")
    fig = plt.figure(figsize=(18,10))
    ax = sns.barplot(x="counts", y="labels", palette="husl", data=plot_data)
    for i, v in enumerate(plot_data["counts"]):
        pad = min(1.0, v/100)
        ax.text(v+pad, i+0.25, str(v), fontweight='bold')
    ax.set_title("Account ages in cluster: " + target)
    
    # Show some hashtags used by the cluster
    # Sum the per-account hashtag counts over every cluster member
    cluster_hts = Counter()
    sn_hashtag = full["sn_hashtag"]
    for member in selected_cluster:
        if member in sn_hashtag:
            cluster_hts.update(sn_hashtag[member])

    # The first n + 1 hashtags (by count) get a slice of their own;
    # everything after that is folded into a single "Other" slice.
    n = 30
    ranked = cluster_hts.most_common()
    named = ranked[:n + 1]
    remainder = ranked[n + 1:]

    plot_data = {
        "labels": ["#" + ht for ht, _ in named],
        "sizes": [c for _, c in named],
    }
    otherc = len(remainder)
    other = sum(c for _, c in remainder)
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

    fig = plt.figure(figsize=(12,7))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title(target + ' cluster hashtag breakdown')
    ax.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=0)
    ax.axis('equal')
    plt.show()
    
    # Copy the per-user counts of the cluster members out of counters["users"]
    # and chart the categories that categorize_users returns for them
    users = counters["users"]
    cluster_snc = Counter({
        member: users[member]
        for member in selected_cluster
        if member in users
    })
    plot_data = categorize_users(cluster_snc, timespan_d)
    fig = plt.figure(figsize=(8,5))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title(target + ' retweet breakdown')
    ax.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=140)
    ax.axis('equal')
    plt.show()
    
    # Show some tweets published by the cluster
    print("")
    print("Top tweets published by cluster including: " + target)
    print("==================================================")
    print("")
    # Sum the per-account tweet-id counts over all cluster members
    sn_twid = full["sn_twid"]
    twidc = Counter()
    for member in selected_cluster:
        if member in sn_twid:
            twidc.update(sn_twid[member])
    # Print and embed the ten tweet ids with the highest summed count
    for twid, hits in twidc.most_common(10):
        url = twid_url[twid]
        # Looked up but not displayed
        text = twid_text[twid]
        print("\t".join([str(hits), url, "[" + twid + "]"]))
        display(Tweet(url))
    print("")
    
    print("")
    print("Top URLs published by cluster including: " + target)
    print("==================================================")
    print("")
    # Show some urls published by the cluster:
    # sum the per-account URL counts over all cluster members
    sn_url = full["sn_url"]
    urlc = Counter()
    for member in selected_cluster:
        if member in sn_url:
            urlc.update(sn_url[member])
    print_counter(urlc, num_counters)
    print("")
    
    print("")
    print("Accounts retweeted by cluster including: " + target)
    print("==================================================")
    print("")
    # Sum, over all cluster members, the counts stored per retweeted account
    clrtw = Counter()
    for member in selected_cluster:
        if member in sn_rsn:
            clrtw.update(sn_rsn[member])
    print_sn_counter(clrtw, num_counters)
    print("")
    
    # Get details for the list of users that retweeted _target_
    print("")
    print("Retweet analysis for target: " + target)
    print("=========================================")
    print("")
    # Keys of rsn_sn[target] are the accounts that retweeted the target
    rtw = rsn_sn[target]
    rlist = list(rtw.keys())
    print("Found  " + str(len(rlist)) + " accounts that retweeted " + target)

    # Keep only the retweeters for which account details were collected
    rdetails = [sn_details[name] for name in rlist if name in sn_details]
    print("Found details for " + str(len(rdetails)) + " accounts.")

    # Save those userids and details
    if save_data == True:
        # Collect the user IDs of the retweeters, leaving out every account
        # whose details flag it as protected or verified
        relids = set()
        for details in rdetails:
            # Both flags are read for every account before deciding
            is_protected = details["protected"] == True
            is_verified = details["verified"] == True
            if not (is_protected or is_verified):
                relids.add(details["id_str"])
        print("Retrieved " + str(len(relids)) + " IDs from data.")
        # Plain-text dump: one ID per line
        ids_path = os.path.join(dirname, "ids_retweeted_" + target + ".txt")
        with open(ids_path, "w") as out:
            out.writelines(id_str + "\n" for id_str in relids)
        # The same IDs as a list, and the unfiltered details, go through save_json
        save_json(list(relids), os.path.join(dirname, "retweeted_" + target + ".json"))
        save_json(rdetails, os.path.join(dirname, "details_retweeted_" + target + ".json"))
    
    # Chart the categories categorize_users returns for the accounts
    # that retweeted the target
    retweeter_counts = rsn_sn[target]
    plot_data = categorize_users(retweeter_counts, timespan_d)
    fig = plt.figure(figsize=(8,5))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title(target + ' retweet breakdown')
    ax.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=140)
    ax.axis('equal')
    plt.show()
    
    # Sum the per-account hashtag counts over the accounts that retweeted
    # the target. These go into their own counter (rt_hts) so the chart below
    # shows only retweeter hashtags and cluster_hts is left untouched.
    rt_hts = Counter()
    sn_hashtag = full["sn_hashtag"]
    for sn in rlist:
        if sn in sn_hashtag:
            for ht, c in sn_hashtag[sn].items():
                rt_hts[ht] += c

    plot_data = {}
    plot_data["labels"] = []
    plot_data["sizes"] = []

    # The most used hashtags get a slice of their own (the <= test lets
    # n + 1 of them through); the rest is folded into one "Other" slice.
    n = 30
    other = 0
    otherc = 0
    for ht, c in rt_hts.most_common():
        if len(plot_data["labels"]) <= n:
            plot_data["labels"].append("#" + ht)
            plot_data["sizes"].append(c)
        else:
            otherc += 1
            other += c
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

    fig = plt.figure(figsize=(8,5))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title(target + ' retweeters hashtag breakdown')
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=0)
    plt.axis('equal')
    plt.show()
    
    # Show account age distribution
    # Tally the retweeter accounts per "<year>-<quarter>" bucket derived from
    # each account's created_at string.
    cluster_acct_ages = Counter()
    for d in rdetails:
        if "created_at" in d:
            ca = d["created_at"]
            # NOTE(review): the slices assume the year is the last four
            # characters and that ca[4:7] is a key of md - confirm against data.
            yr = ca[-4:]
            mon = md[ca[4:7]]
            mon_num = int(mon)
            q = ""
            # No break on purpose: if several quarters list this month,
            # the last matching one is kept (same as before).
            for qname, qvals in quarters.items():
                if mon_num in qvals:
                    q = qname
            ds = str(yr) + "-" + q
            cluster_acct_ages[ds] += 1
    # Labels are sorted in descending order so the newest buckets come first
    plot_data = {}
    plot_data["labels"] = []
    plot_data["counts"] = []
    for label, count in sorted(cluster_acct_ages.items(), reverse=True):
        plot_data["labels"].append(label)
        plot_data["counts"].append(count)
    plot_data = trim_plot_data(plot_data, 0, 50)
    height = len(plot_data["counts"])/3
    # NOTE(review): this rc figure size is overridden for this figure by the
    # explicit figsize passed to plt.figure below.
    sns.set(rc={'figure.figsize':(20,height)})
    sns.set(style="whitegrid")
    fig = plt.figure(figsize=(18,10))
    ax = sns.barplot(x="counts", y="labels", palette="husl", data=plot_data)
    # Write each count next to the end of its bar
    for i, v in enumerate(plot_data["counts"]):
        pad = min(1.0, v/100)
        ax.text(v+pad, i+0.25, str(v), fontweight='bold')
    ax.set_title("Ages of accounts that retweeted: " + target)
    plt.show()

    
    print("")
    print("Accounts retweeted by accounts that retweeted: " + target)
    print("==================================================")
    print("")
    # Sum, over all retweeters of the target, the counts stored per
    # retweeted account
    rrtw = Counter()
    for retweeter in rlist:
        if retweeter in sn_rsn:
            rrtw.update(sn_rsn[retweeter])
    print_sn_counter(rrtw, num_counters)
    print("")   
    
    # Show some tweets published by retweeters of target
    print("")
    print("Top tweets published by retweeters of: " + target)
    print("==================================================")
    print("")
    # Sum the per-account tweet-id counts over all retweeters
    sn_twid = full["sn_twid"]
    twidc = Counter()
    for retweeter in rlist:
        if retweeter in sn_twid:
            twidc.update(sn_twid[retweeter])
    # Print and embed the ten tweet ids with the highest summed count
    for twid, hits in twidc.most_common(10):
        url = twid_url[twid]
        # Looked up but not displayed
        text = twid_text[twid]
        print("\t".join([str(hits), url, "[" + twid + "]"]))
        display(Tweet(url))
    
    # Show some urls published by the retweeters of target
    print("")
    print("URLs published by retweeters of: " + target)
    print("==================================================")
    print("")
    # Sum the per-account URL counts over all retweeters
    sn_url = full["sn_url"]
    urlc = Counter()
    for retweeter in rlist:
        if retweeter in sn_url:
            urlc.update(sn_url[retweeter])
    print_counter(urlc, num_counters)
    
    print("")
    # Thresholds handed to print_target_amplifiers; an empty dc leaves the
    # creation-date clause out of the heading
    min_r = 5
    dc = ""
    heading = [
        "Users that retweeted " + target + " (" + str(len(rsn_sn[target])) + ")",
        " at least " + str(min_r) + " times",
    ]
    if dc:
        heading.append(" and whose account was created after " + dc)
    heading.append(".")
    msg = "".join(heading)
    print(msg)
    amps = print_target_amplifiers(target, full, min_retweets=min_r, date_cutoff=dc)