In [None]:
from authentication_keys import get_account_credentials
from twitter_no_rl_tool import *
from gather_analysis_helper import *
from time_helpers import *
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API
from tweepy import Cursor
import os, json, sys
from collections import Counter
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
def auth():
    """Create an authenticated tweepy client from the stored account credentials.

    Returns:
        tuple: ``(acct_name, auth_api)`` -- the account name supplied by
        ``get_account_credentials()`` and an ``API`` object built on an
        ``OAuthHandler`` configured with that account's keys and tokens.
    """
    acct_name, *secrets = get_account_credentials()
    consumer_key, consumer_secret, access_token, access_token_secret = secrets

    # Named `handler` rather than `auth` so the local does not hide this function.
    handler = OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_token, access_token_secret)
    return acct_name, API(handler)

In [None]:
def get_tweets(auth_api, target):
    """Download every timeline tweet the API will return for ``target``.

    Pages through ``auth_api.user_timeline`` with a tweepy ``Cursor``, converts
    each status' raw JSON with ``get_tweet_details`` and rewrites a running
    count on one console line after every 100 tweets.

    Args:
        auth_api: Authenticated tweepy ``API`` object.
        target: Account identifier, passed to ``user_timeline`` as ``id``.

    Returns:
        list: One ``get_tweet_details`` entry per status, in the order the
        cursor yields them.
    """
    tweets = []
    print("")
    statuses = Cursor(auth_api.user_timeline, id=target).items()
    for count, status_obj in enumerate(statuses, start=1):
        tweets.append(get_tweet_details(status_obj._json))
        if count % 100 == 0:
            # "\r" returns to the start of the line so the counter overwrites itself.
            sys.stdout.write("\r%04d" % count)
            # Must be *called*: a bare `sys.stdout.flush` is a no-op, which left
            # the counter sitting in the buffer instead of being shown.
            sys.stdout.flush()
    return tweets

In [None]:
# Authenticate with the stored credentials; `auth_api` is the client the later
# cells use for their Twitter calls.
acct_name, auth_api = auth()
print("Signed on with: " + acct_name)

In [None]:
# Account to analyse. Blank in this template -- fill it in before running the
# cells below.
target = ""

# Data fetched for this account (user object, tweets, followers, friends) is
# saved under its own directory by the cells that follow.
dirname = os.path.join("../twitter_analysis/user_analysis", target)
save_dir_missing = not os.path.exists(dirname)
if save_dir_missing:
    print("Creating save dir: " + dirname)
    os.makedirs(dirname)

In [None]:
# Fetch the target's profile and keep its raw JSON next to the other saved data.
user_object_path = os.path.join(dirname, "user_object.json")
userobj = auth_api.get_user(target)
user = userobj._json
save_json(user, user_object_path)

In [None]:
# Summarise the profile held in `user`: selected raw fields, an "egg" flag,
# the account age and the average posting rate.
for line in ("Details about the account queried",
             "================================="):
    print(line)
print()

fields = ["id_str", "screen_name", "name", "created_at", "description", "location", "protected", "verified",
          "lang", "followers_count", "friends_count", "statuses_count"]
for field_name in fields:
    print(field_name + ": " + str(user[field_name]))

# "egg": both the profile settings and the profile image are still the defaults.
egg = bool(user["default_profile"] == True and user["default_profile_image"] == True)
print("egg: " + str(egg))

# Account age, derived from the creation timestamp, in hours / days / years.
current_unix = get_utc_unix_time()
ca_unix = twitter_time_to_unix(user["created_at"])
age_seconds = current_unix - ca_unix
age_hours = age_seconds / 3600
age_days = age_hours / 24
age_years = age_days/365
age_parts = [(age_hours, "(h) "), (age_days, "(d) "), (age_years, "(y)")]
msg = "Account age: " + "".join("%.2f"%value + unit for value, unit in age_parts)
print(msg)

# Average number of statuses published per hour and per day of account life.
sc = user["statuses_count"]
tweetsph = sc / age_hours
tweetspd = sc / age_days
print("Tweet speed: " + "%.2f"%tweetsph + "/hour " + "%.2f"%tweetspd + "/day")
print()
for line in ("egg is True if the account has default profile picture and profile settings",
             "statuses_count is the number of tweets published by the account"):
    print(line)
# XXX Other stuff to possibly add
# Likes per day
# Friends/followers ratio
# Retweets / replies / quotes / original ratios
# Most liked tweet
# Most retweeted users
# Most quoted users
# Most replied to users
# Image collage

In [None]:
# Use the tweets saved on disk when present; otherwise download them and save
# the result so later runs skip the (slow) download.
tweets = []
saved_tweets_file = os.path.join(dirname, "tweets.json")
if os.path.exists(saved_tweets_file):
    print("Loading saved tweets. Please delete " + saved_tweets_file + " if you want to fetch new tweets.")
    tweets = load_json(saved_tweets_file)
else:
    print("Retrieving tweets from Twitter.")
    tweets = get_tweets(auth_api, target)
    save_json(tweets, saved_tweets_file)
print("")
print("Got " + str(len(tweets)) + " tweets.")

In [None]:
# Run the helper's analysis over the collected tweets. Later cells read these
# keys from the result: "oldest", "newest", "timespan", "user_fields",
# "counters", the "sn_*" / "rsn_sn" / "rep_sn" / "twid_*" lookups and the
# "susp_orig_twids" / "susp_twids" id lists.
full = get_counters_and_interactions2(tweets)

In [None]:
# Report the period covered by the collected tweets.
collect_start, collect_end = (
    unix_time_to_readable(full[bound]) for bound in ("oldest", "newest")
)
print("Collection started on " + collect_start + " and ended on " + collect_end)

num_tweets = len(tweets)

# Express the covered timespan (seconds) in progressively larger units.
timespan_s = full["timespan"]
timespan_m = timespan_s/60
timespan_h = timespan_m/60
timespan_d = timespan_h/24
# NOTE(review): `timespan_10m` and `high_vol` are not read by any cell visible
# in this notebook; kept in case other code relies on them.
timespan_10m = timespan_s/600
high_vol = timespan_d*40
print("Collection duration: " + "%.2f"%timespan_d + " days.")

In [None]:
# Print the collected counters, leaving out the categories listed in `omit`.
uf = full["user_fields"]
counters = full["counters"]
omit = ['users', 'susp_users', 'amplifiers', 'mentioners', 'quoters', 'repliers', 'retweeters']
for omitted_name in omit:
    # pop() with a default is a no-op when the counter was never collected.
    counters.pop(omitted_name, None)

print("Below is a list of counters collected from the account's last " + str(num_tweets) + " tweets.")
print()
print("influencers = whenever the examined account interacts in any way (reply, mention, retweet) with another account, this counter is incremented")
print_counters(counters, uf, 50)

In [None]:
# Pull the lookup tables out of `full` under short names for the cells below.
# Screen-name keyed lookups (exact meaning is defined by the helper that built
# `full` -- TODO confirm).
sn_rsn, rsn_sn, sn_rep, rep_sn = (
    full[key] for key in ("sn_rsn", "rsn_sn", "sn_rep", "rep_sn")
)
# Lookups involving tweet ids; `twid_text` and `twid_url` are indexed by tweet
# id in the cells below.
twid_count, twid_rtc, twid_text, twid_url, twid_sn, sn_twid = (
    full[key]
    for key in ("twid_count", "twid_rtc", "twid_text", "twid_url", "twid_sn", "sn_twid")
)

In [None]:
# List the tweets whose ids are in full["susp_orig_twids"] as "<url>\t<text>".
for line in ("Suspicious original tweets",
             "==========================",
             "These can include:",
             " - tweets that were retweeted more than liked",
             " - tweets that look like follow trains",
             " - tweets that are mostly, or all hashtags, and an image"):
    print(line)
print()
for tweet_id in full["susp_orig_twids"]:
    print(twid_url[tweet_id] + "\t" + twid_text[tweet_id])

In [None]:
# List the tweets whose ids are in full["susp_twids"] as "<url>\t<text>".
for line in ("Suspicious retweets",
             "===================",
             "These can include:",
             " - tweets that were retweeted more than liked",
             " - tweets that look like follow trains",
             " - tweets that are mostly, or all hashtags, and an image"):
    print(line)
print()
for tweet_id in full["susp_twids"]:
    print(twid_url[tweet_id] + "\t" + twid_text[tweet_id])

In [None]:
# Print an explanatory header, then hand the retweet counts, texts and URLs to
# the helper with a limit of 20.
for line in ("Top 20 tweets that received the most retweets (from everyone on Twitter)",
             "The number preceding the tweet is the number of retweets it received."):
    print(line)
    print()
print_tweet_texts(twid_rtc, twid_text, twid_url, 20)

Interarrivals are counts of time deltas between tweets.

For each tweet we measure the time elapsed since the previous tweet (in seconds), count how many times each time delta is observed, and plot those counts. A large count for a single time delta can be indicative of automation.

In [None]:
# Bar chart of how often each inter-tweet time delta was observed.
plot_data = make_interarrivals_plot_data(tweets)

# Figure size first, then the grid style; the figure created below picks both up.
figure_rc = {'figure.figsize':(20,10)}
sns.set(rc=figure_rc)
sns.set(style="whitegrid")
plt.figure()
ax = sns.barplot(data=plot_data, x="deltas", y="counts", palette="husl")
ax.set_title("Interarrivals")

In [None]:
# Textual listing of per-day tweet counts; the helper pair computes the sorted
# counts and prints them.
print("Number of tweets observed per day")
print()
tweet_counts = get_sorted_tweet_counts(tweets)
print_tweet_counts(tweet_counts)

In [None]:
# Horizontal bar chart of tweet counts, one bar per entry in plot_data["date"].
plot_data = get_tweet_counts_plot_data(tweets)
num_cols = len(plot_data["date"])
print(num_cols)

# Scale the figure height with the number of bars so the labels stay legible.
height = num_cols/5
sns.set(rc={'figure.figsize':(20,height)})
sns.set(style="whitegrid")
plt.figure()
ax = sns.barplot(data=plot_data, x="count", y="date", palette="husl")

# Write each count just past the end of its bar.
for row, tweet_count in enumerate(plot_data["count"]):
    ax.text(tweet_count+1, row+0.25, str(tweet_count), fontweight='bold')
ax.set_title("Tweet counts")
# Optional: cap the number of y tick labels.
#ax.yaxis.set_major_locator(plt.MaxNLocator(20))

In [None]:
# Dump every collected tweet as "<created_at>\t<first 100 chars of text>\t<url>".
for line in ("All captured tweets from the target account, from most recent to least recent.",
             "Note - for accounts that have published more than 3200 tweets, this is roughly the last 3200 tweets."):
    print(line)
    print()
for t in tweets:
    twid = t["id_str"]
    # Truncate long texts so each tweet stays on a single readable row.
    row = (t["created_at"], twid_text[twid][:100], twid_url[twid])
    print(row[0]+ "\t" + row[1] + "\t" + row[2])

The following few cells show at which times of the day tweets were published. This can be useful for identifying the timezone the account holder lives in. All times are in UTC.

In [None]:
# Scatter plot of tweet counts by hour (x) and date (y); marker size and colour
# both encode the count.
plot_data = get_tweet_counts_scatter_plot_data(tweets)
sns.set(rc={'figure.figsize':(20,15)})
sns.set(style="whitegrid")
plt.figure()
ax = sns.scatterplot(
    data=plot_data,
    x="hour",
    y="date",
    hue="count",
    size="count",
    sizes=(50, 500),
    legend=False,
)
ax.xaxis.set_ticks_position('top')
ax.set_title("Tweet times (hour)")
# Limit the y axis to at most 20 tick labels.
ax.yaxis.set_major_locator(plt.MaxNLocator(20))

In [None]:
# Build the short heatmap from all tweets and print it; the next cell turns
# `heatmap` into an array and plots it with weekday initials as row labels.
heatmap = make_short_heatmap(tweets)
print_short_heatmap(heatmap)

In [None]:
# Plot the short heatmap as an annotated grid with weekday initials as row labels.
days = ["M", "T", "W", "T", "F", "S", "S"]
hm = np.array(heatmap)

sns.set(rc={'figure.figsize':(20,5)})
sns.set(style="whitegrid")
plt.figure()
ax = sns.heatmap(
    hm,
    annot=True,
    fmt="d",
    cmap="YlGnBu",
    cbar=False,
    yticklabels=days,
)
ax.xaxis.set_ticks_position('top')
ax.set_title("Combined heatmap")

In [None]:
# Build the long heatmap; a later cell iterates it as a mapping of
# week number -> heatmap matrix.
long_hm = make_long_heatmap(tweets)

In [None]:
# Text rendering of the long heatmap built in the previous cell.
print_long_heatmap(long_hm)

In [None]:
# One annotated heatmap per week, highest week numbers first, capped at 20 figures.
sns.set(rc={'figure.figsize':(20,5)})
sns.set(style="whitegrid")
days = ["M", "T", "W", "T", "F", "S", "S"]

count = 0
weeks_desc = sorted(long_hm.items(), reverse=True)
for count, (weeknum, htm) in enumerate(weeks_desc[:20], start=1):
    hm = np.array(htm)
    plt.figure()
    ax = sns.heatmap(hm, annot=True, fmt="d", cmap="YlGnBu", cbar=False, yticklabels=days)
    ax.xaxis.set_ticks_position('top')
    ax.set_title("Week number: " + str(weeknum))

In [None]:
# Print the "sources" counter (limit 100) under a short explanation.
for line in ("Sources - where the tweet originated",
             "This field can be used to occasionally identify accounts that use standard automation services such as IFTTT"):
    print(line)
print()
print_counter(counters["sources"], 100)

In [None]:
# Show source fields: pie chart of the `n` most common tweet sources, with
# every remaining source folded into a single "Other" slice.
cluster_hts = counters["sources"]

# Maximum number of individual slices. The previous `len(labels) <= n` test let
# n + 1 entries through before folding into "Other".
n = 25
top_entries = cluster_hts.most_common(n)
other = sum(cluster_hts.values()) - sum(c for _, c in top_entries)

plot_data = {
    "labels": [ht for ht, _ in top_entries],
    "sizes": [c for _, c in top_entries],
}
# Only add the aggregate slice when something was folded into it; an empty
# "Other (0)" wedge just clutters the chart.
if other > 0:
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

if plot_data["sizes"]:
    fig = plt.figure(figsize=(9,5))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title("Sources")
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=0)
    plt.axis('equal')
    plt.show()
else:
    # Nothing was counted, so there is no pie to draw.
    print("No sources to plot.")

In [None]:
# Show some hashtags: pie chart of the `n` most common hashtags, with every
# remaining hashtag folded into a single "Other" slice.
cluster_hts = counters["hashtags"]

# Maximum number of individual slices. The previous `len(labels) <= n` test let
# n + 1 entries through before folding into "Other".
n = 25
top_entries = cluster_hts.most_common(n)
other = sum(cluster_hts.values()) - sum(c for _, c in top_entries)

plot_data = {
    "labels": ["#" + ht for ht, _ in top_entries],
    "sizes": [c for _, c in top_entries],
}
# Only add the aggregate slice when something was folded into it; an empty
# "Other (0)" wedge just clutters the chart.
if other > 0:
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

if plot_data["sizes"]:
    fig = plt.figure(figsize=(18,10))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title("Hashtags")
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=0)
    plt.axis('equal')
    plt.show()
else:
    # Nothing was counted, so there is no pie to draw.
    print("No hashtags to plot.")

In [None]:
# Show the "retweeted" counter (presumably the accounts retweeted most often --
# TODO confirm): pie chart of the `n` most common entries, with the remainder
# folded into a single "Other" slice.
cluster_hts = counters["retweeted"]

# Maximum number of individual slices. The previous `len(labels) <= n` test let
# n + 1 entries through before folding into "Other".
n = 25
top_entries = cluster_hts.most_common(n)
other = sum(cluster_hts.values()) - sum(c for _, c in top_entries)

plot_data = {
    "labels": [ht for ht, _ in top_entries],
    "sizes": [c for _, c in top_entries],
}
# Only add the aggregate slice when something was folded into it; an empty
# "Other (0)" wedge just clutters the chart.
if other > 0:
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

if plot_data["sizes"]:
    fig = plt.figure(figsize=(18,10))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title("Retweeted")
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=0)
    plt.axis('equal')
    plt.show()
else:
    # Nothing was counted, so there is no pie to draw.
    print("No retweeted entries to plot.")

In [None]:
# Show some url domains: pie chart of the `n` most common domains, with every
# remaining domain folded into a single "Other" slice.
domains = counters["domains"]
cluster_hts = domains

# Maximum number of individual slices. The previous `len(labels) <= n` test let
# n + 1 entries through before folding into "Other".
n = 40
top_entries = cluster_hts.most_common(n)
other = sum(cluster_hts.values()) - sum(c for _, c in top_entries)

plot_data = {
    "labels": [ht for ht, _ in top_entries],
    "sizes": [c for _, c in top_entries],
}
# Only add the aggregate slice when something was folded into it; an empty
# "Other (0)" wedge just clutters the chart.
if other > 0:
    plot_data["labels"].append("Other (" + str(other) + ")")
    plot_data["sizes"].append(other)

if plot_data["sizes"]:
    fig = plt.figure(figsize=(18,10))
    ax = fig.add_axes((0,0,.5,1))
    ax.set_title("Domains")
    plt.pie(plot_data["sizes"], labels=plot_data["labels"], autopct='%1.1f%%', startangle=0)
    plt.axis('equal')
    plt.show()
else:
    # Nothing was counted, so there is no pie to draw.
    print("No domains to plot.")

In [None]:
# Use the follower details saved on disk when present; otherwise fetch them
# and save the result for later runs.
follower_dets_fn = os.path.join(dirname, "follower_details.json")
follower_details = []
if not os.path.exists(follower_dets_fn):
    print("Retrieving followers from Twitter")
    follower_details = get_follower_data_sn(target)
    save_json(follower_details, follower_dets_fn)
else:
    print("Loading follower details from: " + follower_dets_fn)
    follower_details = load_json(follower_dets_fn)
print("Found: " + str(len(follower_details)) + " followers.")

In [None]:
# Show account age distribution: followers bucketed by creation year and quarter.
quarters = {"Q1": [1, 2, 3],
            "Q2": [4, 5, 6],
            "Q3": [7, 8, 9],
            "Q4": [10, 11, 12]}
# Invert the table once so a month number maps straight to its quarter name.
month_to_quarter = {month: qname for qname, months in quarters.items() for month in months}

# One count per follower screen name; passed to print_alt_summary_list later.
flrtw = Counter()
cluster_acct_ages = Counter()
for d in follower_details:
    flrtw[d["screen_name"]] += 1
    if "created_at" not in d:
        continue
    ca = d["created_at"]
    # Year = last four characters; month abbreviation = characters 4-6.
    yr = ca[-4:]
    # NOTE(review): `md` is not defined in this notebook -- presumably a
    # month-abbreviation -> month-number map from a star import; confirm.
    mon = md[ca[4:7]]
    q = month_to_quarter.get(int(mon), "")
    ds = str(yr) + "-" + q
    cluster_acct_ages[ds] += 1

# Newest bucket first, then trimmed by the helper.
buckets_desc = sorted(cluster_acct_ages.items(), reverse=True)
plot_data = {
    "labels": [label for label, _ in buckets_desc],
    "counts": [total for _, total in buckets_desc],
}
plot_data = trim_plot_data(plot_data, 0, 50)

# Scale the figure height with the number of bars.
height = len(plot_data["counts"])/3
sns.set(rc={'figure.figsize':(20,height)})
sns.set(style="whitegrid")
plt.figure()
ax = sns.barplot(data=plot_data, x="counts", y="labels", palette="husl")
for row, total in enumerate(plot_data["counts"]):
    # Small gap between the bar end and its label, at most 1.0 data units.
    pad = min(1.0, total/100)
    ax.text(total+pad, row+0.25, str(total), fontweight='bold')
ax.set_title("Ages of accounts that follow: " + target)

In [None]:
# Legend for the summary columns, then the summary of followers filtered by
# the cutoff settings below.
for line in ("Field description:",
             "sc = number of tweets published",
             "fl = number of followers",
             "fr = number of accounts following",
             "egg = true, if the account has default profile and profile picture",
             "ca = date account was created"):
    print(line)
print()
min_retweets, date_cutoff = 1, "2019_10"
print("Accounts that follow " + target + " that were created after " + date_cutoff)
s = print_alt_summary_list(
    follower_details,
    flrtw,
    min_retweets=min_retweets,
    date_cutoff=date_cutoff,
)

In [None]:
# Follower factory plot: scatter each follower's account-creation time (y)
# against its position in the reversed follower list (x).
#
# Reverse a *copy*. The old code bound `follower_details_rev` to the same list
# object and reversed it in place, which silently reordered `follower_details`
# for every later cell and flipped the plot each time this cell was re-run.
follower_details_rev = list(reversed(follower_details))

plot_data = {"index": [], "timestamp": []}
for d in follower_details_rev:
    # Entries without a creation date cannot be placed on the y axis; the
    # age-distribution cells guard against the same missing key.
    if "created_at" not in d:
        continue
    plot_data["index"].append(len(plot_data["index"]))
    plot_data["timestamp"].append(twitter_time_to_unix(d["created_at"]))

df = pd.DataFrame({"x": plot_data["index"],
                   "y": plot_data["timestamp"]})
df.plot.scatter("x", "y", figsize=(20,5), s=0.1, title="Follower Factory: " + target)

In [None]:
# Use the friends details saved on disk when present; otherwise fetch them and
# save the result for later runs.
friends_dets_fn = os.path.join(dirname, "friends_details.json")
friends_details = []
if not os.path.exists(friends_dets_fn):
    print("Retrieving friends from Twitter")
    friends_details = get_friends_data_sn(target)
    save_json(friends_details, friends_dets_fn)
else:
    print("Loading friends details from: " + friends_dets_fn)
    friends_details = load_json(friends_dets_fn)
print("Found: " + str(len(friends_details)) + " friends.")

In [None]:
# Show account age distribution: friends bucketed by creation year and quarter.
quarters = {"Q1": [1, 2, 3],
            "Q2": [4, 5, 6],
            "Q3": [7, 8, 9],
            "Q4": [10, 11, 12]}
# Invert the table once so a month number maps straight to its quarter name.
month_to_quarter = {month: qname for qname, months in quarters.items() for month in months}

# One count per friend screen name; passed to print_alt_summary_list later.
frrtw = Counter()
cluster_acct_ages = Counter()
for d in friends_details:
    frrtw[d["screen_name"]] += 1
    if "created_at" not in d:
        continue
    ca = d["created_at"]
    # Year = last four characters; month abbreviation = characters 4-6.
    yr = ca[-4:]
    # NOTE(review): `md` is not defined in this notebook -- presumably a
    # month-abbreviation -> month-number map from a star import; confirm.
    mon = md[ca[4:7]]
    q = month_to_quarter.get(int(mon), "")
    ds = str(yr) + "-" + q
    cluster_acct_ages[ds] += 1

# Newest bucket first, then trimmed by the helper.
buckets_desc = sorted(cluster_acct_ages.items(), reverse=True)
plot_data = {
    "labels": [label for label, _ in buckets_desc],
    "counts": [total for _, total in buckets_desc],
}
plot_data = trim_plot_data(plot_data, 0, 50)

# Scale the figure height with the number of bars.
height = len(plot_data["counts"])/3
sns.set(rc={'figure.figsize':(20,height)})
sns.set(style="whitegrid")
plt.figure()
ax = sns.barplot(data=plot_data, x="counts", y="labels", palette="husl")
for row, total in enumerate(plot_data["counts"]):
    # Small gap between the bar end and its label, at most 1.0 data units.
    pad = min(1.0, total/100)
    ax.text(total+pad, row+0.25, str(total), fontweight='bold')
ax.set_title("Ages of accounts that : " + target + " follows.")

In [None]:
# Summary of the accounts the target follows, filtered by the cutoff settings below.
min_retweets, date_cutoff = 1, "2019_10"
print("Accounts that " + target + " follows that were created after " + date_cutoff)
s = print_alt_summary_list(
    friends_details,
    frrtw,
    min_retweets=min_retweets,
    date_cutoff=date_cutoff,
)

In [None]:
# Compare followers with friends by screen name and list the accounts in both.
followersns = {d["screen_name"] for d in follower_details}
followerids = {d["id_str"] for d in follower_details}
friendsns = {d["screen_name"] for d in friends_details}
friendids = {d["id_str"] for d in friends_details}

# Screen name -> details record; a friend record replaces a follower record
# when the same screen name appears in both lists.
details_dict = {d["screen_name"]: d for d in follower_details}
details_dict.update((d["screen_name"], d) for d in friends_details)

union = followersns | friendsns
intersection = followersns & friendsns
print(target + " has " + str(len(intersection)) + " followers that are also friends.")
print_sn_list(intersection)

In [None]:
# Details records of the accounts that are both followers and friends.
int_details = [details_dict[sn] for sn in intersection]

# Show account age distribution: mutual accounts bucketed by creation year and quarter.
quarters = {"Q1": [1, 2, 3],
            "Q2": [4, 5, 6],
            "Q3": [7, 8, 9],
            "Q4": [10, 11, 12]}
# Invert the table once so a month number maps straight to its quarter name.
month_to_quarter = {month: qname for qname, months in quarters.items() for month in months}

# One count per mutual screen name; passed to print_alt_summary_list later.
intrtw = Counter()
cluster_acct_ages = Counter()
for d in int_details:
    intrtw[d["screen_name"]] += 1
    if "created_at" not in d:
        continue
    ca = d["created_at"]
    # Year = last four characters; month abbreviation = characters 4-6.
    yr = ca[-4:]
    # NOTE(review): `md` is not defined in this notebook -- presumably a
    # month-abbreviation -> month-number map from a star import; confirm.
    mon = md[ca[4:7]]
    q = month_to_quarter.get(int(mon), "")
    ds = str(yr) + "-" + q
    cluster_acct_ages[ds] += 1

# Newest bucket first, then trimmed by the helper.
buckets_desc = sorted(cluster_acct_ages.items(), reverse=True)
plot_data = {
    "labels": [label for label, _ in buckets_desc],
    "counts": [total for _, total in buckets_desc],
}
plot_data = trim_plot_data(plot_data, 0, 50)

# Scale the figure height with the number of bars.
height = len(plot_data["counts"])/3
sns.set(rc={'figure.figsize':(20,height)})
sns.set(style="whitegrid")
plt.figure()
ax = sns.barplot(data=plot_data, x="counts", y="labels", palette="husl")
for row, total in enumerate(plot_data["counts"]):
    # Small gap between the bar end and its label, at most 1.0 data units.
    pad = min(1.0, total/100)
    ax.text(total+pad, row+0.25, str(total), fontweight='bold')
ax.set_title("Ages of accounts that: " + target + " follows and followed.")

In [None]:
# Summary of the mutual (follower and friend) accounts, filtered by the cutoff
# settings below.
min_retweets, date_cutoff = 1, "2019_10"
print("Accounts that both follow and are followed by " + target + " and were created after " + date_cutoff)
s = print_alt_summary_list(
    int_details,
    intrtw,
    min_retweets=min_retweets,
    date_cutoff=date_cutoff,
)