In [61]:
%matplotlib inline
import pandas as pd
import matplotlib as plt
plt.style.use('seaborn-whitegrid')
import numpy as np
import sys
import json
import os
import io
import re
from datetime import datetime, date, time, timedelta
import time

In [62]:
def twitter_time_to_object(time_string):
    twitter_format = "%a %b %d %H:%M:%S %Y"
    match_expression = "^(.+)\s(\+[0-9][0-9][0-9][0-9])\s([0-9][0-9][0-9][0-9])$"
    match = re.search(match_expression, time_string)
    if match is not None:
        first_bit = match.group(1)
        second_bit = match.group(2)
        last_bit = match.group(3)
        new_string = first_bit + " " + last_bit
        date_object = datetime.strptime(new_string, twitter_format)
        return date_object

def twitter_time_to_unix(time_string):
    return time_object_to_unix(twitter_time_to_object(time_string))

def time_object_to_unix(time_object):
    return int(time_object.strftime("%s"))

def get_utc_unix_time():
    dts = datetime.utcnow()
    epochtime = time.mktime(dts.timetuple())
    return epochtime

def seconds_since_twitter_time(time_string):
    input_time_unix = int(twitter_time_to_unix(time_string))
    current_time_unix = int(get_utc_unix_time())
    return current_time_unix - input_time_unix

In [63]:
def save_json(variable, filename):
    with io.open(filename, "w", encoding="utf-8") as f:
        f.write(unicode(json.dumps(variable, indent=4, ensure_ascii=False)))

def load_json(filename):
    ret = None
    if os.path.exists(filename):
        try:
            with io.open(filename, "r", encoding="utf-8") as f:
                ret = json.load(f)
        except:
            pass
    return ret

In [64]:
def get_data(fname):
    creation_dates = []
    suspicious = []
    users = []
    details = {}
    print("Loading " + fname)
    userlist = load_json(fname)
    if userlist is not None and len(userlist) > 0:
        print("Parsing " + fname)
        for user in userlist:
            users.append(user["screen_name"])
            fields = ["screen_name",
                      "description",
                      "friends_count",
                      "followers_count",
                      "name",
                      "location",
                      "favourites_count",
                      "statuses_count",
                      "created_at",
                      "id_str",
                      "protected",
                      "verified"]
            entry = {}
            for f in fields:
                if f in user:
                    entry[f] = user[f]
            acct_age = seconds_since_twitter_time(user["created_at"])
            if acct_age > 0:
                acct_age_days = float(acct_age)/86400.00
                entry["account_age_days"] = acct_age_days
                tweets_per_second = float(user["statuses_count"])/float(acct_age)
                tweets_per_day = tweets_per_second * (86400)
                entry["tweets_per_day"] = tweets_per_day
            details[user["screen_name"]] = entry
    print("Done.")
    print("Found " + str(len(users)) + " accounts.")
    return users, details

In [98]:
#names = ["greensboro_nc", "_north_carolina", "kimgarst", "thomaspower", "marshawright", "marshacollier"]
names = ["kimgarst", "thomaspower", "marshawright", "marshacollier"]

In [84]:
def get_details(name):
    filename = os.path.join("captures", "temp", name, name)
    if os.path.exists(filename):
        return get_data(filename)

In [96]:
def compare_sets(dataset, namelist):
    userlists = []
    for n in namelist:
        if n in dataset:
            userlists.append(set(dataset[n]["users"]))
    return list(set.intersection(*userlists))

In [100]:
datasets = {}
for n in names:
    print("Getting data for " + n)
    datasets[n] = {}
    users, details = get_details(n)
    datasets[n]["users"] = users
    datasets[n]["details"] = details

Getting data for kimgarst
Loading captures/temp/kimgarst/kimgarst
Parsing captures/temp/kimgarst/kimgarst
Done.
Found 596800 accounts.
Getting data for thomaspower
Loading captures/temp/thomaspower/thomaspower
Parsing captures/temp/thomaspower/thomaspower
Done.
Found 316600 accounts.
Getting data for marshawright
Loading captures/temp/marshawright/marshawright
Parsing captures/temp/marshawright/marshawright
Done.
Found 558200 accounts.
Getting data for marshacollier
Loading captures/temp/marshacollier/marshacollier
Parsing captures/temp/marshacollier/marshacollier
Done.
Found 227600 accounts.


In [102]:
from itertools import combinations
for x in range(2, len(names)):
    for comb in combinations(names, x):
        intersection = compare_sets(datasets, comb)
        print("Intersection between " + ",".join(comb) + " contained " + str(len(intersection)) + " accounts.")
all_intersection = compare_sets(datasets, names)
print("Intersection between " + ",".join(names) + " contained " + str(len(intersection)) + " accounts.")

Intersection between kimgarst,thomaspower contained 20750 accounts.
Intersection between kimgarst,marshawright contained 104315 accounts.
Intersection between kimgarst,marshacollier contained 91209 accounts.
Intersection between thomaspower,marshawright contained 11718 accounts.
Intersection between thomaspower,marshacollier contained 12942 accounts.
Intersection between marshawright,marshacollier contained 37662 accounts.
Intersection between kimgarst,thomaspower,marshawright contained 6282 accounts.
Intersection between kimgarst,thomaspower,marshacollier contained 8154 accounts.
Intersection between kimgarst,marshawright,marshacollier contained 27361 accounts.
Intersection between thomaspower,marshawright,marshacollier contained 4334 accounts.
Intersection between kimgarst,thomaspower,marshawright,marshacollier contained 4334 accounts.


In [106]:
pairs = [["kimgarst","marshawright"], ["kimgarst", "marshacollier"], ["kimgarst", "marshawright", "marshacollier"]]
for p in pairs:
    print("Comparing " + ",".join(p))
    intersection_names = compare_sets(datasets, p)
    intersection_details = []
    print("Getting details.")
    print("Got " + str(len(intersection_names)) + " accounts.")
    for name in intersection_names:
        intersection_details.append(datasets[p[0]]["details"][name])
    df = pd.DataFrame(intersection_details)
    filename = "int_details_" + "_".join(p) + ".csv"
    print("Saving: " + filename)
    df.to_csv(filename, encoding="utf-8")
    print("Done saving.")
print("All done.")

Comparing kimgarst,marshawright
Getting details.
Got 104315 accounts.
Saving: int_details_kimgarst_marshawright.csv
Done saving.
Comparing kimgarst,marshacollier
Getting details.
Got 91209 accounts.
Saving: int_details_kimgarst_marshacollier.csv
Done saving.
Comparing kimgarst,marshawright,marshacollier
Getting details.
Got 27361 accounts.
Saving: int_details_kimgarst_marshawright_marshacollier.csv
Done saving.
All done.
