# Bot Visualization

Import our libraries needed for the data handling.

In [1]:
import pandas as pd
import numpy as np
import json
import glob

In [2]:
#Set PANDAS to show all columns in DataFrame
pd.set_option('display.max_columns', None)
# and increase the max number of rows we can show
pd.set_option('display.max_rows', 100)

Libraries for stupid text encoding

In [3]:
from urllib2 import quote
# Unicode strings
from __future__ import unicode_literals

In [5]:
import feather

Import libraries needed for visualization.

In [4]:
import matplotlib.pyplot as plt
# Within notebook viewing
%matplotlib inline

import seaborn as sns
import bokeh

In [5]:
# Import for axes, color, etc
from pylab import *

Natural Language Processing

In [6]:
%run twokenize.py

In [7]:
# time deltas
from datetime import timedelta

### Directories

In [8]:
testDir = '../../data/external/trump-bots/'
botDir = '../../data/external/botresults/'
outDir = '../../data/processed/bot-tweets/'

Read in the data files by combining the extracted files.

In [9]:
# Crudely combine
process = []
for f in glob.glob((botDir + "*.txt")):
    with open(f, "rb") as infile:
        for line in infile:
            process.append(json.loads(line))
raw = pd.DataFrame.from_records(process)

del process

print (raw.shape)

(77722, 33)


In [11]:
feather.write_dataframe(raw, 'raw.feather')

FeatherError: Invalid: Unable to infer type of object array, were all null

##### Helper functions

In [12]:
def tknz(text):
    tokens = tokenizeRawTweetText(text)
    filt = [x for x in tokens if not (x.startswith('RT') 
                                   or x.startswith('@') or x.startswith(':') 
                                   or x.startswith('http://') or x.startswith('https://') 
                                  or x.startswith('-') or x.startswith('and') 
                                    or x.startswith('.') or x.startswith(',') 
                                    or x.startswith('?') 
                                     or x.startswith("'") or x == ('' or '"' or 'a'))]
    return filt

def hsh(tokens):
    # reads a list of tokens
    # tuped = tuple(tokens)
    tuped = ', '.join(tokens)
    hashed = hash(tuped)
    return hashed

# formatting for the datetime display
def timeDeltaDisplay(td):
    if (td.total_seconds() // 86400) < 1:
        return ('{} hours'.format((td.total_seconds() // 3600)))
    else:
        return ('{} days'.format(td.days))
    
def extractInfo(tweet):
    # User variables
    userID = tweet['user']['id_str']
    screenName = tweet['user']['screen_name']
    # Tweet Variables
    text = tweet['text']
    tokens = tknz(text)
    hashed = hsh(tokens)
    retweet = 1 if (type(tweet['retweeted_status']) == dict) else 0
    retweet_count = tweet['retweet_count']
    favorite_count = tweet['favorite_count']
    
    timestamp = tweet['timestamp_ms']
    
    return {'userID':userID, 'screenName':screenName, 'timestamp':timestamp, \
            'text':text, 'tokens':tokens, 'hash':hashed, 'retweet':retweet, \
           'retweet_count': retweet_count, 'favorite_count':favorite_count}

In [13]:
extracted = pd.DataFrame.from_records(raw.apply(lambda x: extractInfo(x), axis=1))

In [14]:
# throw out the tweets we can't classify
extracted = extracted[extracted['hash'] != 0]

# Calc variables
extracted['timestamp'] = pd.to_datetime(extracted['timestamp'], unit='ms')
extracted['token_str'] = extracted['tokens'].apply(lambda x: ', '.join(x))

extracted.head()

Unnamed: 0,favorite_count,hash,retweet,retweet_count,screenName,text,timestamp,tokens,userID,token_str
0,0,8401775961062800512,1,0,Scrufey21,RT @robinleeclark: Hillary is a murderer and a...,2015-07-06 00:12:43.046,"[Hillary, is, a, murderer, a, traitor, that's,...",490179072,"Hillary, is, a, murderer, a, traitor, that's, ..."
1,0,-7017692310915600254,1,0,Scrufey21,RT @PatriotMash: Clinton uses two aides and a ...,2015-07-06 00:17:23.293,"[Clinton, uses, two, aides, a, rope, to, lasso...",490179072,"Clinton, uses, two, aides, a, rope, to, lasso,..."
2,0,4819910249842125834,1,0,Scrufey21,RT @NoahWehrman: Press prepare for another day...,2015-07-06 00:19:58.259,"[Press, prepare, for, another, day, of, coveri...",490179072,"Press, prepare, for, another, day, of, coverin..."
3,0,8041086589735528761,0,0,CyberAnonymous,nytimes: Hillary Clinton reassures gay youth i...,2015-07-06 00:20:15.542,"[nytimes, Hillary, Clinton, reassures, gay, yo...",128334973,"nytimes, Hillary, Clinton, reassures, gay, you..."
4,0,-4371809372265221000,1,0,Scrufey21,RT @LessGovMoreFun: Truth be told: https://t....,2015-07-06 00:20:17.984,"[Truth, be, told]",490179072,"Truth, be, told"


In [16]:
agg = {
    'userID': {
        'uniq_users': pd.Series.nunique,
        'users': pd.Series.tolist
    }, 'timestamp': {
        'first_day': 'min',
        'last_day': 'max',
        # Calculate time delta # # ).days if (max(x)-min(x).hours > 24) else (max(x)-min(x)).hours
        'time_delta': lambda x: max(x)-min(x)
    }, 
    'hash': 'count',
    'favorite_count': {
        'total_favorites': 'sum',
        'avg_favorites': 'mean'
    },
    'retweet_count': {
        'total_retweets': 'sum',
        'avg_retweets': 'mean'
    }
}

aggregated = extracted.groupby(['token_str']).agg(agg)

In [17]:
# reset the index so it's easier to work with
aggregated.reset_index(col_level=1, inplace=True, drop=False)
aggregated.columns = ['token_str', 'first_day', 'time_delta', 'last_day', 'avg_retweets', 'total_retweets', 'uniq_users', 'users', \
                      'total_favorites', 'avg_favorites', 'count']    

# Turn the list into a set
aggregated['users'] = aggregated['users'].apply(lambda x: set(x))

aggregated.sort_values('count', inplace=True, ascending=False)
aggregated.head(1)

Unnamed: 0,token_str,first_day,time_delta,last_day,avg_retweets,total_retweets,uniq_users,users,total_favorites,avg_favorites,count
23714,"Donald, Trump, was, just, mentioned, during, E...",2015-10-15 01:18:02.734,121 days 22:38:49.538000,2016-02-13 23:56:52.272,0,0,1,{3920277023},0,0,1574


---

Now that we have our aggregated list of duplicate tweets, let's make a couple of `.csv`'s so that we can render `D3.js` timeline plots.

First we'll make a simple Gantt chart with the first and last Bot-Tweet appearance.

In [None]:
# write gantt chart to csv, use unicode formatting or else we'll get errors
gantt = 'gantt.csv'
#aggregated[['token_str', 'uniq_users', 'count', 'first_day', 'last_day', 'time_delta']].iloc[0:49].to_csv(gantt, index=False, encoding='utf-8')

In [25]:
worstOffenders = aggregated['token_str'].iloc[0:9].tolist()
# snippet to turn the groupby to a dict whose keys are group labels and vals are DF's
groupedTokens = extracted.groupby('token_str')

In [29]:
# iter through list and create a DF by concatenating groups
frames = []
for i in worstOffenders:
    frames.append(groupedTokens.get_group(i))
result = pd.concat(frames)

result.to_csv('top-10-bot-tweets.csv', index=False, encoding='utf-8')

In [2]:
aggregated[['token_str', 'time_delta', 'uniq_users', 'count']]\
    .sort_values('time_delta', ascending=false)\
    .head(25).style.format(timeDeltaDisplay, subset=['time_delta'])\
    .bar(subset=['count', 'total_favorites', 'total_retweets', 'time_delta', 'uniq_users'], color='#d65f5f')

NameError: name 'aggregated' is not defined

We've noticed some very interesting patterns! Several to note...

1. The time duration of a Bot's 'template' varies significantly! Thomas et Al. found that the majority of their bots had account durations less than 1 month (77% ?). These results indicate that while some campaigns are done in short bursts, an impactful portion of the bots (those who are producing the most tweets) stick around longer. Difference is because?
2. Templates, repitition in tweets. `@DonaldTrumpTVFan` is the the user with the most volume by far. It appears that he uses a template to distribute Trump appearances on TV.
3. User's. # unique users, etc.

In [40]:
# 2. Getting rid of @dnaldtrumptvfan we can look at tweets from users that arent him
# # Top tweets filtered
aggregated[aggregated['users'] != set([u'3920277023'])][['token_str', 'time_delta', 'uniq_users', 'count']].head(25)\
    .style.format(timeDeltaDisplay, subset=['time_delta'])\
    .bar(subset=['count', 'total_favorites', 'total_retweets', 'time_delta', 'uniq_users'], color='#d65f5f')

Unnamed: 0,token_str,time_delta,uniq_users,count
15245,"Dear, #MicroAggression, jump, on, the, #TrumpTrain",33 days,2,1362
2844,"#RealDonaldTrump, will, save, us",66 days,2,807
13732,"Check, out, Donald, Trump, Your, President, #Trump2016, #TrumpTrain, by, Rick, Poppe",101 days,50,146
45447,"What, can, we, do, to, make, America",1 days,1,123
1029,"#Clinton, Hillary, Clinton, releases, Web, ad, slamming, Donald, Trump, over, taxes",19.0 hours,1,103
26983,"Have, you, heard, Donald, Trump, Your, President, by, rickpoppe, on, #SoundCloud, #TrumpTrain, #Trump2016",101 days,50,99
828,"#Clinton, Hillary, Clinton, Why, Employers, Should, Give, Ex-Offenders, A, Second, Chance",22.0 hours,1,92
13733,"Check, out, Donald, Trump, Your, President, by, Rick, Poppe, #Trump2016, #TrumpTrain, …",98 days,48,88
20465,"Donald, Trump, Rally, in, Tampa, FL, at, USF, #TeamTrump, #Florida, #SouthCarolinaPrimary, #students, #WOMENFORTRUMP",2.0 hours,1,82
664,"#Clinton, Gary, Johnson, set, to, take, on, Donald, Trump, Hillary, Clinton, in, November",22.0 hours,1,81


In [51]:
# 3.
# # Look at Bot Tweet's with more than one uniq user.
# # Are there any tweets that come in bursts from a diverse group of users? <-- YES
# # # Filter out those tweets that dont have multiple users AND who all came within the same timeframe
aggregated[(aggregated['uniq_users'] > 1) & (aggregated['time_delta'] < timedelta(hours=10))][['token_str', 'time_delta', 'users', 'uniq_users', 'count']].head(25).sort_values('uniq_users', ascending=False)\
    .style.format(timeDeltaDisplay, subset=['time_delta'])\
    .bar(subset=['count', 'total_favorites', 'total_retweets', 'time_delta', 'uniq_users'], color='#d65f5f')

Unnamed: 0,token_str,time_delta,users,uniq_users,count
13242,"CLINTON, BROADSIDE, Trump, lashes, out, at, Hillary, for, selling, favors",5.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'1214284272', u'2976237925', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",19,20
10875,"ANYONE, BUT, HILLARY, Once, a, lock, Dems, now, looking, at, Clinton, options",3.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'490179072', u'1214284272', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",19,19
34227,"MONEY, PIT, Clinton, charity, will, keep, taking, foreign, cash, during, Hillary, run",0.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'1214284272', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",18,18
35558,"NRA, ON, BOARD, Gun, rights, group, throws, its, support, to, Donald, Trump",1.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'1214284272', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",18,18
29517,"Hillary, Clinton, signed, non-disclosure, agreement, to, protect, classified, info, while, secretary, of, state",0.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'1214284272', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",18,18
27500,"Hillary, Clinton's, car, for, sale",0.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'1214284272', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",18,18
25743,"Fox, News, Poll, New, high, for, Donald, Trump, new, low, for, Hillary, Clinton",0.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'1214284272', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",18,18
22994,"Donald, Trump, says, Bowe, Bergdahl, should, have, been, executed, VIDEO, Donald, Trump's, biggest, fan, explains, her, adm",0.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'1214284272', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",18,18
13161,"CAN, ‘, SPOILER, ’, TRUMP, DONALD, Kristol, touches, off, firestorm, with, claim, of, independent, candidate",0.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'1214284272', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",18,18
26835,"HISTORIC, NIGHT, FOR, HILLARY, In, capturing, NJ, primary, Clinton, becomes, 1st, woman, to, win, major, party's, president",0.0 hours,"set([u'1214297077', u'1214302297', u'372028801', u'1214307817', u'1214310979', u'1214290435', u'1214286984', u'1214293885', u'1214308945', u'1214286679', u'1214267976', u'1214306713', u'1214284272', u'1214278321', u'1214281278', u'1214288034', u'1214285328', u'1214299358'])",18,18


In [23]:
collusion = extracted[extracted['userID'].isin(set([u'1214297077', u'1214302297', u'372028801', u'1214307817', 
                                      u'1214310979', u'1214290435', u'1214286984', u'1214293885', 
                                      u'1214308945', u'1214286679', u'1214267976', u'1214306713', 
                                      u'490179072', u'1214284272', u'1214278321', u'1214281278', 
                                      u'1214288034', u'1214285328', u'1214299358']))]
# Double check
collusion['userID'].unique()

In [29]:
collusion.to_csv((outDir + 'collusion-network.csv'), index=False, encoding='utf-8')