# Clean tweets

In [None]:
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""

# Load packages
import pandas as pd
import nltk
import csv
import re
pd.options.mode.chained_assignment = None


# Load data
# NOTE(review): the file is decoded as latin-1, which accepts any byte sequence without
# raising; if the CSV was actually written as UTF-8, non-ASCII characters (emoji, accents)
# will be silently mis-decoded - confirm the encoding of the source file.
tweets = pd.read_csv(
    '../Data/twitter_covid_feb_small.csv',
    encoding='latin-1',
)


### Clean tweets ###


## Convert full text to string ##

# Not every value in 'full_text' is read in as a string (some come through as floats), and
# the word-level cleaning below needs text, so work on a string-typed copy of the column.
tweets['full_text_str'] = tweets['full_text'].astype(str)


## Remove punctuation ##

# Currently disabled. To replace punctuation with spaces, use:
# tweets['full_text_letters'] = tweets['full_text'].apply(lambda x : re.sub(r'[^a-zA-Z0-9 ]',' ',str(x)))
# Use r'[^a-zA-Z ]' instead to strip digits as well; digits are kept here so that terms
# such as '5G' survive.


## Convert all words to lower case ##

def _lowercase_words(text):
    """Return *text* with every whitespace-separated word lower-cased, joined by single spaces."""
    return ' '.join(word.lower() for word in text.split())


# Normalise case so that e.g. 'Love' and 'love' are treated as the same term
# (swap in word.upper() for upper case).
tweets['full_text_lower'] = tweets['full_text_str'].apply(_lowercase_words)
tweets.drop(columns=['full_text_str'], inplace=True)


## Remove stop words ##


# Remove common words such as 'a', 'the', 'on' that add little to the meaning of a text
from nltk.corpus import stopwords
stop = stopwords.words("english") # Define stopwords (kept as a list for any later use)
# Testing membership against a list scans it for every word of every tweet; a set holds the
# same words but answers each lookup in constant time, with identical results.
_stop_set = set(stop)
tweets['full_text_stop'] = tweets['full_text_lower'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in _stop_set])
) # Remove from tweet
del tweets['full_text_lower']


## Replace abbreviations ##


# Convert terms such as OMG to Oh My God - I have also included RT as retweet or HT as hattip
# Code From: https://medium.com/nerd-stuff/python-script-to-turn-text-message-abbreviations-into-actual-phrases-d5db6f489222
# Parsed slang files keyed by path, so each file is read from disk at most once per session.
_SLANG_CACHE = {}


def _load_slang(path):
    """Return the {ABBREVIATION: phrase} mapping parsed from the slang file at *path*.

    The file is read as CSV with "=" as the delimiter, so the abbreviation is the first
    field and the replacement phrase the second. Results are cached per path; edits made
    to the file after the first call are not picked up.
    """
    if path not in _SLANG_CACHE:
        mapping = {}
        with open(path, "r") as slang_file:
            for row in csv.reader(slang_file, delimiter="="):
                # Blank lines and lines without an "=" have no phrase to substitute.
                if len(row) < 2:
                    continue
                # If an abbreviation is listed more than once, the last entry wins.
                mapping[row[0]] = row[1]
        _SLANG_CACHE[path] = mapping
    return _SLANG_CACHE[path]


def translator(user_string, slang_path="./slang.txt"):
    """Expand text-message abbreviations in *user_string* into full phrases.

    The string is split on single spaces. Each word is stripped of non-alphanumeric
    characters and upper-cased, then looked up among the abbreviations in the slang
    file; a word that matches is replaced by its phrase, any other word is left
    exactly as it was (punctuation included).

    Parameters
    ----------
    user_string : str
        Text to translate.
    slang_path : str, optional
        Path to the "ABBREVIATION=phrase" file. Defaults to "./slang.txt".

    Returns
    -------
    str
        The words re-joined with single spaces.

    Raises
    ------
    OSError
        If the slang file cannot be opened.
    """
    words = user_string.split(" ")
    slang = _load_slang(slang_path)
    for j, word in enumerate(words):
        # Removing special characters so that e.g. "omg!" still matches "OMG".
        key = re.sub('[^a-zA-Z0-9]+', '', word).upper()
        if key in slang:
            words[j] = slang[key]
    return ' '.join(words)

tweets['full_text_abbr'] = tweets['full_text_stop'].apply(translator)
tweets.drop(columns=['full_text_stop'], inplace=True)


## Normalising language ##

# The same term can appear in several inflected forms (different tenses etc.), which are
# otherwise treated as separate words. Two options for collapsing them:
#
# 1. Stemming
#    Strip endings (-ing, -ed, -s etc.) to reach a base/root, e.g. waiting becomes wait
#    Pros: computationally efficient
#    Cons: root terms may be less obvious, e.g. loving becomes lov
#
#ps = PorterStemmer()
#tweets['Step3_SentimentText'] = tweets['full_text_stop'].apply(lambda x: ' '.join([ps.stem(word) for word in x.split() ]))
#
# 2. Lemmatization
#    Convert terms to their root dictionary form (lemma), e.g. runs, running and ran -> run
#    Pros: roots are valid words, which gives more context when interpreting them
#    Cons: needs more memory to run and does not always reach the root word
#
# Lemmatization is used here because its output is more useful for interpretation.

# nltk.download() # To install WordNet corpora
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()


def _lemmatize_as_verbs(text):
    """Lemmatize each whitespace-separated word of *text* with POS 'v' and re-join with spaces."""
    return ' '.join(lmtzr.lemmatize(word, 'v') for word in text.split())


tweets['full_text_cleaned'] = tweets['full_text_abbr'].apply(_lemmatize_as_verbs)
tweets.drop(columns=['full_text_abbr'], inplace=True)


## Parts of speech tagging ##

# Disabled: would tag each term as noun, pronoun, verb etc.
#tweets['full_text_pos'] = tweets['full_text_cleaned'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))


## Tokenise data ##

# Disabled: would break tweets up into separate words, which is necessary to analyse and
# identify individual terms.
#from nltk.tokenize import word_tokenize
#tokens = tweets.full_text_cleaned.apply(word_tokenize)


## Save ##

# Save as csv file
# NOTE(review): the DataFrame index is written as an extra unnamed first column; confirm
# downstream readers expect it.
tweets.to_csv('../Data/twitter_covid_feb_small_cleaned.csv')


# bots

In [None]:
#########################
##### Identify bots #####
#########################

# Purpose: To classify Twitter users as bots or not.

# Libraries
# devtools::install_github("mkearney/botrnot") # This didn't work for me so needed to follow https://github.com/mkearney/tweetbotornot/issues/24
# remove.packages("tweetbotornot")
# install_github("markagreen/tweetbotornot", dependencies = TRUE) # Anyone should be able to install this
library(tweetbotornot)
#devtools::install_github("mkearney/rtweet")
library(rtweet)
library(dplyr)
library(httr)

# Set up Twitter details
# Credentials are read from environment variables (e.g. defined in ~/.Renviron) instead of
# being written into the script, so they are not exposed to everyone who can read the source.
# NOTE(review): the keys that used to be hard-coded here have been shared along with this
# file and should be revoked/regenerated in the Twitter developer portal.
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")
app <- "Liv_misinformation_study"

# Sys.getenv() returns "" for an unset variable; stop here with a clear message rather
# than failing later with an authentication error.
if (any(c(consumer_key, consumer_secret, access_token, access_secret) == "")) {
  stop("Twitter credentials not found: set TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, ",
       "TWITTER_ACCESS_TOKEN and TWITTER_ACCESS_SECRET before running this script.")
}

# Create the rtweet token used for the API calls below
token <- create_token(
  app = app,
  consumer_key = consumer_key,
  consumer_secret = consumer_secret,
  access_token = access_token,
  access_secret = access_secret,
  set_renv = TRUE)

# Do not leave the credentials lying around in the workspace
rm(consumer_key, consumer_secret, access_secret, access_token, app)

# Create list of usernames in project
# Each monthly file is loaded in turn, its usernames are de-duplicated and appended to the
# running list, and the combined list is de-duplicated again so that a user who tweeted in
# several months appears once.
month_files <- c(
  "../Data/twitter_covid_jan_small_cleaned.csv", # January
  "../Data/twitter_covid_feb_small_cleaned.csv", # February
  "../Data/twitter_covid_mar_small_cleaned.csv"  # March
)

users <- NULL
for (month_file in month_files) {
  tweets <- read.csv(month_file, header = TRUE) # Load data
  temp <- distinct(as.data.frame(tweets$username)) # Subset usernames, dropping repeats within the month
  if (is.null(users)) {
    users <- temp # First month starts the list
  } else {
    users <- distinct(rbind(users, temp)) # Join on to longer list and remove duplicates again
  }
  rm(tweets, temp) # Delete to free memory before the next month
}
rm(month_files, month_file)

# Save
write.csv(users, "./users.csv")

# Estimate likelihood of being a bot

# Running code after running above (in-memory alternative to re-loading the csv)
# user_list <- as.character(users$`tweets$username`)
# predict_bots <- tweetbotornot(user_list, fast = TRUE)

# Loading in csv file
users <- read.csv("./users.csv")
# write.csv() stores the row names as the first column, and read.csv() rewrites headers that
# are not valid R names (a header of "tweets$username" comes back as "tweets.username"), so
# the username column cannot be relied on to be called "x". The usernames are the last
# column of the file, so select them by position instead of by a hard-coded name.
user_list <- as.character(users[[ncol(users)]])

# Split data into subsets of at most 90k users as can only run that many at a time
# (Twitter limit). Batches are derived from the actual number of users rather than from
# hard-coded index ranges, so the code still covers everyone if the list changes size.
batch_size <- 90000
user_batches <- split(user_list, ceiling(seq_along(user_list) / batch_size))

# Predict bot likelihood, one batch at a time
# NOTE(review): batches are submitted back-to-back; if the 90k-per-15-minutes limit applies
# across calls, a pause between batches may be needed - confirm.
predict_bots_batches <- vector("list", length(user_batches))
for (i in seq_along(user_batches)) {
  predict_bots_batches[[i]] <- tweetbotornot(user_batches[[i]], fast = TRUE)
}

# Join back together
predict_bots <- do.call(rbind, predict_bots_batches)
write.csv(predict_bots, "./predict_bots.csv") # Save
rm(list = ls()) # Remove all objects from the workspace
gc()

# Note:
# The default [gradient boosted] model uses both users-level (bio, location, number of followers and friends, etc.) and tweets-level (number of hashtags, mentions, capital letters, etc. in a user’s most recent 100 tweets) data to estimate the probability that users are bots. For larger data sets, this method can be quite slow. Due to Twitter’s REST API rate limits, users are limited to only 180 estimates per every 15 minutes.
# To maximize the number of estimates per 15 minutes (at the cost of being less accurate), use the fast = TRUE argument. This method uses only users-level data, which increases the maximum number of estimates per 15 minutes to 90,000! Due to losses in accuracy, this method should be used with caution!
# I will use the fast method first (at 180 users per 15 minutes the default method would take ~30 days for this user list), and then update with the slower method when it can be run for longer

# Test code
# Small hand-picked set of accounts for trying out the classifier with the default
# (slower, fast = FALSE) method.
users <- c(
  "realdonaldtrump",
  "netflix_bot",
  "kearneymw",
  "dataandme",
  "hadleywickham",
  "ma_salmon",
  "juliasilge",
  "tidyversetweets",
  "American__Voter",
  "mothgenerator",
  "hrbrmstr"
)

## get botornot estimates
bot_list <- tweetbotornot(users, fast = FALSE)


# Load Data

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun  8 15:35:13 2020

@author: markagreen
"""

## To load the full JSONL file
##
#import json_lines
#
#def load_jsonl(file):
#    tweets = []
#    f=2
#    with open(file, 'rb') as f:
#        for tweet in json_lines.reader(f, broken=True):
#            tweets.append(tweet)
#    return (tweets)
#
#tweets = load_jsonl('../January/twitter_covid_jan_covidiots.jsonl') # Select file
#print(tweets[0]) # Print first tweet

# Load in data efficiently

import json_lines
import pprint

# To visualise JSONL structure better
def prettyprint(d, indent=0):
    """Print the nested dict *d* as a tab-indented tree.

    Each key is printed on its own line at *indent* tabs. A value that is itself a
    dict is printed recursively one level deeper; any other value is printed via
    ``str()`` on the following line, one tab deeper than its key.
    """
    key_prefix = '\t' * indent
    value_prefix = '\t' * (indent + 1)
    for name, content in d.items():
        print(key_prefix + str(name))
        if not isinstance(content, dict):
            print(value_prefix + str(content))
            continue
        prettyprint(content, indent + 1)

# Loads in tweets one by one
def _reduce_tweet(tweet):
    """Return a small dict holding only the fields of *tweet* kept for analysis.

    Always present: created_at, id, username, user_id, text, full_text.
    Present only when available in the raw tweet: country, region (taken from the
    first entry of the user's derived locations) and retweeted_user (nested dict
    with the user_id and username of the retweeted account).
    """
    user = tweet['user']
    reduced_tweet = { # Store key details
        'created_at': tweet['created_at'], # Time and date of tweet
        'id': tweet['id_str'], # Unique ID of Tweet
        'username': user['screen_name'], # Username of Twitter profile
        'user_id': user['id_str'], # Unique ID for Twitter profile
        'text': tweet['text'], # Text of tweet (may be cut off for longer tweets)
    }

    # Longer tweets carry their complete text under 'extended_tweet', either on the tweet
    # itself or, for a retweet, on the retweeted tweet; otherwise fall back to 'text'.
    if 'extended_tweet' in tweet:
        reduced_tweet['full_text'] = tweet['extended_tweet']['full_text']
    elif 'retweeted_status' in tweet and 'extended_tweet' in tweet['retweeted_status']:
        reduced_tweet['full_text'] = tweet['retweeted_status']['extended_tweet']['full_text']
    else:
        reduced_tweet['full_text'] = tweet['text']

    # Location details are optional at every level: the user may have no 'derived' info,
    # 'derived' may have no 'locations', and the list may be empty. Only index into the
    # first location once all of those are known to exist (the region lookup previously
    # indexed 'locations' without checking it was present).
    derived = user['derived'] if 'derived' in user else {}
    locations = derived['locations'] if 'locations' in derived else []
    if locations:
        location = locations[0]
        if 'country' in location:
            reduced_tweet['country'] = location['country'] # Store country
        if 'region' in location:
            reduced_tweet['region'] = location['region'] # Store region

    if 'retweeted_status' in tweet: # If a retweet (store as nested within same Tweet)
        retweeted_user = tweet['retweeted_status']['user']
        reduced_tweet['retweeted_user'] = {
            'user_id': retweeted_user['id_str'], # Store user ID of retweeted user
            'username': retweeted_user['screen_name'], # Store username
        }

    return reduced_tweet


# Loads in tweets one by one
def load_jsonl(file):
    """Read the JSON Lines file at path *file* and return a list of reduced tweets.

    The file is opened in binary mode and parsed with ``json_lines.reader(...,
    broken=True)`` (presumably so that malformed lines are tolerated - confirm
    against the json_lines documentation). Each tweet is cut down with
    ``_reduce_tweet`` so that only the fields needed later are kept in memory.

    Raises ``KeyError`` if a tweet lacks one of the always-required fields
    (created_at, id_str, text, user.screen_name, user.id_str).
    """
    with open(file, 'rb') as f:
        return [_reduce_tweet(tweet) for tweet in json_lines.reader(f, broken=True)]

# Load specific file and print one tweet to check it loaded in fine
raw_jsonl_path = '../Data/twitter_covid_feb.jsonl'
small_jsonl_path = '../Data/twitter_covid_feb_small.jsonl'
small_csv_path = '../Data/twitter_covid_feb_small.csv'

tweets = load_jsonl(raw_jsonl_path)
print(tweets[10])

# Save the reduced tweets
# Note: despite the .jsonl extension, json.dump() writes the whole list as a single JSON
# array rather than one object per line.
import json
with open(small_jsonl_path, 'w') as outfile:
    json.dump(tweets, outfile)

# Convert file to data frame (re-read from disk, so this also checks the saved file parses)
import pandas as pd
with open(small_jsonl_path, 'r') as f:
    data = json.load(f)
df = pd.DataFrame(data) # One row per tweet
pd.set_option('display.max_columns', None) # So can view all columns
df.head(1) # Check has worked (only displays when run interactively)

# Save as csv file
df.to_csv(small_csv_path)


# twitter api

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun  8 13:59:44 2020

@author: Mark Green
"""

# Define details of our account
# The API key and secret are read from environment variables instead of being written into
# the script, so they are not exposed to everyone who can read the source.
# NOTE(review): the key/secret that used to be hard-coded here have been shared along with
# this file and should be revoked/regenerated in the Twitter developer portal.
import os

try:
    API_KEY = os.environ['TWITTER_API_KEY']
    API_SECRET_KEY = os.environ['TWITTER_API_SECRET_KEY']
except KeyError as missing:
    # Fail here with a clear message rather than later with an authentication error.
    raise RuntimeError(
        f'Missing environment variable {missing}: set TWITTER_API_KEY and '
        'TWITTER_API_SECRET_KEY before running this script.'
    ) from None
DEV_ENVIRONMENT_LABEL = 'datacollection'
API_SCOPE = 'fullarchive'  # 'fullarchive' for full archive, '30day' for last 31 days

# Define search terms
# Keywords are grouped by theme and OR-ed together inside one bracketed clause, then
# restricted to English-language tweets from the UK (tweet place or profile country GB).
# NOTE(review): unlike the other multi-word terms, 'Kung Flu' is not wrapped in double
# quotes, so it is presumably not matched as an exact phrase - confirm this is intended.
_GENERAL_TERMS = [
    'covid-19', 'Corona', 'Coronavirus', '"The virus"', 'Covid', 'Covid19', 'Rona',
    'C19', 'COV-19', 'Corona-19', 'CV19', 'CV-19', 'CV', 'NCov', '2019nCov',
    'SARS-CoV-2', 'Coronaoutbreak', 'Coronaapocalypse',
]
_MISSPELLINGS = [
    'Carona', 'Codvid', 'Coronavid', 'Corono', 'Coron', 'Covit', 'Curona', 'Corrona',
    'Covd', 'Korona', 'koronavirus', 'Corvid', 'corvid-19',
]
_UK_SPECIFIC_TERMS = [
    'covid_19uk', 'covid-19uk', 'Briefing_COVID19', 'coronavirusuk', 'COVID_19uk',
    'covid19uk', 'CoronavirusBillUK', 'UKCoronavirusBill', 'uklockdown',
]
_DEROGATORY_TERMS = [
    '"Chinese virus"', 'chinesevirus', '"Wuhan virus"', 'wuhanvirus', 'Boomerremover',
    'Covididiot', 'Covididiots', 'covidiot', 'covidiots', 'Kung Flu',
]
_ALL_TERMS = _GENERAL_TERMS + _MISSPELLINGS + _UK_SPECIFIC_TERMS + _DEROGATORY_TERMS

# Max 1024 characters
SEARCH_QUERY = '(' + ' OR '.join(_ALL_TERMS) + ') lang:en (place_country:GB OR profile_country:GB)'

# SEARCH_QUERY = 'from:snopes' # For specific Twitter users

RESULTS_PER_CALL = 500  # 100 for sandbox, 500 for paid tiers

# Date window, format YYYY-MM-DD HH:MM (hour and minutes optional)
# E.g. TO_DATE = 2020-02-14 00:00 starts at Feb 13 23:59:59 +0000 2020 back
# FROM_DATE = 2020-02-08 00:00 ends at Feb 08 00:00:00 +0000 2020
FROM_DATE = '2020-03-02 00:00'
TO_DATE = '2020-03-10 00:00'

MAX_RESULTS = 100_000_000  # Number of Tweets you want to collect e.g. 1000000

# Last downloaded 1st March (~81k)

# Search 1: Covid-related terms

# General terms
# covid-19 OR Corona OR Coronavirus OR The virus OR Covid OR Covid19 OR Rona OR C19 OR COV-19 OR Corona-19 OR CV19 OR 
# CV-19 OR CV OR NCov OR 2019nCov OR SARS-CoV-2 OR Coronaoutbreak OR Coronaapocalypse

# Misspellings 
# Carona OR Codvid OR Coronavid OR Corono OR Coron OR Covit OR Curona OR Corrona OR Covd OR Korona OR koronavirus OR
# Corvid OR corvid-19 OR

# UK specific
# covid_19uk OR covid-19uk OR Briefing_COVID19 OR coronavirusuk OR COVID_19uk OR covid19uk OR CoronavirusBillUK
# UKCoronavirusBill OR uklockdown

# Derogatory terms or discrimination 
# Chinese virus OR chinesevirus OR Wuhan virus OR wuhanvirus OR Boomerremover OR Covididiot OR Kung Flu

# Search 2: Public Health / policy / treatment terms 

# Testing OR trace OR PPE OR ppeshortage OR Antibody OR antibodies OR Stay alert OR control the virus OR save lives OR stayalert OR
# Stay at home OR stayathome OR stayhome OR Lockdown OR uklockdown OR Quarantine OR quarentine OR quarantaine OR Isolation OR iso OR
# Self isolate OR Wash your hands OR washyourhands OR washurhands OR Social distancing OR socialdistancing OR social distance OR
# SocialDistancingNow OR Shielding OR sheltering OR Lockdown OR lockdown fatigue OR Mask OR n95 OR hydroxychloroquine OR
# clapforcarers

# Specific searches
# lang:en # Only English tweets
# (place_country:GB OR profile_country:GB) # Only UK tweets

# Define where Tweets should be saved
FILENAME = '../Data/twitter_covid_mar.jsonl'

# Print update for every X tweets downloaded
PRINT_AFTER_X = 500

# Define YAML with key details for accessing Twitter API
# The endpoint is built from API_SCOPE and DEV_ENVIRONMENT_LABEL so that changing either
# setting actually changes the URL that is queried (it was previously a fixed string, which
# made those two settings have no effect).
# NOTE(review): twitter_keys.yaml holds the credentials in plain text - keep it out of
# version control.
import yaml
config = dict(
    search_tweets_api=dict(
        account_type='premium',
        endpoint=f"https://api.twitter.com/1.1/tweets/search/{API_SCOPE}/{DEV_ENVIRONMENT_LABEL}.json",
        consumer_key=API_KEY,
        consumer_secret=API_SECRET_KEY
    )
)

with open('twitter_keys.yaml', 'w') as config_file:
    yaml.dump(config, config_file, default_flow_style=False)

## We can test searches in the sandbox environment without adding to monthly count here
# from searchtweets import collect_results, gen_rule_payload
# rule = gen_rule_payload("beyonce", results_per_call=100) # define search terms
# tweets = collect_results(rule, max_results=100) # Collect tweets
# [print(tweet.all_text, end='\n\n') for tweet in tweets[0:10]]; # Print
    
import json
from searchtweets import load_credentials, gen_rule_payload, ResultStream

# Load the premium search credentials/endpoint from the YAML file
# (env_overwrite=False: values are taken from the file as given)
premium_search_args = load_credentials(
    "twitter_keys.yaml",
    yaml_key="search_tweets_api",
    env_overwrite=False,
)

# Build the request payload from the search terms and date window defined earlier
rule = gen_rule_payload(
    SEARCH_QUERY,
    results_per_call=RESULTS_PER_CALL,
    from_date=FROM_DATE,
    to_date=TO_DATE,
)

# Stream tweets rather than download in one go
rs = ResultStream(
    rule_payload=rule,
    max_results=MAX_RESULTS,
    **premium_search_args,
)

# Access API and append each tweet as a single line of the JSON lines file
# (append mode, so re-running adds to an existing file rather than replacing it)
with open(FILENAME, 'a', encoding='utf-8') as f:
    n = 0
    for n, tweet in enumerate(rs.stream(), start=1):
        # Progress update every PRINT_AFTER_X tweets
        if n % PRINT_AFTER_X == 0:
            print(f"{n}: {tweet['created_at']}")
        json.dump(tweet, f)
        f.write('\n')
print('done')

# Classify Misinformation

In [None]:
#### Classifying tweets as misinformation ###

# Libraries
# These were not loaded anywhere in this script although Part 5 relies on them.
library(tm) # Corpus(), DataframeSource(), tm_map(), removePunctuation, removeWords, stopwords()
library(dplyr) # Provides the %>% pipe

# Load data
jan <- read.csv("./Data/twitter_covid_jan_small_cleaned.csv", header = TRUE) # January
feb <- read.csv("./Data/twitter_covid_feb_small_cleaned.csv", header = TRUE) # February
mar <- read.csv("./Data/twitter_covid_mar_small_cleaned.csv", header = TRUE) # March
lockdown <- read.csv("./Data/twitter_covid_lockdown_mar2020_small_cleaned.csv", header = TRUE) # Lockdown period, March 2020

# Tidy
# rbind() needs every data frame to have the same columns; the lockdown file presumably
# lacks these three index-style columns that the monthly files carry, so add them as
# missing (TODO confirm).
lockdown$X.3 <- NA
lockdown$X.2 <- NA
lockdown$X.1 <- NA

# Join together into one data frame covering all periods
all_months <- rbind(jan, feb, mar, lockdown)
#backup <- all_months # As takes ages to load!
rm(jan, feb, mar, lockdown)

## Part 1: Not true but not false misinformation ##

# Misinformation lookup
# Builds three lookup tables from one file, each pairing a key (short Twitter URL, full
# original URL, or tweet ID) with the not_true_not_false indicator.
lkup <- read.csv("./Data/not_true_not_false.csv", header = TRUE)
lkup$twitter_url <-gsub("\\?amp=1.*","",lkup$twitter_url) # Strip "?amp=1" and everything after it from the url
# Save short links as lookup
lkup_urls_short <- lkup[,c("twitter_url", "not_true_not_false")] # Keep required vars
lkup_urls_short <- lkup_urls_short[lkup_urls_short$twitter_url != "",] # Drop rows with no short url
# Save full links as lookup
lkup_urls_full <- lkup[,c("original_url", "not_true_not_false")] # Keep required vars
lkup_urls_full <- lkup_urls_full[lkup_urls_full$original_url != "",] # Drop rows with no full url
# Save tweets as separate lookup
lkup_tw <- lkup[,c("tweet_id", "not_true_not_false")] # Keep required vars
lkup_tw <- lkup_tw[!is.na(lkup_tw$tweet_id),] # Drop rows with no tweet ID
rm(lkup)

# Classify tweets

# By Tweets
# The tweet-ID lookup is joined three times: against the tweet's own ID, the ID of the
# tweet it retweets, and the ID of the tweet it quotes. Each join yields a temporary 0/1
# flag; the three are then combined into not_true_not_false_tweet.
# NOTE(review): merge() returns the key column first and may reorder rows, so each join
# changes the column order of all_months.
# NOTE(review): tweet IDs are 64-bit integers, while R numerics hold integers exactly only
# up to 2^53; if IDs are read or converted as numeric, distinct IDs may collide or fail to
# match - confirm how the ID columns are stored.

# User tweets
all_months <- merge(all_months, lkup_tw, by.x = "id", by.y = "tweet_id", all.x = TRUE) # Left join: keep every tweet, add indicator where the ID matches
all_months$not_true_not_false[is.na(all_months$not_true_not_false)] <- 0 # Recode variable as 0 if not a match
all_months$not_true_not_false_tweet1 <- all_months$not_true_not_false # Copy under a join-specific name
all_months$not_true_not_false <- NULL # Drop so the next merge can add the column again

# Retweeted tweets
all_months$rt_tweet_id <- as.numeric(all_months$rt_tweet_id) # Recode as numeric as stored as 'list' (NOTE(review): if the column is a factor this returns level codes, not IDs - confirm)
all_months <- merge(all_months, lkup_tw, by.x = "rt_tweet_id", by.y = "tweet_id", all.x = TRUE)
all_months$not_true_not_false[is.na(all_months$not_true_not_false)] <- 0
all_months$not_true_not_false_tweet2 <- all_months$not_true_not_false
all_months$not_true_not_false <- NULL

# Quoted tweets
all_months$qt_tweet_id <- as.numeric(all_months$qt_tweet_id)
all_months <- merge(all_months, lkup_tw, by.x = "qt_tweet_id", by.y = "tweet_id", all.x = TRUE)
all_months$not_true_not_false[is.na(all_months$not_true_not_false)] <- 0
all_months$not_true_not_false_tweet3 <- all_months$not_true_not_false
all_months$not_true_not_false <- NULL

# Combine together into single variable (1 if any of the three joins matched)
all_months$not_true_not_false_tweet <- 0
all_months$not_true_not_false_tweet[all_months$not_true_not_false_tweet1 == 1 | 
                                      all_months$not_true_not_false_tweet2 == 1 | 
                                      all_months$not_true_not_false_tweet3 == 1] <- 1
all_months$not_true_not_false_tweet1 <- NULL
all_months$not_true_not_false_tweet2 <- NULL
all_months$not_true_not_false_tweet3 <- NULL

table(all_months$not_true_not_false_tweet) # n=1778

# By URL Links
#
# Flags tweets whose URLs appear in the not-true-not-false lookups. Every URL column is
# joined against the matching lookup (short links against lkup_urls_short, expanded links
# against lkup_urls_full), giving one temporary 0/1 flag per column; the flags are then
# combined into not_true_not_false_url and removed.

# Lookup keys as character so they are compared as text with the tweet URL columns
lkup_urls_short$twitter_url <- as.character(lkup_urls_short$twitter_url)
lkup_urls_full$original_url <- as.character(lkup_urls_full$original_url)

# One entry per URL column to check: the column in all_months, the lookup to join, the key
# column of that lookup, and the name of the temporary flag to create. The joins run in
# this order.
url_checks <- list(
  # Links extracted from tweets
  list(col = "url1", lkup = lkup_urls_short, key = "twitter_url", flag = "not_true_not_false_url1"),
  list(col = "url2", lkup = lkup_urls_short, key = "twitter_url", flag = "not_true_not_false_url2"),
  list(col = "url3", lkup = lkup_urls_short, key = "twitter_url", flag = "not_true_not_false_url3"),
  # URLs extracted from Twitter's raw tweet information
  list(col = "rt_url", lkup = lkup_urls_short, key = "twitter_url", flag = "not_true_not_false_rturl"), # Retweeted short URL
  list(col = "rt_expanded_url", lkup = lkup_urls_full, key = "original_url", flag = "not_true_not_false_rturlfull"), # Retweeted full URL
  list(col = "qt_url", lkup = lkup_urls_short, key = "twitter_url", flag = "not_true_not_false_qturl"), # Quoted short URL
  list(col = "qt_expanded_url", lkup = lkup_urls_full, key = "original_url", flag = "not_true_not_false_qturlfull") # Quoted full URL
)

for (chk in url_checks) {
  all_months[[chk$col]] <- as.character(all_months[[chk$col]]) # Change to same type (char) as the lookup key
  all_months <- merge(all_months, chk$lkup, by.x = chk$col, by.y = chk$key, all.x = TRUE) # Left join on lookup
  all_months$not_true_not_false[is.na(all_months$not_true_not_false)] <- 0 # Recode variable as 0 if not a match
  all_months[[chk$flag]] <- all_months$not_true_not_false # Copy under a column-specific name
  all_months$not_true_not_false <- NULL # Drop so the next merge can add the column again
}

# Combine into single variable (1 if any URL column matched)
url_flags <- vapply(url_checks, function(chk) chk$flag, character(1))
any_url_match <- Reduce(`|`, lapply(url_flags, function(flag) all_months[[flag]] == 1))
all_months$not_true_not_false_url <- 0
all_months$not_true_not_false_url[any_url_match] <- 1
table(all_months$not_true_not_false_url) # n=90

# Tidy up: remove the temporary flags by name, so the right columns are dropped whatever
# the number or order of columns in the input data
all_months[url_flags] <- NULL
rm(url_checks, chk, url_flags, any_url_match)

# Extract not true not false tweets
# Keeps every tweet flagged either through a URL match or through a tweet-ID match and
# writes the subset to disk.
ntnf_tweets <- all_months[all_months$not_true_not_false_url == 1 | all_months$not_true_not_false_tweet == 1,]
ntnf_tweets$rt_username <- as.character(ntnf_tweets$rt_username) # Else will not save as csv
ntnf_tweets$qt_username <- as.character(ntnf_tweets$qt_username)
write.csv(ntnf_tweets, "./Data/ntnf_tweets.csv")


## Part 2: False misinformation ##

# Misinformation lookup
# Same three lookup tables as in Part 1 (short URL, full URL, tweet ID), this time built
# from the false-information file and carrying the indicator under the name false_urls.
lkup <- read.csv("./Data/false_info.csv", header = TRUE)
lkup$false_urls <- lkup$false_info # Copy the indicator under the name used for the joins below
lkup$twitter_url <-gsub("\\?amp=1.*","",lkup$twitter_url) # Strip "?amp=1" and everything after it from the url
# Save short links as lookup
lkup_urls_short <- lkup[,c("twitter_url", "false_urls")] # Keep required vars
lkup_urls_short <- lkup_urls_short[lkup_urls_short$twitter_url != "",] # Drop rows with no short url
# Save full links as lookup
lkup_urls_full <- lkup[,c("original_url", "false_urls")] # Keep required vars
lkup_urls_full <- lkup_urls_full[lkup_urls_full$original_url != "",] # Drop rows with no full url
# Save tweets as separate lookup
lkup_tw <- lkup[,c("tweet_id", "false_urls")] # Keep required vars
lkup_tw <- lkup_tw[!is.na(lkup_tw$tweet_id),] # Drop rows with no tweet ID
rm(lkup)

# Classify tweets

# By Tweets
# The tweet-ID lookup is joined three times: against the tweet's own ID, the ID of the
# tweet it retweets, and the ID of the tweet it quotes. Each join yields a temporary 0/1
# flag; the three are then combined into false_urls_tweet.
# NOTE(review): tweet IDs are 64-bit integers, while R numerics hold integers exactly only
# up to 2^53; if IDs are read or converted as numeric, distinct IDs may collide or fail to
# match - confirm how the ID columns are stored.

# User tweets
all_months <- merge(all_months, lkup_tw, by.x = "id", by.y = "tweet_id", all.x = TRUE) # Left join: keep every tweet, add indicator where the ID matches
all_months$false_urls[is.na(all_months$false_urls)] <- 0 # Recode variable as 0 if not a match
all_months$false_urls_tweet1 <- all_months$false_urls # Copy under a join-specific name
all_months$false_urls <- NULL # Drop so the next merge can add the column again

# Retweeted tweets
all_months$rt_tweet_id <- as.numeric(all_months$rt_tweet_id) # Recode as numeric as stored as 'list'
all_months <- merge(all_months, lkup_tw, by.x = "rt_tweet_id", by.y = "tweet_id", all.x = TRUE)
all_months$false_urls[is.na(all_months$false_urls)] <- 0
all_months$false_urls_tweet2 <- all_months$false_urls
all_months$false_urls <- NULL

# Quoted tweets
all_months$qt_tweet_id <- as.numeric(all_months$qt_tweet_id)
all_months <- merge(all_months, lkup_tw, by.x = "qt_tweet_id", by.y = "tweet_id", all.x = TRUE)
all_months$false_urls[is.na(all_months$false_urls)] <- 0
all_months$false_urls_tweet3 <- all_months$false_urls
all_months$false_urls <- NULL

# Combine together into single variable (1 if any of the three joins matched)
all_months$false_urls_tweet <- 0
all_months$false_urls_tweet[all_months$false_urls_tweet1 == 1 | 
                                      all_months$false_urls_tweet2 == 1 | 
                                      all_months$false_urls_tweet3 == 1] <- 1
all_months$false_urls_tweet1 <- NULL
all_months$false_urls_tweet2 <- NULL
all_months$false_urls_tweet3 <- NULL

table(all_months$false_urls_tweet) # 743

# By URL Links
#
# Flags tweets whose URLs appear in the false-information lookups. Every URL column is
# joined against the matching lookup (short links against lkup_urls_short, expanded links
# against lkup_urls_full), giving one temporary 0/1 flag per column; the flags are then
# combined into false_url and removed.

# Lookup keys as character so they are compared as text with the tweet URL columns
lkup_urls_short$twitter_url <- as.character(lkup_urls_short$twitter_url)
lkup_urls_full$original_url <- as.character(lkup_urls_full$original_url)

# One entry per URL column to check: the column in all_months, the lookup to join, the key
# column of that lookup, and the name of the temporary flag to create. The joins run in
# this order.
url_checks <- list(
  # Links extracted from tweets
  list(col = "url1", lkup = lkup_urls_short, key = "twitter_url", flag = "false_urls_url1"),
  list(col = "url2", lkup = lkup_urls_short, key = "twitter_url", flag = "false_urls_url2"),
  list(col = "url3", lkup = lkup_urls_short, key = "twitter_url", flag = "false_urls_url3"),
  # URLs extracted from Twitter's raw tweet information
  list(col = "rt_url", lkup = lkup_urls_short, key = "twitter_url", flag = "false_urls_rturl"), # Retweeted short URL
  list(col = "rt_expanded_url", lkup = lkup_urls_full, key = "original_url", flag = "false_urls_rturlfull"), # Retweeted full URL
  list(col = "qt_url", lkup = lkup_urls_short, key = "twitter_url", flag = "false_urls_qturl"), # Quoted short URL
  list(col = "qt_expanded_url", lkup = lkup_urls_full, key = "original_url", flag = "false_urls_qturlfull") # Quoted full URL
)

for (chk in url_checks) {
  all_months[[chk$col]] <- as.character(all_months[[chk$col]]) # Change to same type (char) as the lookup key
  all_months <- merge(all_months, chk$lkup, by.x = chk$col, by.y = chk$key, all.x = TRUE) # Left join on lookup
  all_months$false_urls[is.na(all_months$false_urls)] <- 0 # Recode variable as 0 if not a match
  all_months[[chk$flag]] <- all_months$false_urls # Copy under a column-specific name
  all_months$false_urls <- NULL # Drop so the next merge can add the column again
}

# Combine into single variable (1 if any URL column matched)
url_flags <- vapply(url_checks, function(chk) chk$flag, character(1))
any_url_match <- Reduce(`|`, lapply(url_flags, function(flag) all_months[[flag]] == 1))
all_months$false_url <- 0
all_months$false_url[any_url_match] <- 1
table(all_months$false_url) # n=208

# Tidy up: remove the temporary flags by name, so the right columns are dropped whatever
# the number or order of columns in the input data
all_months[url_flags] <- NULL
rm(url_checks, chk, url_flags, any_url_match)

# Extract false-information tweets
# Keeps every tweet flagged either through a URL match or through a tweet-ID match and
# writes the subset to disk.
# NOTE(review): this reuses (and overwrites) the ntnf_tweets object from Part 1 even though
# it now holds the false-information subset.
ntnf_tweets <- all_months[all_months$false_url == 1 | all_months$false_urls_tweet == 1,]
ntnf_tweets$rt_username <- as.character(ntnf_tweets$rt_username) # Else will not save as csv
ntnf_tweets$qt_username <- as.character(ntnf_tweets$qt_username)
write.csv(ntnf_tweets, "./Data/false_tweets.csv")


## Part 3: Classifying URL website source ##

# We can only do this via the expanded URLs for RTs and QTs unfortunately as main tweets only have t.co links saved

# Load list of websites associated with fake news
lkup_web <- read.csv("./Data/websites.csv") 
lkup_web$website <- tolower(lkup_web$website) # Convert to lower case 
lkup_web$fake_website <- 1 # Indicator that is carried across by the joins below

# NOTE(review): only the lookup side is lower-cased here; rt_website / qt_website are joined
# as they are, so differently-cased values would not match - confirm they are already lower case.

# Retweets
all_months <- merge(all_months, lkup_web, by.x = "rt_website", by.y = "website", all.x = TRUE) # Left join on misinformation websites
all_months$fake_website[is.na(all_months$fake_website)] <- 0 # Recode variable as 0 if not a match
all_months$fake_website_rt <- all_months$fake_website # Copy under a join-specific name
all_months$fake_website <- NULL # Drop so the next merge can add the column again

# Quoted retweets
all_months <- merge(all_months, lkup_web, by.x = "qt_website", by.y = "website", all.x = TRUE) # Left join on misinformation websites
all_months$fake_website[is.na(all_months$fake_website)] <- 0 # Recode variable as 0 if not a match
all_months$fake_website_qt <- all_months$fake_website # Copy under a join-specific name
all_months$fake_website <- NULL # Drop variable as no longer needed

table(all_months$fake_website_rt) # n=4035
table(all_months$fake_website_qt) # n=210

# Extract tweets whose retweeted or quoted link points to a listed website
fweb_tweets <- all_months[all_months$fake_website_rt == 1 | all_months$fake_website_qt == 1,]
fweb_tweets$rt_username <- as.character(fweb_tweets$rt_username) # Else will not save as csv
fweb_tweets$qt_username <- as.character(fweb_tweets$qt_username)
write.csv(fweb_tweets, "./Data/fake_web_tweets.csv")


## Part 4: Identifying accounts associated with active spread of misinformation ##

# Load lookup of usernames
lkup_acc <- read.csv("./Data/misinformation_users.csv", header = TRUE)
lkup_acc$misinfo_user <- 1 # Indicator that is carried across by the joins below

# The lookup is joined three times: against the tweeting user, the retweeted user and the
# quoted user. Each join yields a temporary 0/1 flag; the three are then combined into
# misinfo_user.

# User tweets
all_months <- merge(all_months, lkup_acc, by.x = "username", by.y = "username", all.x = TRUE) # Left join on misinformation accounts
all_months$misinfo_user[is.na(all_months$misinfo_user)] <- 0 # Recode variable as 0 if not a match
all_months$misinfo_user_tweet <- all_months$misinfo_user # Copy under a join-specific name
all_months$misinfo_user <- NULL # Drop so the next merge can add the column again

# Retweeted tweets
#all_months$rt_username <- as.character(all_months$rt_username)
all_months <- merge(all_months, lkup_acc, by.x = "rt_username", by.y = "username", all.x = TRUE)
all_months$misinfo_user[is.na(all_months$misinfo_user)] <- 0
all_months$misinfo_user_rt <- all_months$misinfo_user
all_months$misinfo_user <- NULL

# Quoted tweets
all_months <- merge(all_months, lkup_acc, by.x = "qt_username", by.y = "username", all.x = TRUE)
all_months$misinfo_user[is.na(all_months$misinfo_user)] <- 0
all_months$misinfo_user_qt <- all_months$misinfo_user
all_months$misinfo_user <- NULL

# Combine together into single variable (1 if any of the three joins matched)
all_months$misinfo_user <- 0
all_months$misinfo_user[all_months$misinfo_user_tweet == 1 | 
                              all_months$misinfo_user_rt == 1 | 
                              all_months$misinfo_user_qt == 1] <- 1
all_months$misinfo_user_tweet <- NULL
all_months$misinfo_user_rt <- NULL
all_months$misinfo_user_qt <- NULL

table(all_months$misinfo_user) # n=4905

# Extract tweets written by, retweeting or quoting a listed account
misinfo_user_tweets <- all_months[all_months$misinfo_user == 1,]
misinfo_user_tweets$rt_username <- as.character(misinfo_user_tweets$rt_username) # Else will not save as csv
misinfo_user_tweets$qt_username <- as.character(misinfo_user_tweets$qt_username)
write.csv(misinfo_user_tweets, "./Data/misinfo_user_tweets.csv")


## Part 5: Identifying tweets matching WHO keywords associated with misinformation ##

# Convert data to a corpus
hold <- all_months[,c("id", "full_text_cleaned")] # Subset required information
# DataframeSource requires the columns to be named doc_id and text
names(hold)[names(hold) == "id"] <- "doc_id"
names(hold)[names(hold) == "full_text_cleaned"] <- "text"
corpus <- Corpus(DataframeSource(hold)) # Convert
rm(hold)

# Tidy up corpus
# Numbers are deliberately NOT removed, as we want to be able to identify '5G'
corpus <- corpus %>%
  tm_map(removePunctuation) %>% # Remove punctuation
  tm_map(removeWords, stopwords('en')) # Remove stop words that have little meaning e.g. and, the, of etc

# Convert corpus to document term matrix
# By default tm only keeps terms of at least three characters, which would
# silently discard two-character terms such as '5g'. Lower the minimum length
# to two so that the terms we kept the numbers for actually reach the matrix.
dtm <- DocumentTermMatrix(corpus, control = list(wordLengths = c(2, Inf)))
rm(corpus)

# Tidy data (one row per document-term pair)
dtm_td <- tidy(dtm)
rm(dtm)

# Load lookup and add onto list of terms (terms that are not keywords get NA in the lookup columns)
lkup_who <- read.csv("./Data/who_keywords.csv")
dtm_td <- merge(dtm_td, lkup_who, by.x = "term", by.y = "keyword", all.x = TRUE)

# Create 0/1 variables for whether the term matches each of the types of misinformation
# (%in% treats the NA of a non-matching term as "no match")
dtm_td$cause <- as.numeric(dtm_td$form %in% "cause")
dtm_td$transmission <- as.numeric(dtm_td$form %in% "transmission")
dtm_td$treatment <- as.numeric(dtm_td$form %in% "treatment")

# Aggregate up to tweet id so can be rejoined back onto main dataset
# Each value is the number of distinct matching terms in the tweet, so it can be greater than 1
dtm_td <- data.table(dtm_td)
lkup_who_tweets <- dtm_td[, list(cause = sum(cause), transmission = sum(transmission), treatment = sum(treatment)), by = document]
names(lkup_who_tweets)[names(lkup_who_tweets) == "document"] <- "id"
write.csv(lkup_who_tweets, "./Data/lookup_who_terms_tweets.csv")


## Part 6: Iterating through retweets of retweets ##

# Taking all of the tweets that we identified as talking about misinformation, we pull out the
# subsequent retweets of them, then the retweets of those, and so on, to give us the cascading
# discussion of tweets (snowballing data collection).

# Columns shared by every misinformation lookup
lookup_cols <- c("id", "misinformation", "not_true_not_false", "false", "cause", "transmission", "treatment")

# Load a file of flagged tweets and reduce it to the shared lookup columns.
#   path       - csv file containing at least an 'id' column
#   ntnf       - value stored in 'not_true_not_false' (NA when this source does not classify it)
#   false_flag - value stored in 'false' (NA when this source does not classify it)
# Returns a data frame with the columns listed in lookup_cols.
read_flagged_tweets <- function(path, ntnf = NA, false_flag = NA) {
  flagged <- read.csv(path)
  flagged$misinformation <- 1
  flagged$not_true_not_false <- ntnf
  flagged$false <- false_flag
  flagged$cause <- NA
  flagged$transmission <- NA
  flagged$treatment <- NA
  flagged[, lookup_cols]
}

# Load a WHO keyword lookup and keep only the tweets with at least one keyword match.
#   path - csv file with 'id', 'cause', 'transmission' and 'treatment' columns
# Returns the matching rows with the remaining lookup columns added.
read_who_matches <- function(path) {
  who <- read.csv(path)
  # The lookup written in Part 5 holds the NUMBER of matching terms per tweet, so a tweet
  # can score 2 or more. Testing '== 1' dropped such tweets; '>= 1' keeps every match.
  # NOTE(review): the lockdown file is produced elsewhere - presumably in the same format.
  who <- who[who$cause >= 1 | who$transmission >= 1 | who$treatment >= 1, ]
  who["X"] <- NULL # Drop variable as not needed
  who$misinformation <- 1
  who$not_true_not_false <- NA
  who$false <- NA
  who
}

# Combine all sources into one lookup (rbind matches data frame columns by name)
lkup <- rbind(
  read_flagged_tweets("./Data/ntnf_tweets.csv", ntnf = 1),
  read_flagged_tweets("./Data/false_tweets.csv", false_flag = 1),
  read_flagged_tweets("./Data/fake_web_tweets.csv"),
  read_flagged_tweets("./Data/misinfo_user_tweets.csv"),
  read_who_matches("./Data/lookup_who_terms_tweets.csv"),
  read_who_matches("./Data/lookup_who_terms_tweets_lockdown.csv")
)
write.csv(lkup, "./Data/lookup_all_misinformation.csv") # Save

# Subset tweets that match ids
all_months <- all_months[,c("id", "rt_tweet_id", "qt_tweet_id")] # Only these variables are needed, which makes the joins quicker

# Find the tweets in all_months that retweet or quote any tweet listed in 'parents'.
#   parents - data frame with an 'id' column plus the misinformation variables
# Returns one row per matching retweet/quote, carrying the parent's misinformation
# variables and with 'id' now holding the id of the retweeting/quoting tweet.
find_children <- function(parents) {
  # Retweets: rt_tweet_id of the child equals the id of the parent.
  # merge() names the key 'id' and the child's own id 'id.y'.
  via_rt <- merge(parents, all_months, by.x = "id", by.y = "rt_tweet_id")
  via_rt$id <- NULL # Drop the parent id ...
  via_rt$qt_tweet_id <- NULL
  names(via_rt)[names(via_rt) == "id.y"] <- "id" # ... and keep the child id in its place

  # Quoted retweets: same again, keyed on qt_tweet_id
  via_qt <- merge(parents, all_months, by.x = "id", by.y = "qt_tweet_id")
  via_qt$id <- NULL
  via_qt$rt_tweet_id <- NULL
  names(via_qt)[names(via_qt) == "id.y"] <- "id"

  rbind(via_rt, via_qt)
}

# Level 1: retweets/quotes of the tweets identified as misinformation
snowball <- find_children(lkup)

# Levels 2 and 3: retweets/quotes of the previous level
# NOTE(review): the cascade is followed for a fixed three levels, not until it is exhausted
hold <- find_children(snowball)
hold2 <- find_children(hold)

# Join all levels together
snowball <- rbind(snowball, hold, hold2)

# Save
write.csv(snowball, "./Data/lookup_snowballed_misinformation.csv")


# Role of bots

In [None]:
#################################
##### Identify role of bots #####
#################################

# Purpose: Explore the extent of bots in our data and what they are tweeting about.

# Libraries
library(tidytext)
library(tm)
library(ggplot2)
# The packages below provide functions this script calls but previously never attached
library(dplyr) # %>%, group_by, summarise, count, top_n, mutate
library(tidyr) # spread
library(reshape2) # acast
library(wordcloud) # comparison.cloud

## Load in Twitter data and tidy ##

# Load data
jan <- read.csv("./Data/twitter_covid_jan_small_cleaned.csv", header = TRUE) # January
jan$X.2 <- NULL # March does not have this variable so drop (rbind needs matching columns)
feb <- read.csv("./Data/twitter_covid_feb_small_cleaned.csv", header = TRUE) # February
feb$X.2 <- NULL
mar <- read.csv("./Data/twitter_covid_mar_small_cleaned.csv", header = TRUE) # March

# Join together
all_months <- rbind(jan, feb, mar)
rm(jan, feb, mar)

# Hold date for later
timestamp <- all_months[,c("id", "created_at")]
timestamp$created_at <- as.POSIXct(timestamp$created_at, format = "%a %b %d %H:%M:%S %z %Y", tz = "GMT")  # Convert to time-date format

# Build a corpus from the cleaned tweet text
hold <- all_months[,c("id", "full_text_cleaned")]
names(hold) <- c("doc_id", "text") # DataframeSource requires exactly these column names
corpus <- Corpus(DataframeSource(hold))
# all_months and hold are deliberately left in memory (all_months is needed again below)

# Tidy up corpus: strip punctuation, then English stop words (and, the, of etc.)
# Numbers are not removed as we want to be able to identify '5G'
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords('en'))

# Document term matrix from the corpus (corpus is kept in memory)
# NOTE(review): tm's default wordLengths is c(3, Inf), so two-character terms such as
# '5g' are presumably dropped here - confirm whether that is intended
dtm <- DocumentTermMatrix(corpus)

# One row per document-term pair (dtm is kept in memory)
dtm_td <- tidy(dtm)



## Load in user-level estimates of whether an account is a bot ##

# The code that generates these estimates is in the script 'bots.R'. It will not run on this machine (trouble installing devtools), so it was run locally instead. The quality of the results is uncertain, but they give us a good starting point. Some values are missing where the code was unable to produce an estimate (likely because the account had been removed - so there is a good chance the NAs are bots).
bots <- read.csv("./Data/predict_bots.csv")

# Quick look at the distribution of the estimated probabilities
ggplot(bots, aes(x = prob_bot)) +
  geom_histogram() +
  labs(x = "Probability user is a bot", y = "Frequency")

# Define bot or not
# We get a probability rather than a binary classification, so we need a cut-off above which a user is treated as a bot. The package creator gives no guidance on this figure; both 0.7 and 0.5 are discussed across the internet. This is something to review. For the example code here we use 0.5 for now (~33.6% of users defined as bots).
# Users with a missing probability stay NA
bots$bot <- ifelse(bots$prob_bot > 0.5, 1, 0)

## Examine trends in bots posting ##

# Attach the bot classification to every tweet via the user id
all_months <- merge(all_months, bots, by = "user_id", all.x = TRUE)
all_months$screen_name <- NULL # Delete repeated columns
all_months$X.y <- NULL

# Count tweets per day, separately for bots and non-bots
all_months$created_at <- as.POSIXct(all_months$created_at, format = "%a %b %d %H:%M:%S %z %Y", tz = "GMT") # Convert to time-date format
all_months$day <- cut(all_months$created_at, breaks = "day") # Day each tweet falls in
all_months$freq <- 1 # Each tweet counts once, so summing gives the number of tweets
day_sum <- aggregate(freq ~ day + bot, data = all_months, FUN = sum)
day_sum$day <- as.Date(day_sum$day, format = '%Y-%m-%d') # cut() returns a factor, convert to date

# Summary statistics: total tweets over the period and percentage by bot (1) or not (0)
tab <- aggregate(freq ~ bot, data = day_sum, FUN = sum)
tab$percent <- prop.table(tab$freq) * 100
tab # So 42% of all tweets from bots but only 33% of users

# Plot daily tweet counts for humans and bots
# The colour scale is pinned to the two levels present in day_sum ("0" and "1"; tweets with
# an unknown bot status are dropped by aggregate), each with exactly one label. The previous
# version supplied a third label ("C") that did not correspond to any level.
day_sum %>%
  ggplot(aes(x = day, y = freq, group = as.factor(bot), color = as.factor(bot))) +
  geom_point() +
  geom_smooth(method = "gam", se = FALSE) + # Add smoothed line on to summarise trend (GAM is better for memory with larger datasets compared to LOESS)
  scale_color_discrete(name = NULL, breaks = c("0", "1"), labels = c("Human", "Bot")) + # Add labels to plot
  ylab("Frequency") +
  xlab("Day")

# We might want to consider better ways of displaying the data, such as logging the frequency so that larger counts do not swamp earlier patterns:
day_sum$log_freq <- log(day_sum$freq)
day_sum %>%
  ggplot(aes(x = day, y = log_freq, group = as.factor(bot), color = as.factor(bot))) +
  geom_point() +
  geom_smooth(method = "gam", se = FALSE) +
  scale_color_discrete(name = NULL, breaks = c("0", "1"), labels = c("Human", "Bot")) +
  ylab("Logged count") +
  xlab("Day")

# Which terms are used by bots and which by non-bots

# Fix the random seed so the plots can be reproduced
set.seed(250388)

# Attach the bot classification to every document-term row
id_lkup <- merge(all_months[,c("id", "user_id")], bots, by = "user_id", all.x = TRUE) # Tweet id -> user id -> bot estimates
dtm_td <- merge(dtm_td, id_lkup, by.x = "document", by.y = "id", all.x = TRUE)

# Total count of each term, split by bot or not
dtm_td_agg <- summarise(group_by(dtm_td, term, bot), count = sum(count))

# Number of document-term rows per term and bot status, for users with a known bot status
term_counts <- dtm_td[!is.na(dtm_td$bot),] %>%
  count(term, bot, sort = TRUE)

# Most common words: top 20 terms for non-bots (0) and bots (1), one panel each
term_counts %>%
  group_by(bot) %>%
  top_n(20) %>%
  ungroup() %>%
  mutate(term = reorder(term, n)) %>%
  ggplot(aes(term, n, fill = factor(bot))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~bot, scales = "free_y") +
  labs(y = "Term frequency",
       x = NULL) +
  coord_flip()

# Word cloud comparing the terms the two groups differ on
term_counts %>%
  acast(term ~ bot, value.var = "n", fill = 0) %>% # Term x bot matrix, 0 where a group never uses the term
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

# We could also compare term frequency by a scatter plot approach
# Tidy data: one row per term, with one column of counts for each group
# fill = 0 gives terms that a group never uses a count of 0 in that group's column.
# Previously only the human column was filled, so terms used only by humans kept
# an NA in the bot column and were dropped from the plot.
terms_spread <- dtm_td[!is.na(dtm_td$bot),] %>%
  count(term, bot, sort = TRUE) %>%
  spread(bot, n, fill = 0)
names(terms_spread)[names(terms_spread) == "1"] <- "bot"
names(terms_spread)[names(terms_spread) == "0"] <- "human"

# Plot
terms_spread %>%
  ggplot(aes(x = human, y = bot)) + # Scatter plot of each term's frequency among humans against bots
  geom_point(alpha = 0.1) +
  coord_cartesian(xlim = c(0,250000), ylim = c(0,250000)) + # Don't display outliers
  xlab("Human Term Frequency") +
  ylab("Bot Term Frequency")
