# Project Big Data Science (May 2020)

In [1]:
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import tweepy
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus; the English list is the one used by the
# preprocessing step further down in this notebook.
nltk.download('stopwords')

#"44.4415,-102.6855,1000km"

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wannesvanleemput/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Twitter API credentials, used for OAuth.
# They are read from environment variables so that no secret is stored in the
# notebook or its version history. Any key pair that was previously committed
# in this cell must be considered leaked and should be revoked and regenerated.
def _require_env(name):
    """Return the value of environment variable `name`, or raise a clear error."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; export your Twitter API "
            "credentials before running this notebook."
        )
    return value


consumer_key = _require_env("TWITTER_CONSUMER_KEY")
consumer_secret = _require_env("TWITTER_CONSUMER_SECRET")
# Only needed for the user-context OAuth flow (commented out below); the
# app-only flow used here does not read them, so they are optional.
access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "")
access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "")

auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
#auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_token_secret)

# Calling the api: we want a lot of data in one run, so we wait for the rate
# limit to reset rather than run into an error.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)
# NOTE(review): a tweepy.API instance is presumably always truthy, so this
# message likely never prints — confirm, and verify credentials explicitly if
# a real check is wanted.
if not api:
    print("Can't authenticate. Check if credentials are correct.")

In [3]:
class HashtagFinder():
    """Count the hashtags that co-occur with a starting hashtag.

    Tweets are fetched through the module-level `api` object with
    `tweepy.Cursor`; every hashtag found in their text is tallied in
    `self.hashtags` unless it is listed in `tagignore`.

    Parameters
    ----------
    starting_hashtag : str
        Hashtag searched for when `collect_tags` is called without argument.
    location_radius : str
        Value passed as the `geocode` search parameter.
    language : str
        Value passed as the `lang` search parameter.
    tagignore : collection of str
        Hashtags (including the leading '#') that must not be counted.
        Matching is case-sensitive.
    """

    def __init__(self, starting_hashtag, location_radius, language, tagignore):
        self.starting_hashtag = starting_hashtag
        self.location_radius = location_radius
        self.language = language
        self.hashtags = {}
        self.tagignore = tagignore

    def collect_tags(self, tag=""):
        """Search up to 1000 tweets for `tag` and count the hashtags they contain.

        An empty `tag` means "use the starting hashtag". Counts accumulate in
        `self.hashtags` across calls.
        """
        if tag == "":
            tag = self.starting_hashtag
        cursor = tweepy.Cursor(api.search,
                               q=tag + " -filter:retweets",
                               geocode=self.location_radius,
                               count=100,
                               lang=self.language,
                               include_rts=False,
                               tweet_mode="extended")
        tweets = [status.full_text for status in cursor.items(1000)]
        self._count_tags(tweets)

    def _count_tags(self, tweets):
        """Add the hashtags of every text in `tweets` to the running tally."""
        for tweet in tweets:
            # A distinct loop name avoids shadowing the `tag` search argument.
            for found in self.hashtags_from_tweet(tweet):
                if found not in self.tagignore:
                    self.hashtags[found] = self.hashtags.get(found, 0) + 1

    def get_hashtags(self):
        """Return the dict mapping each counted hashtag to its number of occurrences."""
        return self.hashtags

    def hashtags_from_tweet(self, tweet):
        """Return every hashtag in `tweet`, in order of appearance, with its '#'.

        A hashtag is a '#' followed by one or more word characters, so
        surrounding punctuation is dropped, text glued in front of the '#' is
        not merged into the tag, adjacent tags ("#a#b") are reported
        separately and a lone '#' yields nothing.
        """
        return ["#" + name for name in re.findall(r"#(\w+)", tweet)]

In [4]:
# TODO: Best to do data mining with specific class once program becomes more complex.
class DataMiner():
    """Collect tweets for the hashtags that co-occur with a starting hashtag.

    A `HashtagFinder` first counts the hashtags used alongside
    `starting_hashtag`; every hashtag seen at least `min_tag_count` times is
    then searched for through the module-level `api` object and the texts of
    the tweets found are gathered, each tweet id being stored only once.

    Parameters
    ----------
    starting_hashtag, location_radius, language
        Forwarded to `HashtagFinder`; `location_radius` and `language` are
        also used as the `geocode` and `lang` parameters of the searches.
    tagignore : list of str, optional
        Hashtags the finder must not count. Defaults to no ignored tags.
    min_tag_count : int, optional
        Minimum number of occurrences for a hashtag to be searched (default 10).
    tweets_per_tag : int, optional
        Maximum number of tweets requested per hashtag (default 500).
    """

    def __init__(self, starting_hashtag, location_radius, language, tagignore=None,
                 min_tag_count=10, tweets_per_tag=500):
        self.location_radius = location_radius
        self.language = language
        self.denial_tweets = []
        self.ids = []
        self.min_tag_count = min_tag_count
        self.tweets_per_tag = tweets_per_tag
        # `None` instead of a mutable `[]` default, so instances never share a list.
        ignored = [] if tagignore is None else tagignore
        self.finder = HashtagFinder(starting_hashtag, location_radius, language, ignored)

    @staticmethod
    def _select_tags(tag_counts, min_count):
        """Return the tags with at least `min_count` occurrences, least frequent first."""
        ranked = sorted(tag_counts.items(), key=lambda item: item[1])
        return [tag for tag, count in ranked if count >= min_count]

    def _add_items(self, items):
        """Store id and text of every item whose id has not been stored before."""
        seen = set(self.ids)
        for item in items:
            # Compare ids, not the status object itself: the object is never
            # an element of the id list, so that test let every duplicate in.
            if item.id in seen:
                continue
            seen.add(item.id)
            self.ids.append(item.id)
            self.denial_tweets.append(item.full_text)

    def _collect_tweets(self):
        """Find the relevant hashtags and download the tweets for each of them."""
        self.finder.collect_tags()
        relevant_tags = self._select_tags(self.finder.get_hashtags(), self.min_tag_count)

        for tag in relevant_tags:
            print("Processing tag: " + tag)
            cursor = tweepy.Cursor(api.search,
                                   q=tag + " -filter:retweets",
                                   geocode=self.location_radius,
                                   count=100,
                                   lang=self.language,
                                   include_rts=False,
                                   tweet_mode="extended")
            self._add_items(list(cursor.items(self.tweets_per_tag)))

    def mine(self):
        """Run the collection and return the list of collected tweet texts."""
        self._collect_tweets()
        return self.denial_tweets

In [5]:
class PreProcessTweets():
    """Class for the preprocessing of tweets.

    Each boolean flag enables one clean-up step: removing hashtags, English
    stop words, hyperlinks, @mentions and punctuation. `preprocess` applies
    the enabled steps and returns the cleaned texts.

    The input list is copied, so the caller's list is never modified.
    """

    def __init__(self,
                 tweets,
                 remove_tags=False,
                 remove_stopwords=False,
                 remove_urls=False,
                 remove_mentions=False,
                 remove_punctuation=False):

        # Private copy: the steps below must not write into the caller's list.
        self.tweets = list(tweets)
        self.remove_tags = remove_tags
        self.remove_stopwords = remove_stopwords
        self.remove_urls = remove_urls
        self.remove_mentions = remove_mentions
        self.remove_punctuation = remove_punctuation

    def _remove_urls(self):
        """ Remove all urls from the tweet text. """
        self.tweets = [re.sub(r'\s?http\S+', "", t) for t in self.tweets]

    def _remove_stopwords(self):
        """ Remove English stopwords from the text, ignoring letter case. """
        sw = set(stopwords.words("english"))
        # The word is lower-cased for the lookup so that capitalised stop
        # words ("The", "And") are removed too — assumes the NLTK list is
        # all lower-case; TODO confirm.
        self.tweets = [" ".join(word for word in c.split() if word.lower() not in sw)
                       for c in self.tweets]

    def _remove_hashtag(self, tag=None):
        """ Remove a specific hashtag. If no tag specified, remove all tags.

        With `tag` given (with or without '#'), only words whose hashtag text
        equals it are dropped; letter case and attached punctuation are
        ignored for the comparison.
        """
        if tag is None:
            def is_dropped(word):
                return "#" in word
        else:
            wanted = re.sub(r'\W+', '', tag).lower()

            def is_dropped(word):
                return "#" in word and re.sub(r'\W+', '', word).lower() == wanted

        self.tweets = [" ".join(word for word in tweet.split() if not is_dropped(word))
                       for tweet in self.tweets]

    def _remove_mentions(self):
        """ Remove all mentions (@user). """
        self.tweets = [re.sub(r'\s?@\S+', "", t) for t in self.tweets]

    def _remove_punctuation(self):
        """ Punctuation affects words: eg. 'however' is not the same word as 'however,'

        Every character that is neither a word character nor whitespace is
        deleted. This also strips the '#' and '@' markers of hashtags and
        mentions that were not removed by an earlier step.
        """
        self.tweets = [re.sub(r'[^\w\s]', "", t) for t in self.tweets]

    def preprocess(self):
        """ Perform the requested steps of the preprocessing and return the texts. """

        if self.remove_tags:
            self._remove_hashtag()

        if self.remove_stopwords:
            self._remove_stopwords()

        if self.remove_urls:
            self._remove_urls()

        if self.remove_mentions:
            self._remove_mentions()

        if self.remove_punctuation:
            self._remove_punctuation()

        return self.tweets

## Tweet data mining

We use `tweepy.Cursor` to search for tweets about the coronavirus within a fixed geographical area of the United States. The search is centred on New York City (with a 1000 km radius), because this region has been affected most by the virus.

In [6]:
NUM_TWEETS = 1000
SEARCH_TERM = "#CoronaHoax -filter:retweets"
location_radius = "40.7282,-73.7949,1000km"
language = "en"
starting_hashtag = "#CoronaHoax"
#ignore generic tags
tagignore = ["#Covid_19", "#coronavirus", "#COVIDãƒ¼19", "#COVID19", "#coronavirusNYC", "#coronavirusoregon", "#lockdown"]

In [None]:
# Mine the denial tweets, starting from the configured hashtag, and report
# how many texts were collected.
miner = DataMiner(starting_hashtag, location_radius, language, tagignore)
denial_tweets = miner.mine()
num_denial_tweets = len(denial_tweets)
print(f"Processed {num_denial_tweets} tweets.")

Processing tag: #COVIDIOTS
Processing tag: #endthelockdown


Rate limit reached. Sleeping for: 32


In [None]:
# Control set, part 1: up to 5000 English, non-retweet results for the query
# "coronavirus" from the same area as the denial tweets. This cell creates
# `control_tweets`.
cursor = tweepy.Cursor(
    api.search,
    q="coronavirus -filter:retweets",
    geocode="40.7282,-73.7949,1000km",
    count=100,
    lang="en",
    include_rts=False,
    tweet_mode="extended",
)
items = list(cursor.items(5000))
print(f"Finished reading {len(items)} items.")
control_tweets = [status.full_text for status in items]

In [None]:
# Control set, part 2: up to 5000 results for the query "covid", appended to
# the existing `control_tweets` list (re-running this cell appends again).
cursor = tweepy.Cursor(
    api.search,
    q="covid -filter:retweets",
    geocode="40.7282,-73.7949,1000km",
    count=100,
    lang="en",
    include_rts=False,
    tweet_mode="extended",
)
items = list(cursor.items(5000))
print(f"Finished reading {len(items)} items.")
control_tweets.extend(status.full_text for status in items)

In [None]:
# Control set, part 3: up to 5000 results for the query "lockdown", appended
# to the existing `control_tweets` list (re-running this cell appends again).
cursor = tweepy.Cursor(
    api.search,
    q="lockdown -filter:retweets",
    geocode="40.7282,-73.7949,1000km",
    count=100,
    lang="en",
    include_rts=False,
    tweet_mode="extended",
)
items = list(cursor.items(5000))
print(f"Finished reading {len(items)} items.")
control_tweets.extend(status.full_text for status in items)

### Create a simple classification model

In [None]:
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Build the training corpus: denial tweets first, control tweets second, all
# cleaned with the same preprocessing steps.
tweets = denial_tweets + control_tweets
cleaning_steps = dict(
    remove_tags=True,
    remove_urls=True,
    remove_stopwords=True,
    remove_mentions=True,
)
preprocessor = PreProcessTweets(list(tweets), **cleaning_steps)
corpus = preprocessor.preprocess()
# Labels follow the order of `tweets`: 0 = denial tweet, 1 = control tweet.
labels = [0 for _ in denial_tweets] + [1 for _ in control_tweets]

In [None]:
def _report_metrics(title, y_true, y_pred):
    """Print accuracy, precision, recall and F1 for one data split and return them.

    Precision, recall and F1 are computed with scikit-learn's default positive
    label (1), i.e. for the control class given the labels used in this notebook.
    """
    a = accuracy_score(y_true, y_pred)
    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    f = f1_score(y_true, y_pred)
    print(title)
    print(f"\t-Accuracy: {a:.3f},\n\t-Precision: {p:.3f}, \n\t-Recall: {r:.3f},\n\t-F1: {f:.3f}")
    return a, p, r, f


# Split the raw texts into training and test sets BEFORE vectorizing, so that
# the vocabulary is learned from the training texts only. Fitting the
# vectorizer on the whole corpus lets information about the test set leak
# into the model's feature space.
corpus_train, corpus_test, y_train, y_test = train_test_split(corpus,
                                                              labels,
                                                              random_state=123,
                                                              test_size=0.3
                                                             )

# Vectorize the input data
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)
# Full corpus in the same feature space, kept for any later cell that reads `X`.
X = vectorizer.transform(corpus)

# Train a Naive Bayes classifier
model = MultinomialNB()
model = model.fit(X_train, y_train)

# Get some performance metrics on the training set
y_predict = model.predict(X_train)
a, p, r, f = _report_metrics("Training performance metrics: ", y_train, y_predict)
print("="*35)

# Get some performance metrics on the test set
y_predict = model.predict(X_test)
a, p, r, f = _report_metrics("Test performance metrics: ", y_test, y_predict)

### Real-world test

We can download more tweets with the same #CoronaHoax hashtag we started with and check whether they are indeed flagged as "COVID denial" tweets. These tweets originate from the Los Angeles area, so they do not overlap with the training data.

In [None]:
# Real-world test data: up to 100 English, non-retweet #CoronaHoax results
# within 500 km of the given Los Angeles coordinates.
cursor = tweepy.Cursor(
    api.search,
    q="#CoronaHoax -filter:retweets",
    geocode="34.0522,-118.2436,500km",
    count=100,
    lang="en",
    include_rts=False,
    tweet_mode="extended",
)
items = list(cursor.items(100))
print(f"Finished reading {len(items)} items.")
tweets_LA = [status.full_text for status in items]

In [None]:
# Clean the Los Angeles tweets with the same steps as the training corpus.
# Note that this rebinds `corpus`, which held the training corpus until now.
cleaning_steps_LA = dict(
    remove_tags=True,
    remove_urls=True,
    remove_stopwords=True,
    remove_mentions=True,
)
preprocessorLA = PreProcessTweets(list(tweets_LA), **cleaning_steps_LA)
corpus = preprocessorLA.preprocess()

In [None]:
# Every tweet in this set carries the denial hashtag, so the reported
# "accuracy" is the share of tweets the model labels 0 (denial).
if corpus:
    x = vectorizer.transform(corpus)
    y_predict = model.predict(x)
    print(f"Accuracy: {list(y_predict).count(0) / len(y_predict):.3f}")
else:
    # The search can return nothing; report that instead of failing on an
    # empty input.
    print("No tweets were retrieved for the real-world test; nothing to evaluate.")