# Project Big Data Science (May 2020)

In [1]:
# Import 3rd party libraries
import tweepy
import numpy as np
import nltk
#from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt
import pandas as pd

# Import our own code
from DataMiner import DataMiner
from HashtagFinder import HashtagFinder
from PreProcessTweets import PreProcessTweets
from Authentication import Authentication
from LocationService import LocationService

# Download the Dutch stop words from the NLTK repository.
nltk.download('stopwords')

#"44.4415,-102.6855,1000km"

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wannesvanleemput/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
auth = Authentication()
api = auth.get_api()

## Tweet data mining

We use the Tweepy.Cursor functionality to search for tweets on the coronavirus topic in a certain geographical area in the United States. We take New York City because this region has been most affected by the virus.

In [3]:
NUM_TWEETS = 5000
SEARCH_TERM = "#CovidHoax -filter:retweets"
loc = LocationService()
lat, lng = loc.get_coordinates("New York City", "United States")
location_radius = str(lat) + "," + str(lng) + ",100km"
language = "en"
starting_hashtag = "#Plandemic"
#ignore generic tags, since they could corrupt the denial-tweets dataset
tagignore = ["#Covid_19", "#coronavirus", "#COVIDー19", "#COVID19", "#coronavirusNYC", "#coronavirusoregon", "#lockdown", "#covid19", "#COVID", "#pandemic"]

In [4]:
miner = DataMiner(api, starting_hashtag, location_radius, language, tagignore)
denial_tweets = miner.mine()
print(f"Processed {len(list(denial_tweets))} tweets.")

Processing tag: #scamdemic
Processing tag: #Plandemic2020
Processing tag: #ObamaGate
Processing tag: #PlandemicDocumentary
Processing tag: #ConspiracyTheory
Processing tag: #plandemic
Processing tag: #Plandemic
Processed 2160 tweets.


In [5]:
denial_tweets.extend(miner.mine("#CoronaHoax"))

Processing tag: #COVID
Processing tag: #fakenews
Processing tag: #PLANdemic
Processing tag: #pandemic
Processing tag: #QAnon
Processing tag: #WWG1GWA
Processing tag: #DeepState
Processing tag: #COVIDIOTS
Processing tag: #Fauci
Processing tag: #qanon
Processing tag: #PLANDEMIC
Processing tag: #FauciFraud
Processing tag: #scamdemic
Processing tag: #Plandemic2020
Processing tag: #ObamaGate
Processing tag: #PlandemicDocumentary
Processing tag: #ConspiracyTheory
Processing tag: #plandemic
Processing tag: #Plandemic


In [6]:
panda = miner.get_dataframe()
panda.head()

Unnamed: 0,Author,Location,Tags
0,Nestor Delgado,"Brooklyn, New York",[#scamdemic]
1,Luna77,"New Jersey, USA",[#Scamdemic]
2,Kat,"Florida, USA","[#scamdemic, #controlaviris]"
3,Yoshi Yokamura,"New Jersey, USA","[#pizzagate, #Scamdemic]"
4,Yoshi Yokamura,"New Jersey, USA",[#scamdemic]


In [7]:
items = tweepy.Cursor(api.search,
            q="coronavirus -filter:retweets", 
            geocode="40.7282,-73.7949,1000km",
            count=100,
            lang="en",
            include_rts=False,
            tweet_mode="extended").items(NUM_TWEETS)

items = list(items)
print(f"Finished reading {len(items)} items.")
control_tweets = [t.full_text for t in items]

Finished reading 5000 items.


In [None]:
items = tweepy.Cursor(api.search,
            q="covid -filter:retweets", 
            geocode="40.7282,-73.7949,1000km",
            count=100,
            lang="en",
            include_rts=False,
            tweet_mode="extended").items(NUM_TWEETS)

items = list(items)
print(f"Finished reading {len(items)} items.")
control_tweets.extend([t.full_text for t in items])

In [None]:
items = tweepy.Cursor(api.search,
            q="lockdown -filter:retweets", 
            geocode="40.7282,-73.7949,1000km",
            count=100,
            lang="en",
            include_rts=False,
            tweet_mode="extended").items(NUM_TWEETS)

items = list(items)
print(f"Finished reading {len(items)} items.")
control_tweets.extend([t.full_text for t in items])

### Create a simple classification model

In [50]:
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, accuracy_score, f1_score, precision_score, recall_score

In [51]:
# Do some preprocessing on the text
tweets = denial_tweets + control_tweets
preprocessor = PreProcessTweets(
                                tweets.copy(),
                                remove_tags=False,
                                remove_urls=True,
                                remove_stopwords=True,
                                remove_mentions=True
                               )
corpus = preprocessor.preprocess()
labels = [0]*len(denial_tweets) + [1]*len(control_tweets)

In [52]:
# Vectorize the input data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    labels, 
                                                    random_state=123,
                                                    test_size=0.3
                                                   )

# Train a Naive Bayes classifier
model = MultinomialNB()
model = model.fit(X_train, y_train)

# Get some performance metrics on the training set
y_predict = model.predict(X_train)

a = accuracy_score(y_train, y_predict)
p = precision_score(y_train, y_predict)
r = recall_score(y_train, y_predict)
f = f1_score(y_train, y_predict)
print("Training performance metrics: ")
print(f"\t-Accuracy: {a:.3f},\n\t-Precision: {p:.3f}, \n\t-Recall: {r:.3f},\n\t-F1: {f:.3f}")
print("="*35)

# Get some performance metrics on the test set
y_predict = model.predict(X_test)

a = accuracy_score(y_test, y_predict)
p = precision_score(y_test, y_predict)
r = recall_score(y_test, y_predict)
f = f1_score(y_test, y_predict)
print("Test performance metrics: ")
print(f"\t-Accuracy: {a:.3f},\n\t-Precision: {p:.3f}, \n\t-Recall: {r:.3f},\n\t-F1: {f:.3f}")

Training performance metrics: 
	-Accuracy: 0.975,
	-Precision: 0.978, 
	-Recall: 0.983,
	-F1: 0.981
Test performance metrics: 
	-Accuracy: 0.960,
	-Precision: 0.971, 
	-Recall: 0.967,
	-F1: 0.969


### Real-world test

We can download some more tweets from the same #CoronaHoax hashtag we started with and check that these are indeed flagged correctly as "COVID denial" tweets. The tweets now originated from Los Angeles, so there is no overlap from the training data.

In [53]:
loc = LocationService()
lat, lng = loc.get_coordinates("Los Angeles", "United States")
location = str(lat) + "," + str(lng) + ",500km"

In [72]:
items = tweepy.Cursor(api.search,
            q="coronahoax -filter:retweets", 
            geocode=location,
            count=100,
            lang="en",
            include_rts=False,
            tweet_mode="extended").items(100)

items = list(items)
print(f"Finished reading {len(items)} items.")
tweets_LA = [t.full_text for t in items]

Finished reading 74 items.


In [73]:
preprocessorLA = PreProcessTweets(
                                tweets_LA.copy(), 
                                remove_tags=False,
                                remove_urls=True,
                                remove_stopwords=True,
                                remove_mentions=True
                               )
corpus = preprocessorLA.preprocess()

In [74]:
x = vectorizer.transform(corpus)
y_predict = model.predict(x)
print(f"Accuracy: {list(y_predict).count(0) / len(y_predict):.3f}")

Accuracy: 0.824
