In [1]:
# Import libraries
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
import re

# Import our own code
from Authentication import Authentication
from DataMiner import DataMiner
from PreProcessTweets import PreProcessTweets
from TweetDataIO import TweetDataIO

# Create a set of English stopwords
sw = set(stopwords.words("english")) 

# Initiate spark
sc = SparkContext('local[*]')
spark = SparkSession.builder.getOrCreate()

# Get twitter api
api = Authentication().get_api()

In [2]:
# Mine some denial tweets (no specific location)
tagignore = ["#Covid_19", "#coronavirus", "#COVIDー19", "#COVID19", "#coronavirusNYC", "#coronavirusoregon", "#lockdown", "#covid19", "#COVID", "#pandemic"]
miner = DataMiner(api, "#CoronaHoax", "", "en", tagignore=tagignore, num_tweets=10)
denial_tweets = miner.mine()

# Mine some control tweets (no specific location)
print("="*40)
tagignore = ["#CoronaHoax", "#covidhoax","#coronahoax", "#covidhoax", "#Plandemic"]
miner = DataMiner(api, "coronavirus", "", "en", tagignore=tagignore, num_tweets=50)
control_tweets = miner.mine()

Processing tag: #Agenda21
Processing tag: #BillGatesIsEvil
Processing tag: #QAnon
Processing tag: #FakeNews
Processing tag: #Corona
Processing tag: #CoronaVirus
Processing tag: #FilmYourHospital
Processing tag: #Covid19
Processing tag: #WWG1WGAWORLDWIDE
Processing tag: #OBAMAGATEGATE
Processing tag: #sos
Processing tag: #MAGA
Processing tag: #BillGates
Processing tag: #JusticeIsComing
Processing tag: #InItTogether
Processing tag: #coronabollocks
Processing tag: #covid1948
Processing tag: #thegreatawakening
Processing tag: #woke
Processing tag: #FireFauci
Processing tag: #QANONWORLDWIDE
Processing tag: #COVIDIOTS
Processing tag: #Coronabollocks
Processing tag: #NWO
Processing tag: #q
Processing tag: #DrainTheSwamp
Processing tag: #plandemic
Processing tag: #Scamdemic
Processing tag: #CovidHoax
Processing tag: #BillGatesBioTerrorist
Processing tag: #Coronahoax
Processing tag: #TheGreatAwakening
Processing tag: #FakePandemic
Processing tag: #WWG1WGA
Processing tag: #endthelockdown
Process

In [3]:
# Write the tweets to a CSV file
filename = "./training_data.csv"
io = TweetDataIO(filename, spark=spark, context=sc)
io.write(denial_tweets, label=0, append=False)
io.write(control_tweets, label=1, append=True)

In [4]:
# Read and show
ddf = io.read()
ddf.cache()
ddf.show()

+-----+--------------------+--------------------+-------------------+-------------------+---------------+
|label|            location|                text|               time|           tweet_id|           user|
+-----+--------------------+--------------------+-------------------+-------------------+---------------+
|    0|                    |Trumps $138m and ...|2020-05-18 22:45:44|1262514737331556353|      atasiding|
|    0|    Kali-Yuga, Earth|#statism has kill...|2020-05-18 22:40:21|1262513384777449476|        diksaca|
|    0|                    |#DefundWHO #Agend...|2020-05-18 22:30:10|1262510819239133184|franktalk108211|
|    0|       San Francisco|Thanks to The Con...|2020-05-18 22:09:54|1262505718298501120|      RosaKoire|
|    0|                    |@POTUS #ObamaGate...|2020-05-18 22:06:29|1262504861100376064|franktalk108211|
|    0|    British Columbia|@WeAreCanProud Tr...|2020-05-18 21:50:16|1262500779899809792|   BCHOMEHUNTER|
|    0|New Delhi, Rajast...|Do you believe th.

In [5]:
# Remove duplicates 
ddf.orderBy("tweet_id").dropDuplicates().show()

+-----+--------------------+--------------------+-------------------+-------------------+---------------+
|label|            location|                text|               time|           tweet_id|           user|
+-----+--------------------+--------------------+-------------------+-------------------+---------------+
|    0|    Southern England|MORE #Coronabollo...|2020-05-18 17:38:05|1262437316519436289|SourGra32364954|
|    0|   Brighton, England|Note that they re...|2020-05-18 17:43:50|1262438762228916225| BrightonMighty|
|    0|The Holy City of ...|@JonnyHilton5 Wha...|2020-05-18 17:54:21|1262441407387783170|  Sheikh_Anvakh|
|    0|      United Kingdom|Northern Ireland ...|2020-05-18 18:47:15|1262454722998554625|     TelegramJf|
|    0|    North Kilt Town |This lockdown was...|2020-05-18 18:56:24|1262457022936764416|   irishyiddo13|
|    0|      United Kingdom|How dare you trav...|2020-05-18 19:24:36|1262464119392940033|   OSIANBYLEVID|
|    0|https://soundclou...|👉💉🦠The #NewAbn...|

### Perform the preprocessing on the pyspark dataframe

In [6]:
p = PreProcessTweets(ddf, True, True, True, True, True)
ddf = p.preprocess()

Preprocessing...
>> Removing stopwords...
>> Removing urls...
>> Removing hashtags...
>> Removing user mentions...
>> Removing punctuation...
>> Removing whitespace...
Finished preprocessing!
+-----+--------------------+--------------------+-------------------+-------------------+---------------+--------------------+
|label|            location|                text|               time|           tweet_id|           user|      processed_text|
+-----+--------------------+--------------------+-------------------+-------------------+---------------+--------------------+
|    0|                    |Trumps $138m and ...|2020-05-18 22:45:44|1262514737331556353|      atasiding|Trumps 138m ID202...|
|    0|    Kali-Yuga, Earth|#statism has kill...|2020-05-18 22:40:21|1262513384777449476|        diksaca|killed people vir...|
|    0|                    |#DefundWHO #Agend...|2020-05-18 22:30:10|1262510819239133184|franktalk108211|Look news article...|
|    0|       San Francisco|Thanks to The Con.

### Make some predictions based on the data

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_curve, auc, accuracy_score, f1_score, precision_score, recall_score

In [8]:
# Have a look at the new data
df = ddf.toPandas()

# Remove duplicates
df.sort_values(["label"]).drop_duplicates("tweet_id")

Unnamed: 0,label,location,text,time,tweet_id,user,processed_text
0,0,,Trumps $138m and ID2020: Gates (White-House)Ta...,2020-05-18 22:45:44,1262514737331556353,atasiding,Trumps 138m ID2020 Gates WhiteHouseTask Force ...
265,0,,@MaiuroSalvatore @BillWalls20 It is cheap wide...,2020-05-18 23:00:03,1262518338821001216,TSeeker999,It cheap widely available cannot monetize like...
264,0,eartH 777 🇨🇦🇦🇺🇬🇧🇧🇪🇵🇹,All ties in with what @DrJudyAMikovits has bee...,2020-05-18 23:00:04,1262518343938068481,DougalMcStanley,All ties saying et al
263,0,"Longmont, CO",I’m an Investigative Journalist. These Are the...,2020-05-18 23:00:35,1262518473437007872,EditorNHays,I’m Investigative Journalist These Are Questio...
262,0,at altitude,"""Has anyone tried telling these #covid19 #plan...",2020-05-18 23:00:49,1262518533432340481,SirJacksonPeaks,Has anyone tried telling protesters leave coun...
...,...,...,...,...,...,...,...
437,1,India,India #COVID19 cases crossed 1 lakh. #coronavi...,2020-05-18 23:04:16,1262519400512421889,covidindia19,India cases crossed 1 lakh
438,1,Enugu (Nigeria),"Essentially, family of Doctor gets tested for ...",2020-05-18 23:04:14,1262519394808352768,OscarEGB,Essentially family Doctor gets tested We proje...
439,1,Where it's at,DAY 60 of 12 WEEK Isolation Lockdown Complete!...,2020-05-18 23:04:11,1262519380107132928,heartboi,DAY 60 12 WEEK Isolation Lockdown Complete Sho...
486,1,,Since no media have questioned her (despite me...,2020-05-18 23:05:14,1262519645040410633,TheInformer1999,Since media questioned despite contacting rega...


In [9]:
corpus = df.processed_text
labels = df.label

# Vectorize the input data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [10]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    labels, 
                                                    random_state=123,
                                                    test_size=0.3
                                                   )

# Train a Naive Bayes classifier
model = MultinomialNB()
model = model.fit(X_train, y_train)

# Get some performance metrics on the training set
y_predict = model.predict(X_train)

a = accuracy_score(y_train, y_predict)
p = precision_score(y_train, y_predict)
r = recall_score(y_train, y_predict)
f = f1_score(y_train, y_predict)
print("="*30)
print("Training performance metrics: ")
print(f"\t-Accuracy: {a:.3f},\n\t-Precision: {p:.3f}, \n\t-Recall: {r:.3f},\n\t-F1: {f:.3f}")
print("="*30)

# Get some performance metrics on the test set
y_predict = model.predict(X_test)

a = accuracy_score(y_test, y_predict)
p = precision_score(y_test, y_predict)
r = recall_score(y_test, y_predict)
f = f1_score(y_test, y_predict)
print("Test performance metrics: ")
print(f"\t-Accuracy: {a:.3f},\n\t-Precision: {p:.3f}, \n\t-Recall: {r:.3f},\n\t-F1: {f:.3f}")
print("="*30)

Training performance metrics: 
	-Accuracy: 0.966,
	-Precision: 0.942, 
	-Recall: 0.956,
	-F1: 0.949
Test performance metrics: 
	-Accuracy: 0.763,
	-Precision: 0.672, 
	-Recall: 0.651,
	-F1: 0.661
