In [1]:
# Import libraries
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
import re

# Import our own code
from Authentication import Authentication
from DataMiner import DataMiner
from PreProcessTweets import PreProcessTweets
from TweetDataIO import TweetDataIO
from DenialPredictor import DenialPredictor

# Create a set of English stopwords
sw = set(stopwords.words("english")) 

# Initiate spark
sc = SparkContext('local[*]')
spark = SparkSession.builder.getOrCreate()

# Get twitter api
api = Authentication().get_api()

In [2]:
# Mine some denial tweets (no specific location)
tagignore = ["#Covid_19", "#coronavirus", "#COVIDー19", "#COVID19", "#coronavirusNYC", "#coronavirusoregon", "#lockdown", "#covid19", "#COVID", "#pandemic"]
miner = DataMiner(api, "#CoronaHoax", "", "en", tagignore=tagignore, num_tweets=10)
denial_tweets = miner.mine()

# Mine some control tweets (no specific location)
print("="*40)
tagignore = ["#CoronaHoax", "#covidhoax","#coronahoax", "#covidhoax", "#Plandemic"]
miner = DataMiner(api, "coronavirus", "", "en", tagignore=tagignore, num_tweets=50)
control_tweets = miner.mine()

Processing tag: #BillGatesIsEvil
Processing tag: #Agenda21
Processing tag: #FakeNews
Processing tag: #QAnon
Processing tag: #Corona
Processing tag: #CoronaVirus
Processing tag: #FilmYourHospital
Processing tag: #BillGates
Processing tag: #Covid19
Processing tag: #WWG1WGAWORLDWIDE
Processing tag: #OBAMAGATEGATE
Processing tag: #sos
Processing tag: #MAGA
Processing tag: #JusticeIsComing
Processing tag: #InItTogether
Processing tag: #coronabollocks
Processing tag: #covid1948
Processing tag: #thegreatawakening
Processing tag: #woke
Processing tag: #Coronabollocks
Processing tag: #FireFauci
Processing tag: #QANONWORLDWIDE
Processing tag: #COVIDIOTS
Processing tag: #NWO
Processing tag: #q
Processing tag: #DrainTheSwamp
Processing tag: #CovidHoax
Processing tag: #Scamdemic
Processing tag: #plandemic
Processing tag: #BillGatesBioTerrorist
Processing tag: #Coronahoax
Processing tag: #TheGreatAwakening
Processing tag: #FakePandemic
Processing tag: #WWG1WGA
Processing tag: #endthelockdown
Process

In [3]:
# Write the tweets to a CSV file
filename = "./training_data.csv"
io = TweetDataIO(filename, spark=spark, context=sc)
io.write(denial_tweets, label=0, append=False)
io.write(control_tweets, label=1, append=True)

In [4]:
# Read and show
ddf = io.read()
ddf.cache()
ddf.show()

+-----+--------------------+--------------------+-------------------+-------------------+---------------+
|label|            location|                text|               time|           tweet_id|           user|
+-----+--------------------+--------------------+-------------------+-------------------+---------------+
|    0| North Carolina, USA|@GregMannarino It...|2020-05-18 23:39:33|1262528281238212610|POSITIVECAT6961|
|    0|Montreal, Quebec,...|OPERATION WARP SP...|2020-05-18 23:39:22|1262528233813291008|FreeRadioRevolu|
|    0|                    |@VdBiggelaar @ec_...|2020-05-18 23:35:14|1262527195836276737|        CV454US|
|    0|England, United K...|CORONOVIRUS. THIS...|2020-05-18 23:15:03|1262522114093060102|   islamdaily__|
|    0|Ashland, Massachu...|#NowWatching FW: ...|2020-05-18 23:13:58|1262521844218818560|quantumtimerepo|
|    0|   Dallas, Texas USA|Italian politicia...|2020-05-18 23:03:15|1262519145159106561|    EdwinRankin|
|    0|                    |Most of your conv.

In [5]:
# Remove duplicates 
ddf.orderBy("tweet_id").dropDuplicates().show()

+-----+--------------------+--------------------+-------------------+-------------------+---------------+
|label|            location|                text|               time|           tweet_id|           user|
+-----+--------------------+--------------------+-------------------+-------------------+---------------+
|    0|The Holy City of ...|@JonnyHilton5 Wha...|2020-05-18 17:54:21|1262441407387783170|  Sheikh_Anvakh|
|    0|      United Kingdom|Northern Ireland ...|2020-05-18 18:47:15|1262454722998554625|     TelegramJf|
|    0|    North Kilt Town |This lockdown was...|2020-05-18 18:56:24|1262457022936764416|   irishyiddo13|
|    0|      United Kingdom|How dare you trav...|2020-05-18 19:24:36|1262464119392940033|   OSIANBYLEVID|
|    0|https://soundclou...|👉💉🦠The #NewAbn...|2020-05-18 20:00:31|1262473158407290880| EinsteinsMagic|
|    0|                    |@tnfeline @rodney...|2020-05-18 20:46:58|1262484850310774786|       GetClass|
|    0|         Chicago, IL|@CAPAction Hospit...|

### Perform the preprocessing on the pyspark dataframe

In [6]:
p = PreProcessTweets(ddf, True, True, True, True, True)
ddf = p.preprocess()

Preprocessing...
>> Removing stopwords...
>> Removing urls...
>> Removing hashtags...
>> Removing user mentions...
>> Removing punctuation...
>> Removing whitespace...
Finished preprocessing!


### Make some predictions based on the data

In [7]:
# Convert to pandas for predictions
df = ddf.toPandas()

# Split data (corpus and labels) into train and test sets
predictor = DenialPredictor(df)
X_train, X_test, y_train, y_test = predictor.train_test_split(split=0.3)

# Fit the model
predictor.fit_model(X_train, y_train)

# Calculate some metrics to evaluate performance
predictor.calc_metrics(X_train, y_train)
predictor.calc_metrics(X_test, y_test)

Performance metrics: 
	-Accuracy: 0.955,
	-Precision: 0.907, 
	-Recall: 0.933,
	-F1: 0.920
Performance metrics: 
	-Accuracy: 0.790,
	-Precision: 0.604, 
	-Recall: 0.711,
	-F1: 0.653
