In [2]:
# Import libraries
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
import re

# Import our own code
from Authentication import Authentication
from DataMiner import DataMiner
from PreProcessTweets import PreProcessTweets
from TweetDataIO import TweetDataIO
from DenialPredictor import DenialPredictor

# Create a set of English stopwords
sw = set(stopwords.words("english")) 

# Initiate spark
#sc = SparkContext('local[*]')
spark = SparkSession.builder.getOrCreate()

# Get twitter api
api = Authentication().get_api()

In [3]:
# Mine some denial tweets (no specific location)
tagignore = ["#Covid_19", "#coronavirus", "#COVIDー19", "#COVID19", "#coronavirusNYC", "#coronavirusoregon", "#lockdown", "#covid19", "#COVID", "#pandemic"]
miner = DataMiner(api, "#CoronaHoax", "", "en", tagignore=tagignore, num_tweets=10)
denial_tweets = miner.mine()

# Mine some control tweets (no specific location)
print("="*40)
tagignore = ["#CoronaHoax", "#covidhoax","#coronahoax", "#covidhoax", "#Plandemic"]
miner = DataMiner(api, "coronavirus", "", "en", tagignore=tagignore, num_tweets=50)
control_tweets = miner.mine()

Processing tag: #coronaHoax
Processing tag: #BillGatesIsEvil
Processing tag: #CoronaVirus
Processing tag: #Covid19
Processing tag: #FilmYourHospital
Processing tag: #scamdemic
Processing tag: #MAGA
Processing tag: #QAnon
Processing tag: #BillGates
Processing tag: #Plandemic2020
Processing tag: #POTUS
Processing tag: #ResistTheNewWorldOrder
Processing tag: #CORONAHOAX
Processing tag: #Corona
Processing tag: #Coronabollocks
Processing tag: #sos
Processing tag: #WWG1WGA
Processing tag: #coronabollocks
Processing tag: #NWO
Processing tag: #Scamdemic
Processing tag: #CovidHoax
Processing tag: #plandemic
Processing tag: #q
Processing tag: #woke
Processing tag: #thegreatawakening
Processing tag: #DrainTheSwamp
Processing tag: #Coronahoax
Processing tag: #BillGatesBioTerrorist
Processing tag: #endthelockdown
Processing tag: #FakePandemic
Processing tag: #ObamaGate
Processing tag: #Plandemic
Processing tag: #coronahoax
Processing tag: #CoronaHoax
Processing tag: #covid19
Processing tag: #Corona

In [4]:
# Write the tweets to a CSV file
filename = "./training_data.csv"
io = TweetDataIO(filename, spark=spark, context=sc)
io.write(denial_tweets, label=0, append=False)
io.write(control_tweets, label=1, append=True)

In [5]:
# Read and show
ddf = io.read()
ddf.cache()
ddf.show()

+-----+--------------------+--------------------+--------------------+-------------------+-------------------+---------------+
|label|            location|                tags|                text|               time|           tweet_id|           user|
+-----+--------------------+--------------------+--------------------+-------------------+-------------------+---------------+
|    0|                    |#CoronaHoax|#Boyc...|I think this #Cor...|2020-05-20 14:34:33|1263115901454266369|     DravanianW|
|    0|     New Orleans, LA|#CoronaBorg™,|#Co...|Yes! Resistance t...|2020-05-20 14:29:52|1263114726004924417|    TheKingDude|
|    0|         Houston, TX|#coronaHoax|#Lock...|@AmandaLeeHouse V...|2020-05-20 14:24:39|1263113413603057667|     rickjm1961|
|    0|      Pittsburgh, PA|         #CoronaHoax|@uniquely_q @Egbe...|2020-05-20 14:22:26|1263112852828807168|  sisisunflower|
|    0| Mumbai Meri Jaan ❤️|#India|#Tanzania|...|@biswarooproy The...|2020-05-20 14:14:24|1263110833598414849| 

In [6]:
# Remove duplicates 
ddf.orderBy("tweet_id").dropDuplicates().show()

+-----+--------------------+--------------------+--------------------+-------------------+-------------------+---------------+
|label|            location|                tags|                text|               time|           tweet_id|           user|
+-----+--------------------+--------------------+--------------------+-------------------+-------------------+---------------+
|    0|         Dunedin, FL|#FilmYourHospital...|SUICIDED &amp ASS...|2020-05-20 00:11:46|1262898777377181697|     elkabong62|
|    0|     Dunedin, FL USA|#FilmYourHospital...|SUICIDED &amp ASS...|2020-05-20 00:12:10|1262898878090809344|    thetimewall|
|    0|Situbondo, Indonesia|#musica|#MusicPro...|Get This on Displ...|2020-05-20 01:39:28|1262920846215188481|     Ridwan3030|
|    0|                    |#scamdemic|#pland...|@KTVU They also f...|2020-05-20 04:31:10|1262964058644336640|     ElGoogKcuf|
|    0|    Aichi-ken, Japan|#ResistTheNewWorl...|I #ResistTheNewWo...|2020-05-20 04:58:20|1262970891748757505| 

### Perform the preprocessing on the pyspark dataframe

In [7]:
p = PreProcessTweets(ddf, True, True, True, True, True)
ddf = p.preprocess()

Preprocessing...
>> Removing stopwords...
>> Removing urls...
>> Removing hashtags...
>> Removing user mentions...
>> Removing punctuation...
>> Removing whitespace...
Finished preprocessing!


### Make some predictions based on the data

In [8]:
# Convert to pandas for predictions
df = ddf.toPandas()

# Split data (corpus and labels) into train and test sets
predictor = DenialPredictor(df)
X_train, X_test, y_train, y_test = predictor.train_test_split(split=0.3)

# Fit the model
predictor.fit_model(X_train, y_train)

# Calculate some metrics to evaluate performance
predictor.calc_metrics(X_train, y_train)
predictor.calc_metrics(X_test, y_test)

Performance metrics: 
	-Accuracy: 0.973,
	-Precision: 0.955, 
	-Recall: 0.982,
	-F1: 0.968
Performance metrics: 
	-Accuracy: 0.802,
	-Precision: 0.768, 
	-Recall: 0.797,
	-F1: 0.783
