In [358]:
import urllib.request
import zipfile
import pandas
import numpy
import re
import nltk

from langdetect import detect
from googletrans import Translator

In [628]:
#Fetching dataset
url = "https://github.com/signerebassoo/COMP3222/blob/master/assignment-comp3222-comp6246-mediaeval2015-dataset.zip?raw=true"
filename = "mediaeval.zip"
urllib.request.urlretrieve(url, filename)

#Extracting zip
zfile = zipfile.ZipFile("mediaeval.zip", "r")
zfile.extractall()
zfile.close()

In [629]:
trainData = pandas.read_csv("mediaeval-2015-trainingset.txt", sep="	")
testData = pandas.read_csv("mediaeval-2015-testset.txt", sep="	")

#Creating DataFrames for training and testing
df_train = pandas.DataFrame(data = trainData)
df_test = pandas.DataFrame(data = testData)

# Data Characterization

In [158]:
trainData.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,263046056240115712,¿Se acuerdan de la película: “El día después d...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake
1,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
3,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
4,263018881839411200,My fave place in the world #nyc #hurricane #sa...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake


In [159]:
trainData.info() #Metadata of training data, including size

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 7 columns):
tweetId       14277 non-null int64
tweetText     14277 non-null object
userId        14277 non-null int64
imageId(s)    14277 non-null object
username      14277 non-null object
timestamp     14277 non-null object
label         14277 non-null object
dtypes: int64(2), object(5)
memory usage: 780.9+ KB


In [92]:
df_test.shape #Size of testing data

(3755, 7)

In [630]:
#Determine events covered and their frequency by image names in training data
df_train.rename(columns = {'imageId(s)':'imgs'}, inplace = True)
imgCount = df_train.groupby(df_train.imgs.str.split('_').str[0])['tweetId'].nunique()
print (imgCount)

imgs
boston                 546
bringback              131
columbianChemicals     185
elephant                13
livr                     9
malaysia               501
passport                46
pigFish                 14
sandyA                9695
sandyB                2621
sochi                  402
underwater             112
Name: tweetId, dtype: int64


In [631]:
#Determine events covered and their frequency by image names in testing data
df_test.rename(columns = {'imageId(s)':'imgs'}, inplace = True)
imgCount = df_test.groupby(df_test.imgs.str.split('_').str[0])['tweetId'].nunique()
print (imgCount)

imgs
eclipse        277
garissa         77
nepal         1353
samurai        218
syrianboy     1769
varoufakis      61
Name: tweetId, dtype: int64


In [309]:
#Helper to look into the tweetText of a particular event image to determine what the event is
selector = []
for imgs in df_train['imgs']:
    if "sandy" in imgs:
        selector.append(True)
    else:
        selector.append(False)
        
isEvent = pandas.Series(selector)

df_event = df_train[isEvent].head(61)

for tweet in df_event['tweetText']:
    print(tweet)

¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy.
Buena la foto del Huracán Sandy, me recuerda a la película Día de la Independencia #ID4 #Sandy
Scary shit #hurricane #NY
My fave place in the world #nyc #hurricane #sandy #statueofliberty
42nd #time #square #NYC #subway #hurricane
Just in time for #halloween a photo of #hurricane #sandy #frankenstorm
Crazy pic of #Hurricane #Sandy prayers go out to family and friends on the East Coast
#sandy #newyork #hurricane #statueofliberty #USA
#nyc #hurricane
robertosalibaba god be with u brother #sandy #hurricane #newyork
#Crazy #Hurricane #Sandy
#shark #newjersey #swim #sandy #hurricane
Good luck #ny #newyork #usa #hurricane #sandy
Wow.... Fishing anyone? #hurricane #sandy
Well #howdy there #hurricane #sandy . Just wanted to let you know that you took my power, internet, happi
Just known this bcs of #jason #chen updated the pic! Everyone be safe! #newyork #sandy #hurricane #nature #

In [294]:
langs = dict()

for tweet in df_train['tweetText']:
    try:
        lan = detect(tweet)
    except:
        pass
        lan = "Unknown"
        print(tweet)
    if lan in langs.keys():
        langs[lan] = langs[lan] + 1
    else:
        langs[lan] = 1

Man sandy Foreal??  ⚡⚡⚡☔☔⚡🌊🌊☁🚣⛵💡🔌🚬🚬🚬🔫🔫🔒🔒🔐🔑🔒🚪🚪🚪🔨🔨🔨🏊🏊🏊🏊🎣🎣🎣😱😰😖😫😩😤💨💨💨💨💦💦💦💧💦💥💥💥👽💩🙌🙌🙌🙌🙌🏃🏃🏃🏃🏃👫👭💏👪👪👬👭💑🙇🌕🌕🌕🌎 http://t.co/vEWVXy10


In [295]:
print (langs)

{'es': 1293, 'en': 10955, 'sq': 7, 'ru': 60, 'it': 102, 'no': 36, 'fr': 218, 'mk': 2, 'ro': 7, 'nl': 89, 'bg': 8, 'pt': 159, 'de': 130, 'tl': 315, 'cy': 117, 'ja': 22, 'ar': 81, 'vi': 13, 'ca': 34, 'sk': 15, 'hu': 6, 'sv': 42, 'so': 119, 'fi': 15, 'pl': 42, 'id': 173, 'da': 26, 'af': 70, 'el': 5, 'lt': 4, 'he': 1, 'hr': 6, 'tr': 33, 'zh-cn': 10, 'fa': 3, 'sl': 7, 'sw': 13, 'et': 11, 'ko': 6, 'th': 17, 'cs': 2, 'Unknown': 1, 'hi': 1, 'lv': 1}


# Data Preprocessing

In [632]:
#Changing 'humor' to 'fake'
df_train.loc[(df_train.label == 'humor'),'label'] = 'fake'
df_test.loc[(df_test.label == 'humor'),'label'] = 'fake'

In [633]:
#Removing retweets, reposts, and modified tweets
rtPattern1 = "(RT|rt|MT|mt|RP|rp):? @\w*:?"
rtPattern2 = "(\bRT\b|\brt\b|\bMT\b|\bmt\b|\bRP\b|\brp\b)"
rtPattern3 = "(@\w*:)"
rtPattern4 = "(#rt|#RT|#mt|#MT|#rp|#retweet|#Retweet|#modifiedtweet|#modifiedTweet|#ModifiedTweet|#repost|#Repost)"
rtPattern5 = "(via @\w*)"

retweets = df_train['tweetText'].str.contains(rtPattern1)
df_train = df_train[~retweets]

retweets = df_train['tweetText'].str.contains(rtPattern2)
df_train = df_train[~retweets]

retweets = df_train['tweetText'].str.contains(rtPattern3)
df_train = df_train[~retweets]

retweets = df_train['tweetText'].str.contains(rtPattern4)
df_train = df_train[~retweets]

retweets = df_train['tweetText'].str.contains(rtPattern5)
df_train = df_train[~retweets]

df_train.reset_index(drop=True, inplace=True)
df_train.shape

  return func(self, *args, **kwargs)


(11550, 7)

In [700]:
#Removing remaining twitter handles @username
df_train['tweetText'] = df_train['tweetText'].apply(lambda text: re.sub(r'@\w*', "", text))

In [635]:
#Removing emojis
emojis = re.compile("["
                    u"\U0001F600-\U0001F64F"
                    u"\U0001F300-\U0001F5FF"
                    u"\U0001F680-\U0001F6FF"
                    u"\U0001F1E0-\U0001F1FF"
                    u"\U00002702-\U000027B0"
                    u"\U000024C2-\U0001F251"
                    "]+", flags=re.UNICODE)

df_train['tweetText'] = df_train['tweetText'].apply(lambda text: emojis.sub(r'', text) if emojis.search(text) else text)

In [636]:
#Cleaning symbols - ampersand and newline
df_train['tweetText'] = df_train['tweetText'].apply(lambda text: re.sub(r'&amp;|\\n', '', text))

In [637]:
#Removing urls
df_train['tweetText'] = df_train['tweetText'].apply(lambda text: re.sub(r'http\S+', '', text))
df_train['tweetText'] = df_train['tweetText'].apply(lambda text: re.sub(r'\\\/\S+', '', text))

In [82]:
#NOT USED
#Translating to English

i = 0
for tweet in df_train['tweetText']:
    try:
        lan = detect(tweet)
    except:
        continue
    if lan != 'en':
        tr = Translator()
        trText = tr.translate(tweet).text
        print(trText)
        #TODO replace cell value to translation
    i = i + 1

Do you remember the movie: "The after tomorrow"? It reminds me of what's happening with Hurricane #Sandy.
Good photo of Hurricane Sandy, reminds me of the movie Independence Day # ID4 #Sandy
UMMMM #shark #hurricane #nj WOAHH
#sandy #hurricane #fun #usa
Shark on the highway highway #hurricane #sandy
I never imagined imagine this scene in real life ... #chocada #eusoualenda #iamlegend #ny #newyork #hurricane #
Holy frankenstorm! #newyork #frankenstorm #hurricane #sandy #insane
All quiet #hurricane
#newyork #hurricane beautiful and scary
#Crazy picture. #Hurricane
Wow ... What violently ... Hope mn family there remains unharmed .. #hurricane #Sandy
Shark on the highway highway #hurricane #sandy
HURRICANE SANDY: Sharks are found in the streets of New Jersey due to the advance of the sea caused by Hurricane Look:
#SANDY: Sharks are found in the streets of New Jersey due to the advance of the sea caused by Hurricane Sandy. look:
Sharks are found in the streets of New Jersey due to the advanc

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [701]:
#Removing whitespace
df_train['tweetText'] = df_train['tweetText'].apply(lambda text: " ".join(text.split()))

In [702]:
#Initialise stopwords
stopwords = nltk.corpus.stopwords.words()
stopwords.extend([':', ';', '[', ']', '"', "'", '(', ')', '.', '?', '#', '@', '...'])

In [703]:
#Removing stopwords
df_train['filteredTweet'] = df_train['tweetText'].apply(lambda x: ' '.join([w for w in x.split() if w not in stopwords]))

In [745]:
#Lemmatising
tokeniser = nltk.tokenize.WhitespaceTokenizer()
lemmatiser = nltk.stem.WordNetLemmatizer()

df_train['lemmatisedTweet'] = df_train['filteredTweet'].apply(lambda x: ' '.join([lemmatiser.lemmatize(w) for w in tokeniser.tokenize(x)]))
df_train.head(10)

Unnamed: 0,tweetId,tweetText,userId,imgs,username,timestamp,label,filteredTweet,lemmatisedTweet
0,263046056240115712,¿Se acuerdan de la película: “El día después d...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake,¿Se acuerdan película: “El día después mañana”...,¿Se acuerdan película: “El día después mañana”...
1,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake,"Buena foto Huracán Sandy, recuerda película Dí...","Buena foto Huracán Sandy, recuerda película Dí..."
2,262996108400271360,Scary shit #hurricane #NY,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake,Scary shit #hurricane #NY,Scary shit #hurricane #NY
3,263018881839411200,My fave place in the world #nyc #hurricane #sa...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake,My fave place world #nyc #hurricane #sandy #st...,My fave place world #nyc #hurricane #sandy #st...
4,263364439582060545,42nd #time #square #NYC #subway #hurricane,163674788,sandyA_fake_23,classycg,Tue Oct 30 19:39:10 +0000 2012,fake,42nd #time #square #NYC #subway #hurricane,42nd #time #square #NYC #subway #hurricane
5,262927032705490944,Just in time for #halloween a photo of #hurric...,246153081,sandyA_fake_14,j_unit87,Mon Oct 29 14:41:04 +0000 2012,fake,Just time #halloween photo #hurricane #sandy #...,Just time #halloween photo #hurricane #sandy #...
6,263321078884077568,Crazy pic of #Hurricane #Sandy prayers go out ...,199565482,sandyA_fake_29,MrBlakMagik,Tue Oct 30 16:46:52 +0000 2012,fake,Crazy #Hurricane #Sandy prayers go family frie...,Crazy #Hurricane #Sandy prayer go family frien...
7,263111677485142017,#sandy #newyork #hurricane #statueofliberty #USA,78475739,sandyA_fake_15,safi37,Tue Oct 30 02:54:46 +0000 2012,fake,#sandy #newyork #hurricane #statueofliberty #USA,#sandy #newyork #hurricane #statueofliberty #USA
8,262977091983785985,#nyc #hurricane,869777653,sandyA_fake_29,kingmichael03,Mon Oct 29 17:59:59 +0000 2012,fake,#nyc #hurricane,#nyc #hurricane
9,262989009930833920,robertosalibaba god be with u brother #sandy #...,359592461,sandyA_fake_08,Michael_Saliba,Mon Oct 29 18:47:20 +0000 2012,fake,robertosalibaba god brother #sandy #hurricane ...,robertosalibaba god brother #sandy #hurricane ...


# Algorithm Design and Training

In [734]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn import metrics

In [735]:
#Define features and target for training and testing
tar_train = df_train.label
ft_train = df_train.lemmatisedTweet
tar_test = df_test.label
ft_test = df_test.tweetText

In [742]:
#Init Bag-of-Words
count_vectoriser = CountVectorizer(stop_words='english')
count_train = count_vectoriser.fit_transform(ft_train)
count_test = count_vectoriser.transform(ft_test)

In [743]:
#Init N-Gram
ngram_vectoriser = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
ngram_train = ngram_vectoriser.fit_transform(ft_train)
ngram_test = ngram_vectoriser.transform(ft_test)

In [744]:
#Init TF-IDF
tfidf_vectoriser = TfidfVectorizer(stop_words='english', max_df=0.2)
tfidf_train = tfidf_vectoriser.fit_transform(ft_train)
tfidf_test = tfidf_vectoriser.transform(ft_test)

In [739]:
clf = MultinomialNB()

In [708]:
clf = BernoulliNB()

In [711]:
clf = PassiveAggressiveClassifier()

In [716]:
clf = SGDClassifier()

In [683]:
#Bag-of-Words
clf.fit(count_train, tar_train)

pred = clf.predict(count_test)
score = metrics.accuracy_score(tar_test, pred)

print("accuracy:   %0.3f" % score)

accuracy:   0.811


In [695]:
#N-Grams
clf.fit(ngram_train, tar_train)

pred = clf.predict(ngram_test)
score = metrics.accuracy_score(tar_test, pred)

print("accuracy:   %0.3f" % score)

accuracy:   0.458


In [740]:
#TF-IDF
clf.fit(tfidf_train, tar_train)

pred = clf.predict(tfidf_test)
score = metrics.accuracy_score(tar_test, pred)

print("accuracy:   %0.3f" % score)

accuracy:   0.862


In [741]:
#Calculating F1 score
TP = 0 
FP = 0
TN = 0
FN = 0

for true, guess in zip(tar_test, pred):
    if(true == 'fake' and guess == 'fake'):
        TP = TP + 1
    if(true == 'real' and guess == 'fake'):
        FP = FP + 1
    if(true == 'real' and guess == 'real'):
        TN = TN + 1
    if(true == 'fake' and guess == 'real'):
        FN = FN + 1
        
precision = TP / (TP + FP)
recall = TP / (TP + FN)

f1 = 2 * ((precision * recall) / (precision + recall))
print("TP: %d FP: %d TN: %d FN: %d" % (TP, FP, TN, FN))
print("f1: %0.3f" % f1)

TP: 2319 FP: 290 TN: 919 FN: 227
f1: 0.900
