In [65]:
# using keras tokenizer here
import os
import re
import pandas as pd
import numpy as np
import string

from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from tqdm import tqdm

import xgboost as xgb

from spellchecker import SpellChecker

plt.style.use('ggplot')

# check if text is english
from langdetect import detect

from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

stop=set(stopwords.words('english'))

# Show full cell contents and every row when a frame is displayed.
# `None` is the supported value for "no column-width limit"; the old `-1`
# sentinel is deprecated and triggers a warning on pandas >= 1.0.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

%store -r EMOTICONS_EMO
%store -r UNICODE_EMO
%store -r EMOTICONS

# Location of the test split, relative to the notebook's working directory.
# `test_dir` keeps the plain-string form, `test_path` the `Path` form.
test_path = Path('data') / 'test.txt'
test_dir = test_path.as_posix()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/riccardoandronache/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/riccardoandronache/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/riccardoandronache/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
  pd.set_option('display.max_colwidth', -1)


In [95]:
# Load the tab-separated test split as UTF-8; rows that cannot be parsed are
# skipped instead of aborting the read.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas in favour of
# `on_bad_lines='skip'` - switch when the environment's pandas is upgraded.
test_df = pd.read_csv(
    test_path,
    sep='\t',
    encoding='utf-8',
    error_bad_lines=False,
)

In [96]:
# Preview the first 20 rows of the raw test split.
test_df.head(20)

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,578854927457349632,kereeen RT @Shyman33: Eclipse from ISS.... http://t.co/je2hcFpVfN,70824972,eclipse_01,peay_s,Fri Mar 20 09:45:43 +0000 2015,fake
1,578874632670953472,Absolutely beautiful! RT @Shyman33: Eclipse from ISS.... http://t.co/oqwtTL0ThS,344707006,eclipse_01,JaredUcanChange,Fri Mar 20 11:04:02 +0000 2015,fake
2,578891261353984000,‚Äú@Shyman33: Eclipse from ISS.... http://t.co/C0VfboScRj‚Äù Ïö∞Ï£ºÏóêÏÑúÎ≥∏ 3.20 ÏùºÏãù Wow! amazing!,224839607,eclipse_01,tpjp1231,Fri Mar 20 12:10:06 +0000 2015,fake
3,578846612312748032,Eclipse from ISS.... http://t.co/En87OtvsU6,134543073,eclipse_01,Shyman33,Fri Mar 20 09:12:41 +0000 2015,fake
4,578975333841551360,@ebonfigli: √âclipse vue de l'ISS... Autre chose... http://t.co/yNBN7c4O51\n\nLa cr√©ation divine n'a pas de limite üòç,1150728872,eclipse_01,Epimethee_,Fri Mar 20 17:44:11 +0000 2015,fake
5,579274670853226496,‚Äú@ebonfigli: √âclipse vue de l'ISS... Autre chose... http://t.co/xlAyuoDRVF‚Äùmagnifique,470889709,eclipse_01,BusineMi,Sat Mar 21 13:33:38 +0000 2015,fake
6,578861590482665472,√âclipse vue de l'ISS... Autre chose... http://t.co/IqZEQXkTPB,383831305,eclipse_01,ebonfigli,Fri Mar 20 10:12:12 +0000 2015,fake
7,578976098052091904,@ebonfigli: √âclipse vue de l'ISS... Autre chose... http://t.co/yNBN7c4O51\n\nLa cr√©ation divine n'a pas de limite üòç,3044246089,eclipse_01,OumNur,Fri Mar 20 17:47:13 +0000 2015,fake
8,578844275061981184,Dit dus \0/ RT ‚Äú@News_Executive: The Solar eclipse seen from International Space Station. #SolarEclipse #ISS #Space http://t.co/0Y00h85ECN‚Äù,291020879,eclipse_01,PatriciaKusters,Fri Mar 20 09:03:24 +0000 2015,fake
9,578838737448235008,Photo: The Solar eclipse as seen from the International Space Station. #SolarEclipse #ISS #Space http://t.co/mvmWFks6Jg,364810202,eclipse_01,News_Executive,Fri Mar 20 08:41:23 +0000 2015,fake


In [68]:
# Filled by the language-detection cell below.
languages = []
# ASCII punctuation plus typographic marks common in tweets: inverted question
# mark, curly double quotes, guillemets and bullet. They are written as Unicode
# escapes so the characters survive any re-encoding of the notebook file and
# never degrade into sequences that would also strip letters such as 'ø' or 'Ä'.
language_punctuation = string.punctuation + '\u00bf\u201c\u201d\u00bb\u00ab\u2022'
count_failed_trans = 0
# Matches one or more consecutive characters from the four emoji ranges below.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)

def is_valid_word(word):
    """Tell whether `word` looks like a plain ASCII token.

    A valid word starts with an ASCII letter and continues with any number of
    ASCII letters, digits, dots or underscores.

    Returns True when the pattern matches, False otherwise.
    """
    match = re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word)
    if match is None:
        return False
    return True

def remove_punctuation(tweet, punctuation):
    """Return `tweet` with every character listed in `punctuation` deleted.

    tweet: the text to clean.
    punctuation: a string whose individual characters are removed.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(punctuation))
    return tweet.translate(deletion_table)

def de_emojify(tweet):
    """Return `tweet` with every match of the module-level `emoji_pattern` removed."""
    return re.sub(emoji_pattern, r'', tweet)

def guess_language(tweet):
    """Guess the language of a tweet.

    The tweet is lower-cased and stripped of URLs, @-mentions, the stand-alone
    ``rt`` marker, punctuation (``language_punctuation``) and emojis. Only the
    tokens accepted by ``is_valid_word`` are handed to ``detect``.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for instance a missing cell)
        is reported as ``'unknown'`` instead of raising.

    Returns
    -------
    str
        Whatever ``detect`` returns for the normalised text, or ``'unknown'``
        when detection raises (the offending normalised text is printed).

    NOTE(review): langdetect's documentation describes detection as
    non-deterministic unless ``DetectorFactory.seed`` is set - consider
    seeding it in the configuration cell for reproducible runs.
    """
    if not isinstance(tweet, str):
        return 'unknown'

    # Lower case
    tweet = tweet.lower()
    # Remove URLs, User mentions, RT
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'@[\S]+', '', tweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Remove punctuation
    tweet = remove_punctuation(tweet, language_punctuation)
    # Remove emojis and strip end and beginning white spaces
    tweet = de_emojify(tweet).strip()

    # Filter out non-valid words. This is built before the `try` so the error
    # message below can always reference the normalised text.
    valid_words = [word for word in tweet.split() if is_valid_word(word)]
    normalized_tweet = ' '.join(valid_words)

    try:
        language = detect(normalized_tweet)
    except Exception:
        # `Exception` rather than a bare `except`, so that interrupting the
        # (long) detection loop with Ctrl-C is not silently turned into an
        # 'unknown' row.
        language = 'unknown'
        print(f'This row throws an error\n {normalized_tweet}')

    return language

# Detect the language of every tweet, with a progress bar.
languages = [guess_language(tweet) for tweet in tqdm(test_df['tweetText'])]

 30%|‚ñà‚ñà‚ñâ       | 1108/3755 [00:05<00:12, 209.20it/s]

This row throws an error
 
This row throws an error
 


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3755/3755 [00:16<00:00, 226.09it/s]


In [69]:
# How many tweets could not be assigned a language.
nr = languages.count('unknown')
print(f'Total number of unknown tweet languages is {nr}')

Total number of unknown tweet languages is 2


In [70]:
# Keep only the tweets detected as English. In the recorded run this removes
# 243 of the 3755 tweets.
test_df['language'] = languages
is_english = test_df['language'] == 'en'
test_en_df = test_df[is_english].copy()

print(f'Original dataset length {len(test_df)} \n')
print(f'Only english tweets dataset length {len(test_en_df)} \n')

# Class balance of the English subset.
print(test_en_df['label'].value_counts())

Original dataset length 3755 

Only english tweets dataset length 3512 

fake    2363
real    1149
Name: label, dtype: int64


In [71]:
# Drop the metadata columns that the following cells do not reference.
test_en_df = test_en_df.drop(columns=['imageId(s)', 'username', 'timestamp', 'userId'])

In [72]:
def preprocess_word(word):
    """Shorten every run of a repeated character to exactly two occurrences.

    For example 'cooool' becomes 'cool'; runs of one or two characters are
    left untouched.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', word)

def preprocess_tweet(tweet):
    """Normalise a raw tweet for the downstream models.

    Steps, in order:
      1. lower-case the text;
      2. delete URLs, @-mentions and the stand-alone "rt" marker (they are
         removed, not replaced by placeholder tokens);
      3. rewrite the HTML entities for '&', '<' and '>' as 'and', '<', '>';
      4. pass every whitespace-separated token through `preprocess_word` and
         re-join the tokens with single spaces;
      5. insert a space between a run of word characters and an adjacent run
         of non-word, non-space characters (in both directions).

    Returns the resulting text with surrounding whitespace stripped.
    """
    cleaned = tweet.lower()

    # URLs, user mentions and the retweet marker are simply dropped.
    for pattern in (r'https?://\S+', r'@[\S]+', r'\brt\b'):
        cleaned = re.sub(pattern, '', cleaned)

    # HTML entities left in the raw text.
    for pattern, replacement in ((r'&amp;?', r'and'), (r'&lt;', r'<'), (r'&gt;', r'>')):
        cleaned = re.sub(pattern, replacement, cleaned)

    # Per-token clean-up; splitting and re-joining also collapses whitespace.
    cleaned = ' '.join(preprocess_word(token) for token in cleaned.split())

    # Detach punctuation/symbol runs from the words they touch.
    cleaned = re.sub(r'([\w\d]+)([^\w\d ]+)', r'\1 \2', cleaned)
    cleaned = re.sub(r'([^\w\d ]+)([\w\d]+)', r'\1 \2', cleaned)

    return cleaned.strip()

In [73]:
# Normalise every English tweet, with a progress bar.
prep_tweets = [preprocess_tweet(tweet) for tweet in tqdm(test_en_df['tweetText'])]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3512/3512 [00:00<00:00, 14485.93it/s]


In [74]:
# Attach the normalised text as a new column; `prep_tweets` was built by
# iterating over this frame's `tweetText` column, so the order lines up.
test_en_df['preprocessed_text'] = prep_tweets

In [77]:
# Number of single-space-separated tokens in each preprocessed tweet.
test_en_df['preprocessed_text_length'] = [
    len(tweet.split(' ')) for tweet in test_en_df.preprocessed_text
]

In [78]:
# Keep tweets with more than 3 and fewer than 85 tokens.
token_counts = test_en_df['preprocessed_text_length']
mask_less = token_counts < 85
mask_more = token_counts > 3
test_en_df = test_en_df[mask_less & mask_more]

# Remove tweets whose preprocessed text duplicates an earlier row.
test_en_df = test_en_df.drop_duplicates(subset=['preprocessed_text'])

In [80]:
# Rows left after the length filter and de-duplication.
test_en_df.shape[0]

1671

In [81]:
# BERT preprocessing: prefix each tweet with the [CLS] marker and record how
# many tokens the base and large uncased BERT tokenizers produce for it.
from pytorch_pretrained_bert import BertTokenizer

test_en_df['preprocessed_text_bert'] = '[CLS] ' + test_en_df['preprocessed_text']

for model_name, length_column in (
    ('bert-base-uncased', 'preprocessed_text_bertbase_length'),
    ('bert-large-uncased', 'preprocessed_text_bertlarge_length'),
):
    # `tokenizer` is rebound on each pass, so it ends up holding the
    # bert-large tokenizer.
    tokenizer = BertTokenizer.from_pretrained(model_name)
    test_en_df[length_column] = [
        len(tokenizer.tokenize(sent)) for sent in test_en_df['preprocessed_text_bert']
    ]

In [83]:
# Give every label an integer id, numbered in the order `value_counts()`
# lists them.
# NOTE(review): the ids therefore depend on the label frequencies in this
# split - confirm they match the encoding used for the training data.
label_dict = {
    label: label_id
    for label_id, label in enumerate(test_en_df['label'].value_counts().keys())
}

test_en_df['information_label'] = [label_dict[label] for label in test_en_df['label']]

In [84]:
# Persist the prepared test split; the frame's index is written as the first column.
test_en_df.to_csv('bert_test.csv', index=True)

In [99]:
# Load `bert_train.csv`, using its first column as the index.
# NOTE(review): presumably the training-side counterpart of `bert_test.csv`,
# produced elsewhere - confirm.
ex = pd.read_csv('bert_train.csv', index_col=0)

In [101]:
# Class balance of the frame loaded from `bert_train.csv`.
ex['label'].value_counts()

fake    5762
real    2878
Name: label, dtype: int64

In [4]:
import pandas as pd
# Load the evaluation metrics from `metrics_csv.csv`.
# NOTE(review): this rebinds `test_df`, which earlier cells use for the raw
# tweet data; re-running those cells after this one would see the metrics
# frame instead.
test_df = pd.read_csv('metrics_csv.csv')

In [5]:
# Display the metrics frame loaded from `metrics_csv.csv`.
test_df

Unnamed: 0.1,Unnamed: 0,loss,accuracy,matthews coef,precision,recall,f1,aucroc
0,false,0.542329,0.788565,0.470423,0.842788,0.865945,0.854209,0.837358
1,real,0.542329,0.788565,0.470423,0.638202,0.594142,0.615385,0.837358
