In [None]:
# Make IPython display the value of every top-level expression in a cell,
# not only the last one (IPython's default setting is 'last_expr').
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
# IMPORTS 

from pathlib import Path 
import os 

import pandas as pd
import numpy as np
from collections import OrderedDict, namedtuple

In [None]:
# PATHS
# The 'data' directory is a sibling of the current working directory, i.e.
# <parent of the directory the notebook is run from>/data.
# Using .parent (instead of re-assembling parts[:-1]) keeps the path absolute
# even when the working directory is a filesystem root.
DATA_FOLDER = Path.cwd().parent / 'data'

Load data


In [None]:
import json 

json_file_path_train = DATA_FOLDER / 'Twibot-20/train.json'
json_file_path_val = DATA_FOLDER / 'Twibot-20/dev.json'
json_file_path_test = DATA_FOLDER / 'Twibot-20/test.json'


def _load_split(json_path, split_name):
    """Read one JSON split file into a flat DataFrame tagged with its split name.

    Args:
        json_path: path of the JSON file to read.
        split_name: value stored in the added 'split' column.

    Returns:
        The normalized DataFrame with an extra 'split' column.
    """
    # JSON is UTF-8 by specification; passing the encoding explicitly avoids
    # decode errors (or silently garbled text) where the platform default differs.
    with open(json_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)
    frame = pd.json_normalize(records)
    frame['split'] = split_name
    return frame


train_df = _load_split(json_file_path_train, 'train')
val_df = _load_split(json_file_path_val, 'val')
test_df = _load_split(json_file_path_test, 'test')

# merge the three splits, drop accounts without any tweet, index by account ID
df = (
    pd.concat([train_df, val_df, test_df], ignore_index=True)
    .dropna(subset=['tweet'])
    .set_index(keys='ID')
)

# split the dataframe in two: tweet-level data and account-level data
# tweet-level: one row per tweet (the per-account 'tweet' lists are exploded)
tweets_df = (
    df[['tweet', 'label', 'split']]
    .reset_index()
    .explode('tweet')
    .reset_index(drop=True)
    .rename(columns={"ID": "account_id"})
)

# account-level: everything except the tweets, one row per account
account_df = (
    df.drop('tweet', axis=1)
    .reset_index()
    .rename(columns={"ID": "account_id"})
)


DATA PROCESSING AND CLEANING  

In [None]:
# Third-party helpers used by the tweet cleaning functions below.
import emoji
from nltk.tokenize import TweetTokenizer

from ttp import ttp 
# Tweet parser whose result exposes the URLs found in a tweet (see further_process);
# include_spans=True presumably pairs every entity with its position -- TODO confirm
parser = ttp.Parser(include_spans=True)

from emot.core import emot
# Emoticon detector; further_process reads the 'value' entry of its result
emot_obj = emot()

import re 

# Tweet-aware tokenizer: preserve_case=False lowercases the tokens and
# reduce_len=True shortens long runs of a repeated character (per the NLTK docs)
tk = TweetTokenizer(reduce_len=True,preserve_case=False)

# Regular expressions for the entities that are swapped with a placeholder word.
# All of them are raw strings: sequences such as \S, \$ and \d are invalid escapes
# in a normal string literal and trigger Deprecation/SyntaxWarning on recent Pythons.
CASHTAG = r"(?<!\S)\$[A-Z]+(?:\.[A-Z]+)?(?!\S)"   # to check  (?:\.[A-Z]+)?
EMAIL = r"[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]"
MONEY = r"[$£][0-9]+(?:[.,]\d+)?[k+B]?|[0-9]+(?:[.,]\d+)?[k+B]?[$£]"
NUMBERS = r"(?<!\S)(?:[+\-]?\d+(?:%|(?:[,/.:-]\d+[+\-]?)?))"   # alternative: (?:[+\-]?\d+[,/.:-]\d+[+\-]?)
HASHTAG = r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"
HANDLES = r"(?:@[\w_]+)"

# The two lists are aligned: TO_REPLACE[i] is substituted with REPLACE_WITH[i].
# Order matters (e.g. cashtags and money amounts must be handled before plain numbers).
TO_REPLACE = [CASHTAG, EMAIL, MONEY, NUMBERS, HASHTAG, HANDLES]
REPLACE_WITH = [' stock ',' email ',' money ',' number ',' hashtag ',' username ']


def replace(word : str):
    """Normalize a single token, returning the list of tokens that replace it.

    Rules are tried in order and the first match wins:
      * non-ASCII token            -> [''] (removed later by the caller)
      * contains 'http'/'https' or a literal '.com' -> ['url']
      * contains a digit           -> ['number']
      * contains a laughter pattern (haha/ahah/jaja/ajaj) -> ['ahah']
      * contains a newline         -> ['']
      * contains a hyphen          -> the hyphen-separated parts
      * otherwise                  -> [word] unchanged

    Args:
        word: the token to normalize.

    Returns:
        A list of zero or more replacement tokens (may contain the empty string).
    """
    if not word.isascii():
        return ['']
    # the dot is escaped on purpose: an unescaped '.com' matches ANY character
    # followed by 'com' and would turn words like 'welcome' or 'become' into 'url'
    if re.search(r'http[s]?|\.com', word):
        return ['url']
    if re.search(r'\d', word):
        return ['number']
    if re.search(r'haha|ahah|jaja|ajaj', word):
        return ['ahah']
    if '\n' in word:
        return ['']
    if '-' in word:
        # split compound tokens on hyphens; a token made only of hyphens yields []
        return word.replace('-', ' ').split()
    return [word]
    

def further_process(sentence: str):
    """Turn a raw tweet into a list of cleaned tokens.

    Steps, in order: URLs found by the tweet parser, emoticons and emoji are
    each swapped with a placeholder word; the text is tokenized; every token
    goes through `replace`; empty tokens are discarded.

    Args:
        sentence: the tweet text.

    Returns:
        The list of processed tokens.
    """
    # URLs -> ' url ' (dict() de-duplicates the parser's url entries)
    parsed = parser.parse(sentence, html=False)
    for link in dict(parsed.urls):
        sentence = sentence.replace(link, ' url ')

    # emoticons -> ' emoticon ' (detected once, on the url-free text)
    for face in emot_obj.emoticons(sentence)['value']:
        sentence = sentence.replace(face, ' emoticon ')

    # emoji -> ' emoji '
    sentence = emoji.replace_emoji(sentence, ' emoji ')

    # tokenize, normalize each token and drop the empty results
    cleaned = []
    for token in tk.tokenize(sentence):
        cleaned.extend(piece for piece in replace(token) if piece != '')

    return cleaned
        
# Run the two-stage cleaning: regex placeholders first, then token-level processing
tweets_df['processed_tweet'] = (
    tweets_df['tweet']
    .replace(to_replace=TO_REPLACE, value=REPLACE_WITH, regex=True)
    .apply(further_process)
)

# persist the processed tweets
tweets_df.to_pickle(DATA_FOLDER / 'processed_dataset.pkl')

In [None]:
# Display the tweet-level table (one row per tweet) together with its processed tokens
tweets_df

DOWNLOAD TWITTER GLOVE EMBEDDINGS

In [None]:
import gensim
import gensim.downloader as gloader
from gensim.models import KeyedVectors

# NOTE: despite the .txt suffix, the cache file is written and read with binary=True
glove_model_cached_path = DATA_FOLDER / 'glove_vectors.txt'
glove_model_download_path = 'glove-twitter-200'
# set to True to download the model even if a cached copy exists (mainly for testing)
force_download = False

if force_download or not os.path.exists(glove_model_cached_path):
    # no usable cache: fetch the vectors and store them for the next run
    print('downloading glove embeddings...')
    emb_model = gloader.load(glove_model_download_path)

    print('saving glove embeddings to file')
    emb_model.save_word2vec_format(glove_model_cached_path, binary=True)

else:
    # reuse the vectors saved by a previous run
    print('found cached glove vectors in data folder, retrieving the file...')
    emb_model = KeyedVectors.load_word2vec_format(glove_model_cached_path, binary=True)
    print('vectors loaded')

In [None]:
# Distinct tokens over all processed tweets (token lists are flattened first)
unique_words : list = (
    tweets_df['processed_tweet']
    .explode()
    .unique()
    .tolist()
)

import gensim
import random 

# The annotation is a string so that it is not evaluated when the function is defined.
def check_OOV_terms(embedding_model: "gensim.models.keyedvectors.KeyedVectors", unique_words):
    """Determine which of the given words are out-of-vocabulary for the embedding model.

    Args:
        embedding_model: object supporting `embedding_model[word]` lookups, or None.
        unique_words: list of the distinct words of the dataset.

    Returns:
        The list of words that could not be looked up, in input order. An empty
        list is returned (after printing a warning) when the model is None.
    """
    oov_words = []

    if embedding_model is None:
        print('WARNING: empty embeddings model')
        return oov_words

    for word in unique_words:
        try:
            embedding_model[word]
        # KeyError: the word is not in the vocabulary.
        # TypeError: presumably raised for non-string entries (e.g. NaN) -- such
        # entries are reported as OOV too, as the original catch-all did.
        except (KeyError, TypeError):
            oov_words.append(word)

    total_words = len(unique_words)
    # guard against an empty word list (would otherwise divide by zero)
    oov_percentage = (float(len(oov_words)) / total_words) * 100 if total_words else 0.0

    print("Total number of unique words in dataset:", total_words)
    print("Total OOV terms: {0} which is ({1:.2f}%)".format(len(oov_words), oov_percentage))
    # random.sample raises ValueError if asked for more items than available
    print("Some OOV terms:", random.sample(oov_words, min(10, len(oov_words))))

    return oov_words

# Collect the dataset words that have no GloVe vector and show them
oov = check_OOV_terms(embedding_model=emb_model, unique_words=unique_words)
print(oov)

In [None]:
# Sanity check of the loaded embeddings: nearest neighbours of the contraction token "'ve"
emb_model.most_similar("'ve")

CUSTOM DATA HANDLING  

In [None]:
from torch.utils.data import Dataset
import torch.nn.utils.rnn as rnn

import torch 

# Immutable bundle of the two lookup tables (word -> id, id -> word) and the word list
Vocab = namedtuple('Vocabulary', 'word2int int2word unique_words')

class TwitterDataset(Dataset):
    """Map-style dataset exposing the processed tweets and labels of a DataFrame.

    Args:
        dataframe: frame with a 'processed_tweet' column and a 'label' column.
    """

    def __init__(self, dataframe: pd.DataFrame):
        self.tweet = dataframe['processed_tweet']
        self.label = dataframe['label']

    def __len__(self):
        """Number of examples."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return the example at position `idx` as {'tweet': ..., 'label': ...}."""
        # .iloc is positional: plain series[idx] is a label lookup and fails for
        # any frame whose index is not exactly 0..n-1 (e.g. a filtered frame).
        return {
            'tweet': self.tweet.iloc[idx],
            'label': self.label.iloc[idx],
            }

class TwitterDataManager():
    """Holds the train/val/test datasets and the vocabulary built from them.

    Args:
        dataframe: frame with 'processed_tweet', 'label' and 'split' columns;
            'split' takes the values 'train', 'val' and 'test'.
        device: device on which the collated tensors are created.
    """

    def __init__(self, dataframe : pd.DataFrame, device ):

        self.device = device
        # filled in by build_vocab(); initialised here so that the guard in
        # numericalize() fails with its explanatory message, not an AttributeError
        self.vocab = None

        self.dataset = dataframe.copy(deep=True)
        self.train_ds = self._make_split('train')
        self.val_ds = self._make_split('val')
        self.test_ds = self._make_split('test')

    def _make_split(self, split_name):
        """Build the TwitterDataset holding the rows of the given split."""
        rows = self.dataset[self.dataset['split'] == split_name]
        return TwitterDataset(rows.reset_index(drop=True))

    def custom_collate(self, batch):
        """Collate a list of examples into padded tensors.

        Returns:
            dict with 'text' (padded token ids, batch first), 'label' and
            'lenght' (un-padded length of every tweet).
        """
        tweet_lengths = torch.tensor([len(example['tweet']) for example in batch], device=self.device)

        numerized_tweets = [self.numericalize(example['tweet']) for example in batch]
        padded_tweets = rnn.pad_sequence(numerized_tweets, batch_first = True, padding_value = self.vocab.word2int['<pad>']).to(self.device)

        labels = torch.tensor([example['label'] for example in batch],device=self.device)

        return {
            'text': padded_tweets,
            'label': labels,
            # NOTE(review): key is misspelled ('lenght'); kept because consumers may rely on it
            'lenght': tweet_lengths
        }

    def numericalize(self, something):  #TODO something
        """Convert a tweet to token ids -- not implemented yet.

        Raises:
            AssertionError: if build_vocab() has not been called.
            NotImplementedError: always, until the method is implemented.
        """
        assert self.vocab is not None, "you have to build the vocab first, call build_vocab method to do it"
        raise NotImplementedError()

    def build_vocab(self, something=None): #TODO something
        """Build the vocabulary from every processed tweet and store it in self.vocab.

        '<pad>' is always id 0 and '<unk>' id 1; dataset words follow from id 2 in
        order of first appearance.

        Args:
            something: unused placeholder, optional.
        """
        # explode() yields NaN for empty token lists -- those are not words
        dataset_words = self.dataset['processed_tweet'].explode().dropna().unique().tolist()

        word2int = OrderedDict({'<pad>':0,'<unk>':1})
        int2word = OrderedDict({0:'<pad>',1:'<unk>'})

        for word in dataset_words:
            if word in word2int:
                # never let a dataset token overwrite a reserved id
                continue
            index = len(word2int)
            word2int[word] = index
            int2word[index] = word

        # special tokens first, so that unique_words[i] == int2word[i]
        unique_words = list(word2int)
        self.vocab = Vocab(word2int,int2word,unique_words)

        print(f'the number of unique words is {len(unique_words)}')

    def build_emb_matrix(self, something=None): #TODO something
        """Build the embedding matrix -- not implemented yet."""
        raise NotImplementedError()

    def getDataloader(self, split : str):
        """Return the DataLoader of the given split -- not implemented yet."""
        raise NotImplementedError()

In [None]:

Vocab = namedtuple('Vocabulary', ('word2int', 'int2word', 'unique_words'))

def build_vocab(unique_words : list[str]):
    """
        Create the word -> id and id -> word lookup tables and wrap them, together
        with the word list, in a Vocabulary tuple.

        Ids start at 1 (0 is left free for a reserved token) and follow the order
        of `unique_words`.
    """
    # pair every word with its 1-based position once, then build both directions
    numbered = list(enumerate(unique_words, start=1))
    word2int = OrderedDict((word, position) for position, word in numbered)
    int2word = OrderedDict(numbered)

    return Vocab(word2int, int2word, unique_words)