# Initial setup

In [None]:
# Display the value of every expression statement in a cell, not only the
# last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
#import all libraries and modules 

import os

import requests
import zipfile
import random
import string 

import torch

import numpy as np
import pandas as pd

import gensim
import gensim.downloader as gloader
from gensim.models import KeyedVectors

import time 
import logging

from collections import OrderedDict, namedtuple

import json
from pandas import json_normalize
	
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
# Report where the notebook is running.
print(f"Current work directory: {os.getcwd()}")

# Folder (inside the current working directory) where all data is stored.
data_folder = os.path.join(os.getcwd(), "data")

# exist_ok=True keeps the cell safe to re-run and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(data_folder, exist_ok=True)

In [None]:
# Fix every random seed to achieve reproducible results
torch.manual_seed(0)
random.seed(0)
np.random.seed(0)
# Turn off cuDNN auto-tuning and request deterministic cuDNN kernels
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Set up file logging
log = logging.getLogger('logger')
log.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object on every call, so simply
# re-running this cell would attach one more FileHandler each time and every
# record would be written to the file several times. Reuse the handler that
# already targets the log file when there is one.
_log_path = 'data/log.txt'
fh = next(
    (
        handler
        for handler in log.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == os.path.abspath(_log_path)
    ),
    None,
)
if fh is None:
    fh = logging.FileHandler(_log_path)
    log.addHandler(fh)
fh.setLevel(logging.DEBUG)

In [None]:
json_file_path_train = "data/train.json"
json_file_path_val = "data/dev.json"
json_file_path_test = "data/test.json"


def _load_split(json_path):
    """Read the JSON file at *json_path* and flatten it into a DataFrame.

    The file is opened explicitly as UTF-8 so that the text is decoded the
    same way on every platform instead of with the locale's default encoding.
    The parsed JSON object is not kept around once the DataFrame is built.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        return json_normalize(json.load(handle))


train = _load_split(json_file_path_train)
val = _load_split(json_file_path_val)
test = _load_split(json_file_path_test)

# Dataset inspection

In [None]:
# Preview the first three rows of the training split
train.head(3)

In [None]:
# Number of rows in each split
print(f"Training set size: {len(train)}")
print(f"Validation set size: {len(val)}")
print(f"Test set size: {len(test)}")

In [None]:
# Number and names of the columns of the training split
print(f"N° of features per account: {len(train.columns)}")
print(f"Features:\n{list(train.columns)}") #TODO: remove everything before the dot and rename the columns?

In [None]:
# Rows whose 'tweet' value is missing, then summary statistics of len() of the
# non-missing 'tweet' values.
# NOTE(review): np.round looks redundant here since len() already returns integers.
print(f"N° of accounts without tweets: {train['tweet'].isna().sum()}")
print(f"N° of tweets per account analytics:\n{np.round(train['tweet'].dropna().apply(len)).describe()}")


# Tweets preprocessing

In [None]:
import unidecode
import nltk
import emoji
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

# Fetch the NLTK resources used by the preprocessing functions: WordNet (for
# WordNetLemmatizer) and the stop-word lists (for stopwords.words).
nltk.download('wordnet', quiet=True) 
nltk.download('stopwords', quiet=True)

def lemmatize_and_remove_non_ascii(sentence: list[str], pos: str = "n") -> list[str]:
    """Lemmatize the ASCII-only tokens of *sentence* and drop all the others.

    Args:
        sentence: tokens of one tweet (a list of words, not a raw string).
        pos: WordNet part-of-speech tag forwarded to the lemmatizer. The
            default "n" (noun) is what the lemmatizer uses when no tag is
            given, so existing callers are unaffected.
            NOTE(review): with the noun tag, verbs and adjectives are mostly
            returned unchanged; this is presumably why lemmatization looks
            like it "doesn't work". Try pos="v" to confirm.

    Returns:
        The lemmatized tokens, in their original order, without any token
        that contains a non-ASCII character.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos) for word in sentence if word.isascii()]

def stemm_and_remove_non_ascii(sentence: list[str]) -> list[str]:
    """Stem the ASCII-only tokens of *sentence* and drop all the others.

    Args:
        sentence: tokens of one tweet (a list of words, not a raw string).

    Returns:
        The Porter-stemmed tokens, in their original order, without any token
        that contains a non-ASCII character.
    """
    ps = PorterStemmer()
    return [ps.stem(word) for word in sentence if word.isascii()]

def preprocess_pipeline(sentence: str) -> list[str]:
    """Apply the standard preprocessing and return the tweet as a list of tokens.

    Steps, in order: lowercase, split on whitespace, replace every token
    containing 'http' with 'https', replace tokens that are a single emoji
    with their textual description, transliterate to ASCII, strip punctuation.

    Args:
        sentence: the raw tweet text.

    Returns:
        One token per whitespace-separated word of the input. Tokens that
        consisted only of punctuation are kept as empty strings.
    """
    # Deletion table for every punctuation character; built once per call
    # instead of once per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # lowercase everything, then split on whitespace (this also discards
    # repeated/leading/trailing spaces)
    tokens = sentence.lower().split()

    #TODO: decide what to replace special tokens with (@, links, RT, emoticons...)
    # replace every token that contains a link with 'https'
    tokens = ['https' if 'http' in word else word for word in tokens]

    # replace tokens that are exactly one emoji with their description
    # NOTE(review): the description presumably contains ':' and '_', which the
    # punctuation step below removes as well - confirm this is intended.
    tokens = [emoji.demojize(word) if emoji.is_emoji(word) else word for word in tokens]

    # transliterate any Unicode string into its closest ASCII representation
    tokens = [unidecode.unidecode(word) for word in tokens]

    # remove all punctuation characters
    tokens = [word.translate(strip_punctuation) for word in tokens]

    return tokens

def preprocess_type1(sentence:str):
    """Run only the standard preprocessing pipeline on a tweet and hand back its list of words"""
    return preprocess_pipeline(sentence)

def preprocess_type2(sentence:str):
    """Standard preprocessing followed by lemmatization.

    The tweet goes through preprocess_pipeline, then words that still contain
    non-ASCII characters are discarded and the remaining ones are lemmatized.
    Returns the resulting list of words."""
    tokens = preprocess_pipeline(sentence)

    #TODO: lemmatization doesn't work
    return lemmatize_and_remove_non_ascii(tokens)

def preprocess_type3(sentence: str):
    """
        Apply standard preprocessing, remove stop words and non-ASCII words, then stem.

        Args:
            sentence: the raw tweet text.

        Returns:
            The list of stemmed tokens that are neither English stop words
            nor contain non-ASCII characters.
    """
    tokens = preprocess_pipeline(sentence)
    stop_words = set(stopwords.words('english'))
    # Stop words must be filtered BEFORE stemming: the Porter stemmer rewrites
    # many of them (e.g. "this" -> "thi", "was" -> "wa"), so comparing stems
    # against the stop-word list lets those words through.
    content_tokens = [word for word in tokens if word not in stop_words]
    return stemm_and_remove_non_ascii(content_tokens)

In [None]:
# Show the first element of the first row's 'tweet' value, before and after
# preprocess_type1.
print(f"Original tweet:\n{train.loc[0, 'tweet'][0]}")
print(f"Processed tweet:\n{preprocess_type1(train.loc[0, 'tweet'][0])}") #TODO: are emoticons included in GloVe?

In [None]:
def _one_row_per_tweet(accounts):
    """Return a new frame with exactly one non-missing tweet per row."""
    # Remove all accounts without tweets, then give every tweet its own row
    exploded = accounts.dropna(subset=['tweet']).explode('tweet')
    # explode() turns an empty list into a single NaN row; drop those too so
    # the preprocessing below never receives a non-string value.
    return exploded.dropna(subset=['tweet']).reset_index(drop=True)


train = _one_row_per_tweet(train)
val = _one_row_per_tweet(val)
test = _one_row_per_tweet(test)

# Preprocess all tweets
train['tweet'] = train['tweet'].apply(preprocess_type1)
val['tweet'] = val['tweet'].apply(preprocess_type1)
test['tweet'] = test['tweet'].apply(preprocess_type1)

train.head(3)

# Vocabulary

In [None]:
Vocab = namedtuple('Vocabulary',['word2int','int2word','unique_words'])

def build_vocab(unique_words : list[str]):
    """
        Create the word -> index and index -> word lookup tables and bundle
        them, together with the word list itself, into a Vocabulary tuple.
    """
    # indices start at 1 because 0 is left free for the tag token
    numbered = list(enumerate(unique_words, start=1))

    word2int = OrderedDict((word, index) for index, word in numbered)
    int2word = OrderedDict(numbered)

    return Vocab(word2int, int2word, unique_words)

In [None]:
# Flatten the per-tweet word lists of the training split and keep each distinct word once.
# NOTE(review): only `train` is used here although the message below says "entire dataset".
unique_words = train['tweet'].explode().unique().tolist()

print('The number of unique words in the entire dataset is:', len(unique_words))

In [None]:
# Build the word <-> index lookup tables from the collected unique words
vocab = build_vocab(unique_words)

In [None]:
def build_indexed_dataframe(df: pd.DataFrame, vocabulary=None, oov_index=None):
    """Add an ``idx_tweet`` column holding every tweet as a list of word indices.

    Args:
        df: dataframe whose ``tweet`` column contains lists of words. It is
            modified in place.
        vocabulary: object exposing a ``word2int`` mapping. When omitted, the
            module-level ``vocab`` is used.
        oov_index: value stored for words missing from ``word2int``. The
            default ``None`` is what a plain ``dict.get`` lookup yields.

    Returns:
        The same dataframe object, with the ``idx_tweet`` column added.
    """
    if vocabulary is None:
        vocabulary = vocab
    word2int = vocabulary.word2int

    df['idx_tweet'] = df.tweet.apply(
        lambda words: [word2int.get(word, oov_index) for word in words]
    )

    return df

def check_dataframe_numberization(df,vocab):
    """
       Verify that decoding the numberized tweets through the reverse mapping
       gives back the original tweets. Prints a confirmation when it does and
       raises an Exception when it does not.
    """
    original = df['tweet']
    decode = vocab.int2word.get

    rebuilt = df.idx_tweet.apply(lambda indices: [decode(index) for index in indices])

    if not original.equals(rebuilt):
        raise Exception('There are problems with Dataset numberization')

    print('CHECK COMPLETED: All right with dataset numberization')

# Add the idx_tweet column to every split
train = build_indexed_dataframe(train)
val = build_indexed_dataframe(val)
test = build_indexed_dataframe(test)

# Sanity check on the training split: decoding the indices must give back the words
check_dataframe_numberization(train,vocab)

In [None]:
# Preview the first three rows of the training split
train.head(3)

In [None]:
from torchtext.data import BucketIterator
from torch.utils.data import Dataset

class DataframeDataset(Dataset):
    """Dataset view over a dataframe, yielding one row per item.

    Every item is a dict with the keys 'tweet', 'label' and 'ID', read from
    the dataframe columns 'idx_tweet', 'label' and 'ID' respectively.
    """

    def __init__(self, dataframe: pd.DataFrame):

        # work on a copy so the columns kept here are detached from the caller's frame
        dataframe = dataframe.copy()
        self.tweet = dataframe['idx_tweet']      #column of numberized tweets
        self.label = dataframe['label']       #column of categorical label
        self.id = dataframe['ID']          #column of ids

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, idx):
        # Positional access (.iloc): item i is the i-th row whatever the
        # dataframe index looks like, and an out-of-range position raises
        # IndexError, which is what lets plain iteration terminate.
        return {'tweet': self.tweet.iloc[idx],
                'label': self.label.iloc[idx],
                'ID': self.id.iloc[idx]}

def create_dataloaders(b_s : int, train: pd.DataFrame, val: pd.DataFrame, test: pd.DataFrame):     #b_s = batch_size
    """Wrap the three splits in DataframeDataset objects and build one
    BucketIterator per split, all with batch size b_s.

    Returns the train, validation and test iterators, in that order.
    """

    def tweet_length(example):
        # sort key: number of indices in the example's tweet
        return len(example['tweet'])

    # one DataframeDataset per split, in train / val / test order
    datasets = tuple(DataframeDataset(split) for split in (train, val, test))

    # Group similar length text sequences together in batches and return an iterator for each split.
    iterators = BucketIterator.splits(
        datasets,
        batch_sizes=(b_s, b_s, b_s),
        sort_key=tweet_length,
        repeat=True,
        sort=False,
        shuffle=True,
        sort_within_batch=True,
    )
    train_dataloader, val_dataloader, test_dataloader = iterators

    return train_dataloader, val_dataloader, test_dataloader

In [None]:
# Smoke test: build the iterators and look at one random example of the first training batch.
temp_batch_size = 128
train_dataloader, val_dataloader, test_dataloader = create_dataloaders(temp_batch_size, train, val, test)
random_idx = random.randint(0, temp_batch_size-1)
train_dataloader.init_epoch()

matching_rows = None
for batch in train_dataloader.batches:
    # A batch may hold fewer than temp_batch_size examples, so clamp the
    # position to the last available one instead of indexing past the end.
    sample = batch[min(random_idx, len(batch) - 1)]
    print("Tweet: ", sample['tweet'])
    print("Label: ", sample['label'])
    print("Account Id: ", sample['ID'], "\n")
    print("Corresponding row in the dataset:")
    matching_rows = train[train['idx_tweet'].apply(lambda x: x == sample['tweet'])]
    break  # only the first batch is inspected

# A bare expression is only displayed when it is a top-level statement of the
# cell; inside the loop body it would be evaluated and silently discarded.
matching_rows