In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import string
import re
from collections import Counter
from tqdm import tqdm
import time
tqdm.pandas()


# Fetch the NLTK resources used below: the stop-word list (stop_words),
# the punkt models (word_tokenize) and WordNet (WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mihat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mihat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mihat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [90]:
# Tokens ignored by tokenize(): NLTK's English stop words plus every ASCII
# punctuation character.
stop_words = set(stopwords.words('english') + list(string.punctuation))
stop_words.add('rt')  # add word rt (meaning retweet) to stop words

# Read the labelled dataset once, from the current working directory.
# (The file used to be parsed twice in this cell; the second read simply
# replaced the first.)
cwd = os.getcwd()
df = pd.read_csv(os.path.join(cwd, 'full_dataset_all_labels.csv'))
#df = df.sample(10000)

In [91]:
def print_some_texts(columns, df, text_idxs=None):
    """Print the values of the given columns for a few sample rows.

    Args:
        columns (list): Column names of ``df`` to print, in order, for each row.
        df (pandas.DataFrame): Frame to read the rows from.
        text_idxs (list, optional): Row positions (as used by ``.iloc``) to
            print. Defaults to the hand-picked positions used originally,
            which require a frame with at least 71921 rows.

    Raises:
        IndexError: If a position is out of bounds for ``df``.
    """
    if text_idxs is None:
        text_idxs = [47, 7240, 7241, 8013, 14500, 16500, 16304, 18300,  21750, 34036, 45159, 71920]
    for i in text_idxs:
        for column in columns:
            print(df[column].iloc[i])
#print_some_texts(['text'], df)

def tokenize(text):
    """Clean ``text`` and return the tokens worth keeping.

    The text is passed through preprocess_text and NLTK's word_tokenize.
    A token is kept only if it contains at least one ASCII letter, is not
    in stop_words and is longer than two characters.

    Args:
        text (str): Raw text.

    Returns:
        list: Surviving tokens, in their original order.
    """
    has_letter = re.compile('[a-zA-Z]')
    candidates = word_tokenize(preprocess_text(text))
    return [
        tok
        for tok in candidates
        if has_letter.search(tok) and tok not in stop_words and len(tok) > 2
    ]
    

def preprocess_text(text):
    """Normalise a raw tweet/comment to lowercase ASCII letters and single spaces.

    Steps, in order: drop URLs, Twitter handles, the ``&amp;`` entity and the
    ``<USER>`` / ``<URL>`` masking placeholders; lowercase; turn every
    non-letter character into a space; collapse whitespace runs.

    Args:
        text (str): Raw text.

    Returns:
        str: Cleaned text. Leading/trailing whitespace is collapsed to a
        single space rather than stripped.
    """
    text = re.sub(r"http\S+", " ", text)            # remove urls
    # Handles may contain underscores; without "_" in the class, "@john_doe"
    # left "doe" behind as an ordinary word.
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)      # remove twitter handle
    text = re.sub("&amp;", "", text)                # &amp; is the HTML entity for ampersand
    text = re.sub('<USER>', '', text)               # masked user mentions
    text = re.sub('<URL>', '', text)                # masked urls
    text = text.lower()                             # convert to lowercase
    text = re.sub('[^a-z]', ' ', text)              # punctuation, digits, symbols -> space
    # Only letters and whitespace remain here, so the former "remove tags"
    # substitution could never match and has been dropped.
    text = re.sub(r"\s+", " ", text)                # collapse whitespace runs
    return text
    
    
def stemming(tokens):
    """Return the English Snowball stem of every token, preserving order.

    Args:
        tokens (iterable): Word tokens.

    Returns:
        list: One stem per input token.
    """
    snowball = SnowballStemmer("english")
    return list(map(snowball.stem, tokens))

def lemmatizing(tokens):
    """Return the WordNet lemma of every token, preserving order.

    Args:
        tokens (iterable): Word tokens.

    Returns:
        list: One lemma per input token.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, tokens))

In [92]:
# Clean every raw text with preprocess_text and keep the result in a new column.
df['preprocessed_text']=df['text'].apply(preprocess_text)

In [93]:
# Append ", this is <label>" to each cleaned text so that the label's tokens
# are the last ones in the sequence; the BERT cells below average the
# embeddings of exactly those trailing tokens.
df['appended'] = df['preprocessed_text']+', this is '+df['label']

# BERT

## DEFAULT BERT

In [94]:
from transformers import BertTokenizer, BertModel
import torch

In [95]:
# Load the pretrained uncased BERT base model and its matching tokenizer.
# output_hidden_states=True is required because get_bert_embeddings reads the
# per-layer hidden states from the model output.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [96]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get the final-layer embedding of every token from an embedding model

    Args:
        tokens_tensor (obj): Torch tensor with the token ids of one text;
            this notebook passes shape [1, n_tokens]
        segments_tensors (obj): Torch tensor of the same shape, forwarded
            to the model as its second positional argument
        model (obj): Embedding model to generate embeddings from the two
            tensors; its output must expose the tuple of hidden states at
            index 2 (i.e. it was loaded with output_hidden_states=True)

    Returns:
        numpy.ndarray: float64 array of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token

    """
    # Gradient calculation is disabled: only a forward pass is needed
    with torch.no_grad():
        # NOTE(review): in recent `transformers` releases the second positional
        # parameter of BertModel.forward appears to be attention_mask, not
        # token_type_ids, so the all-ones tensor would act as an attention
        # mask here. Confirm against the installed version before relying on
        # segment ids.
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Dropping the batch dimension (only removed when it has size 1)
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Convert directly to numpy instead of going through nested Python lists;
    # the cast keeps the float64 dtype the list round trip used to produce.
    return token_embeddings.detach().cpu().numpy().astype(np.float64)

In [97]:
def bert_text_preparation(text, tokenizer):
    """Turn one text into the inputs expected by the BERT cells below.

    The text is wrapped as "[CLS] <text> [SEP]", split into tokens by
    ``tokenizer`` and mapped to token ids. Every token gets segment
    id 1. Both id lists are returned as tensors with a leading
    dimension of size 1.

    Args:
        text (str): Text to be converted
        tokenizer (obj): Object providing ``tokenize`` and
            ``convert_tokens_to_ids``

    Returns:
        pandas.Series: Three entries, in this order:
            'tokenized_text' (list of tokens),
            'tokens_tensor' (torch tensor of token ids),
            'segments_tensors' (torch tensor of segment ids)

    """
    field_names = ['tokenized_text', 'tokens_tensor', 'segments_tensors']

    word_pieces = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    piece_ids = tokenizer.convert_tokens_to_ids(word_pieces)

    # Wrap each id list in an outer list so the tensors get a leading dimension of 1
    ids_tensor = torch.tensor([piece_ids])
    segment_tensor = torch.tensor([[1] * len(piece_ids)])

    return pd.Series([word_pieces, ids_tensor, segment_tensor], index = field_names)

In [98]:
# Number of BERT tokens each label is split into. The two special tokens
# ([CLS] and [SEP]) added by bert_text_preparation are not counted.
labels = df['label'].unique()
dict_labels_len = {}
for label in labels:
    prepared = bert_text_preparation(label, tokenizer)
    # Access by label: integer access on a string-indexed Series (x[0]) is
    # deprecated positional fallback in pandas.
    dict_labels_len[label] = len(prepared['tokenized_text']) - 2

In [99]:
# Tokenize every "appended" text. bert_text_preparation returns a three-entry
# Series per row, which is spread over the three new columns.
df[['tokenized_text', 'tokens_tensor', 'segments_tensors']] = df['appended'].progress_apply(bert_text_preparation, tokenizer = tokenizer)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77668/77668 [02:27<00:00, 526.68it/s]


In [100]:
# Keep only rows whose token sequence has at most MAX_BERT_TOKENS entries
# (512 is the sequence-length limit of bert-base-uncased).
MAX_BERT_TOKENS = 512
token_counts = df['tokens_tensor'].apply(lambda x: x.size()[1])

# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not raise SettingWithCopyWarning. No temporary
# length column is added, so nothing has to be dropped in place afterwards.
df = df[token_counts <= MAX_BERT_TOKENS].copy()

In [101]:
# Run BERT on every row, one text per forward pass, and store the per-token
# embedding matrix. This is by far the slowest cell: the recorded run took
# about 6.5 hours for 77194 rows.
df['bert_emmbeding'] = df.progress_apply(lambda x: get_bert_embeddings(x['tokens_tensor'], x['segments_tensors'], model), axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77194/77194 [6:31:33<00:00,  3.29it/s]


In [102]:
# Sanity check of the slice used in the next cell: take the dict_labels_len[label]
# tokens that sit just before the final token, which should be the label's own tokens.
df.progress_apply(lambda x: x['tokenized_text'][- dict_labels_len[x['label']]-1:-1], axis = 1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77194/77194 [00:09<00:00, 8254.39it/s]


0         [abuse]
1         [abuse]
2         [abuse]
3         [abuse]
4         [abuse]
           ...   
77663    [vulgar]
77664    [vulgar]
77665    [vulgar]
77666    [vulgar]
77667    [vulgar]
Length: 77194, dtype: object

In [103]:
# Replace the per-token embedding matrix by the average of the label tokens'
# embeddings (same slice as checked in the previous cell).
# NOTE(review): this overwrites its own input column, so the cell is not
# re-runnable without recomputing 'bert_emmbeding' first.
df['bert_emmbeding'] = df.progress_apply(lambda x: np.average(x['bert_emmbeding'][- dict_labels_len[x['label']]-1:-1], axis = 0), axis = 1)
df.sample()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77194/77194 [01:51<00:00, 692.84it/s]


Unnamed: 0,text,label,preprocessed_text,appended,tokenized_text,tokens_tensor,segments_tensors,bert_emmbeding
23753,"""\n\n Stop your fucking spamming \n\nI have to...",insult,stop your fucking spamming i have told you an...,stop your fucking spamming i have told you an...,"[[CLS], stop, your, fucking, spa, ##mming, i, ...","[[tensor(101), tensor(2644), tensor(2115), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.14338752627372742, -0.4059569239616394, -0...."


## BERT WITH SENTENCE EMBEDDINGS

In [104]:
from sentence_transformers import SentenceTransformer
# NOTE(review): this rebinds `model`, which held the BertModel used by the
# cells above; re-running those cells after this one would pass the sentence
# encoder to get_bert_embeddings.
model = SentenceTransformer('stsb-mpnet-base-v2')

In [105]:
# Encode the cleaned texts (without the appended label) in a single call.
sentences = df['preprocessed_text'].to_numpy()
sentence_embeddings = model.encode(sentences)

In [106]:
# Store one embedding (as a plain Python list) per row.
df['bert_sentence_emm'] = sentence_embeddings.tolist()

# ELMO

In [2]:
import tensorflow as tf
import tensorflow_hub as hub

In [3]:
# Load ELMo v3 from TensorFlow Hub and keep its 'default' signature, which is
# called with a string tensor in the batching loop below.
elmo = hub.load("https://tfhub.dev/google/elmo/3").signatures['default']

In [4]:
# NOTE(review): 'first.pkl' is not written anywhere in the visible notebook;
# presumably it is a snapshot of `df` taken after the BERT cells — confirm and
# add the corresponding to_pickle call. Only unpickle files you trust.
df = pd.read_pickle('first.pkl')

In [5]:
# Drop the raw text and the BERT input columns; the ELMo cells only read
# 'preprocessed_text' and 'appended'.
df = df.drop(['tokens_tensor', 'segments_tensors', 'tokenized_text', 'text'], axis = 1)

In [6]:
# Split the frame into N_ELMO_BATCHES consecutive, nearly equal-sized batches
# so ELMo can be run batch by batch. Row positions are split and then used
# with .iloc, because np.array_split applied to a DataFrame relies on the
# deprecated DataFrame.swapaxes.
N_ELMO_BATCHES = 1000
dfs = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), N_ELMO_BATCHES)]

In [7]:
# Preview ten random rows of the frame that will be fed to ELMo.
df.sample(10)

Unnamed: 0,label,preprocessed_text,appended,bert_emmbeding,bert_sentence_emm
31681,obscene,continuous vandalism by turkish users and sham...,continuous vandalism by turkish users and sham...,"[0.520959734916687, -0.18024805188179016, -0.2...","[0.06637714058160782, -0.01583930104970932, -0..."
6100,harassment,white boys are taking over whitepower,"white boys are taking over whitepower , this i...","[0.3563718795776367, 0.1874101459980011, -0.25...","[-0.05086810886859894, 0.3138611316680908, 0.0..."
29717,insult,no you have not explained how x is y is someh...,no you have not explained how x is y is someh...,"[0.05588417500257492, -0.3176025152206421, -0....","[0.019041109830141068, 0.16298703849315643, 0...."
23029,insult,i m sorry i missed the whole personal attack i...,i m sorry i missed the whole personal attack i...,"[0.032701119780540466, -0.5897225737571716, -0...","[0.08567946404218674, 0.030805688351392746, 0...."
36257,obscene,what a cowardly jew lmfao stick to picking on ...,what a cowardly jew lmfao stick to picking on ...,"[0.042388223111629486, 0.32024621963500977, -0...","[0.08007101714611053, 0.0718226283788681, -0.0..."
1338,abuse,hi dave fuck you and fuck your company have a ...,hi dave fuck you and fuck your company have a ...,"[0.9103267192840576, 1.017844557762146, 0.3483...","[-0.029665006324648857, -0.21877291798591614, ..."
26896,insult,i was speaking the truth u son of a bitch go f...,i was speaking the truth u son of a bitch go f...,"[0.2123800367116928, -0.02327074483036995, 0.0...","[0.04634561017155647, 0.12414926290512085, 0.0..."
7844,harassment,fucking mexicans theyre trying to get me drunk...,fucking mexicans theyre trying to get me drunk...,"[0.2118196040391922, -0.35716161131858826, 0.1...","[-0.05534539371728897, 0.1591799110174179, 0.0..."
32764,obscene,dear stuttering self righteous self absorbed i...,dear stuttering self righteous self absorbed i...,"[0.09141566604375839, 0.2472246289253235, 0.10...","[0.10966053605079651, 0.19132399559020996, -0...."
54986,offensive,the internet will make you think these bitches...,the internet will make you think these bitches...,"[0.42969536781311035, 0.41116148233413696, -0....","[0.20444633066654205, 0.12598277628421783, -0...."


In [None]:
# Compute the ELMo sentence embedding ("default" output) batch by batch.
# Each batch's result is stored back into `dfs` and the batches are merged
# into `df` at the end; previously the embeddings only lived on the loop
# variable and never reached `df`.
# The word-level output (`elmo(...)["elmo"]`, i.e. `embeddings_words`) is
# intentionally not computed here.
for i, small in enumerate(dfs):
    start = time.time()
    if len(small) > 0:  # an empty batch has nothing to encode
        lst = small['preprocessed_text'].tolist()
        embeddings_sent = elmo(tf.constant(lst))["default"]
        # assign() returns a new frame, so no slice of `df` is written to.
        dfs[i] = small.assign(elmo_sentence=embeddings_sent.numpy().tolist())
    print(f"Čas za to iteracijo, {time.time()-start}, smo na: {(i+1)/len(dfs)*100}%")

# Batches are consecutive slices of `df`, so concatenating restores row order.
df = pd.concat(dfs)

Čas za to iteracijo, 18.73149538040161, smo na: 0.1%
Čas za to iteracijo, 26.078139066696167, smo na: 0.2%
Čas za to iteracijo, 8.836330890655518, smo na: 0.3%
Čas za to iteracijo, 78.21854329109192, smo na: 0.4%
Čas za to iteracijo, 17.50725793838501, smo na: 0.5%
Čas za to iteracijo, 161.9465880393982, smo na: 0.6%
Čas za to iteracijo, 18.48947525024414, smo na: 0.7000000000000001%
Čas za to iteracijo, 8.145770072937012, smo na: 0.8%
Čas za to iteracijo, 29.316986560821533, smo na: 0.8999999999999999%
Čas za to iteracijo, 59.076897382736206, smo na: 1.0%
Čas za to iteracijo, 76.24508857727051, smo na: 1.0999999999999999%
Čas za to iteracijo, 59.525686740875244, smo na: 1.2%
Čas za to iteracijo, 42.69805669784546, smo na: 1.3%
Čas za to iteracijo, 31.684549570083618, smo na: 1.4000000000000001%
Čas za to iteracijo, 33.45064568519592, smo na: 1.5%
Čas za to iteracijo, 43.79299783706665, smo na: 1.6%
Čas za to iteracijo, 20.168543338775635, smo na: 1.7000000000000002%
Čas za to iteracij

In [None]:
# NOTE(review): `embeddings_words` is not assigned by any active (uncommented)
# code in this notebook, so this raises NameError on a fresh run.
embeddings_words.numpy().shape

In [None]:
# Shape of the sentence embeddings of the most recently processed batch.
embeddings_sent.numpy().shape

Keep only the embedding of the last word of each text, i.e. the label appended as ", this is <label>" (e.g. "this is racist").

In [None]:
# Store the per-token ELMo embeddings, then keep only the vector at position idx-1.
# NOTE(review): `embeddings_words` is not assigned by any active code in this
# notebook, and the 'idx' column read here is only created in a later cell,
# so this cell cannot run in notebook order.
df['elmo_word'] = embeddings_words.numpy().tolist()
df['elmo_word'] = df.progress_apply(lambda x: x['elmo_word'][x['idx']-1], axis = 1)

In [None]:
# NOTE(review): `embeddings_sent` is reassigned on every pass of the batching
# loop, so at this point it holds the embeddings of the last batch only and
# will not match len(df) unless there is a single batch.
df['elmo_sentence'] = embeddings_sent.numpy().tolist()

In [None]:
# idx: number of whitespace-separated words in the "appended" text.
df['idx'] = df.progress_apply(lambda x: len(x['appended'].split()), axis = 1)
# shape: number of entries stored in 'elmo_word' for the row.
df['shape'] = df.progress_apply(lambda x: len(x['elmo_word']), axis = 1)
# Clamp idx so it never exceeds the number of stored entries.
df['idx'] = df.progress_apply(lambda x: min(x['idx'], x['shape']), axis = 1)

In [None]:
# Look at the last two rows to check the helper columns.
df.tail(2)

In [None]:
# Remove the temporary helper columns. No cell creates an 'elmo_words' column
# (the word-level column is named 'elmo_word'), so a strict drop raised
# KeyError; errors='ignore' drops whichever of these columns exist.
df = df.drop(columns=['elmo_words', 'idx', 'shape'], errors='ignore')

In [None]:
# Final look at the frame before it is saved.
df

In [None]:
# Persist the frame, including the embedding columns, for later notebooks.
df.to_pickle('elmobert.pkl')