In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import string
import re
from collections import Counter
from tqdm import tqdm
tqdm.pandas()


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mihat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mihat\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mihat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [90]:
cwd = os.getcwd()
df = pd.read_csv( os.path.join(cwd, 'full_dataset_all_labels.csv'))
stop_words=set(stopwords.words('english') + list(string.punctuation))
stop_words.add('rt') # add word rt (meaning retweet) to stop words
df = pd.read_csv('full_dataset_all_labels.csv')
#df = df.sample(10000)

In [91]:
def print_some_texts(columns, df):
    text_idxs = [47, 7240, 7241, 8013, 14500, 16500, 16304, 18300,  21750, 34036, 45159, 71920]
    for i in text_idxs:
        for column in columns:
            print(df[column].iloc[i])
#print_some_texts(['text'])

def tokenize(text):
    #print(text)
    text = preprocess_text(text)
    #print(text)
    tokens = word_tokenize(text)
    filtered_tokens = []
    # Filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation). (adapted from lab example)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if token not in stop_words and len(token) > 2:
                filtered_tokens.append(token)
    return filtered_tokens
    

def preprocess_text(text):
    text = re.sub(r"http\S+", " ", text)            # remove urls
    text = re.sub("@[A-Za-z0-9]+","", text)         # remove twitter handle
    text = re.sub("&amp;","", text)                  # &amp; is a special character for ampersand
    text = re.sub('<USER>', '', text)               # remove '<USER>' as there are some such strings as user or url is masked with this string
    text = re.sub('<URL>', '', text)
    text = text.lower() 
    text = re.sub('[^a-zA-Z]', ' ', text)           # Remove punctuations
    text = text.lower()                             # Convert to lowercase
    text = re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)#remove tags
    text = re.sub("(\\d|\\W)+"," ",text)            # remove special characters and digits
    return text
    
    
def stemming(tokens):
    stemmer = SnowballStemmer("english")
    stems = [stemmer.stem(token) for token in tokens]
    return stems

def lemmatizing(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmas

In [92]:
df['preprocessed_text']=df['text'].apply(preprocess_text)

In [93]:
df['appended'] = df['preprocessed_text']+', this is '+df['label']

# BERT

## DEFAULT BERT

In [94]:
from transformers import BertTokenizer, BertModel
import torch

In [95]:
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [96]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    
    """
    
    
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]
    
    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return np.array(list_token_embeddings)

In [97]:
def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    return pd.Series([tokenized_text, tokens_tensor, segments_tensors], index = ['tokenized_text', 'tokens_tensor', 'segments_tensors'])

In [98]:
labels = df['label'].unique()
dict_labels_len = {}
for l in labels:
    x = bert_text_preparation(l, tokenizer)
    dict_labels_len[l] = len(x[0])-2

In [99]:
df[['tokenized_text', 'tokens_tensor', 'segments_tensors']] = df['appended'].progress_apply(bert_text_preparation, tokenizer = tokenizer)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77668/77668 [02:27<00:00, 526.68it/s]


In [100]:
df['leng'] = df['tokens_tensor'].apply(lambda x: x.size()[1])

df = df[df['leng']  <= 512]
df.drop(['leng'], axis = 1, inplace=True)

In [101]:
df['bert_emmbeding'] = df.progress_apply(lambda x: get_bert_embeddings(x['tokens_tensor'], x['segments_tensors'], model), axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77194/77194 [6:31:33<00:00,  3.29it/s]


In [102]:
df.progress_apply(lambda x: x['tokenized_text'][- dict_labels_len[x['label']]-1:-1], axis = 1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77194/77194 [00:09<00:00, 8254.39it/s]


0         [abuse]
1         [abuse]
2         [abuse]
3         [abuse]
4         [abuse]
           ...   
77663    [vulgar]
77664    [vulgar]
77665    [vulgar]
77666    [vulgar]
77667    [vulgar]
Length: 77194, dtype: object

In [103]:
df['bert_emmbeding'] = df.progress_apply(lambda x: np.average(x['bert_emmbeding'][- dict_labels_len[x['label']]-1:-1], axis = 0), axis = 1)
df.sample()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77194/77194 [01:51<00:00, 692.84it/s]


Unnamed: 0,text,label,preprocessed_text,appended,tokenized_text,tokens_tensor,segments_tensors,bert_emmbeding
23753,"""\n\n Stop your fucking spamming \n\nI have to...",insult,stop your fucking spamming i have told you an...,stop your fucking spamming i have told you an...,"[[CLS], stop, your, fucking, spa, ##mming, i, ...","[[tensor(101), tensor(2644), tensor(2115), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.14338752627372742, -0.4059569239616394, -0...."


## BERT WITH SENTENCE EMMBEDINGS

In [104]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('stsb-mpnet-base-v2')

In [105]:
sentences = df['preprocessed_text'].to_numpy()
sentence_embeddings = model.encode(sentences)

In [106]:
df['bert_sentence_emm'] = sentence_embeddings.tolist()

# ELMO

In [10]:
df = pd.read_pickle('first.pkl')

In [4]:
import tensorflow as tf
import tensorflow_hub as hub

In [5]:
elmo = hub.load("https://tfhub.dev/google/elmo/3").signatures['default']

In [11]:
dfs = df.drop(['tokens_tensor', 'segments_tensors', 'tokenized_text', 'text'], axis = 1)
dfs = df.sample(10000)
dfs

Unnamed: 0,text,label,preprocessed_text,appended,tokenized_text,tokens_tensor,segments_tensors,bert_emmbeding,bert_sentence_emm
1129,all trump supporters are worthless pieces of s...,abuse,all trump supporters are worthless pieces of s...,all trump supporters are worthless pieces of s...,"[[CLS], all, trump, supporters, are, worthless...","[[tensor(101), tensor(2035), tensor(8398), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.7415121793746948, 0.31940165162086487, 0.24...","[0.04157046601176262, 0.04623369872570038, 0.0..."
37731,Fuck you and Fuck your mom. And her dog.,obscene,fuck you and fuck your mom and her dog,"fuck you and fuck your mom and her dog , this ...","[[CLS], fuck, you, and, fuck, your, mom, and, ...","[[tensor(101), tensor(6616), tensor(2017), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.3871220350265503, 0.4998416602611542, 0.015...","[0.03612189739942551, 0.05468646064400673, 0.0..."
21804,this guy is a dirty jew213.152.254.36,identity_hate,this guy is a dirty jew,"this guy is a dirty jew , this is identity_hate","[[CLS], this, guy, is, a, dirty, jew, ,, this,...","[[tensor(101), tensor(2023), tensor(3124), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.9331061045328776, 0.1975609560807546, 0.583...","[0.1243976503610611, 0.19353410601615906, 0.08..."
15216,your boyfriend s a fucking faggot,homophobic,your boyfriend s a fucking faggot,"your boyfriend s a fucking faggot, this is hom...","[[CLS], your, boyfriend, s, a, fucking, fa, ##...","[[tensor(101), tensor(2115), tensor(6898), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.2926476337015629, 0.347992368042469, 0.5345...","[0.028020694851875305, 0.05802285671234131, 0...."
69906,hey nazi punk i wiol troll your ass you think ...,slur,hey nazi punk i wiol troll your ass you think ...,hey nazi punk i wiol troll your ass you think ...,"[[CLS], hey, nazi, punk, i, wi, ##ol, troll, y...","[[tensor(101), tensor(4931), tensor(6394), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.5055870562791824, 0.091542087495327, 0.6692...","[0.09904444962739944, 0.05369359999895096, 0.0..."
...,...,...,...,...,...,...,...,...,...
22026,You son of a bitch \nI was the one who recogni...,identity_hate,you son of a bitch i was the one who recognize...,you son of a bitch i was the one who recognize...,"[[CLS], you, son, of, a, bitch, i, was, the, o...","[[tensor(101), tensor(2017), tensor(2365), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.8317039310932159, 0.05167794662217299, 0.63...","[0.02894088625907898, -0.21220093965530396, -0..."
24738,"Yeah, all you have to do is stop being such a ...",insult,yeah all you have to do is stop being such a b...,yeah all you have to do is stop being such a b...,"[[CLS], yeah, all, you, have, to, do, is, stop...","[[tensor(101), tensor(3398), tensor(2035), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.19321022927761078, -0.2766666114330292, -0....","[0.03563609719276428, -0.02736884355545044, 0...."
53855,RT @spiffytwiley: @vintage_monroe_ bitch you g...,offensive,rt monroe bitch you gorgeous,"rt monroe bitch you gorgeous, this is offensive","[[CLS], rt, monroe, bitch, you, gorgeous, ,, t...","[[tensor(101), tensor(19387), tensor(9747), te...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[-0.23898236453533173, 0.9837923049926758, 0.3...","[-0.025485828518867493, -0.03901668265461922, ..."
74112,<USER> shana i'm shanna with two n's that bitch,vulgar,shana i m shanna with two n s that bitch,"shana i m shanna with two n s that bitch, thi...","[[CLS], shan, ##a, i, m, shan, ##na, with, two...","[[tensor(101), tensor(17137), tensor(2050), te...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[-0.36011314392089844, -0.5255359411239624, -0...","[0.10889077931642532, 0.15816475450992584, 0.0..."


In [9]:
lst =  df['preprocessed_text'].tolist()
lst2 = df['appended'].tolist()
embeddings_words = elmo(tf.constant(lst))["elmo"]
embeddings_sent = elmo(tf.constant(lst2))["default"]
embeddings_words

<tf.Tensor: shape=(20, 70, 1024), dtype=float32, numpy=
array([[[-1.3958732 , -0.47634852, -0.42686164, ...,  0.36568496,
          0.49042425,  0.2354556 ],
        [ 0.4454992 ,  0.2643395 , -0.31102926, ...,  0.7670144 ,
          0.26002738,  0.53752214],
        [-0.6416162 , -0.8899971 ,  0.257272  , ..., -0.10393001,
          0.38832465,  0.10515966],
        ...,
        [ 0.21973157,  0.16580486, -0.19386515, ..., -0.20244905,
          1.0564073 , -0.2821011 ],
        [ 0.22624248, -0.9736665 ,  0.12904175, ..., -0.46746704,
          0.19730377,  0.03653371],
        [-0.28542387, -0.1519949 , -0.02292022, ...,  0.10154832,
         -0.04778115,  0.05461949]],

       [[-1.0019374 ,  0.06765231,  0.02528628, ...,  0.07119475,
          0.6233085 , -0.04681645],
        [-0.6316018 , -0.16417497, -0.40858147, ...,  0.54096615,
         -0.05943623, -0.41254646],
        [ 0.24779448, -0.18716079, -0.11601172, ...,  0.23703824,
          0.12029931, -0.11658347],
        ...

In [None]:
embeddings_words.numpy().shape

In [None]:
embeddings_sent.numpy().shape

Just for last word (this is racist etc) embedding.

In [None]:
df['elmo_sentence'] = embeddings_sent.numpy().tolist()

In [None]:
df['idx'] = df.progress_apply(lambda x: len(x['appended'].split()), axis = 1)
df['shape'] = df.progress_apply(lambda x: len(x['elmo_word']), axis = 1)
df['idx'] = df.progress_apply(lambda x: min(x['idx'], x['shape']), axis = 1)

In [None]:
df.tail(2)

In [None]:
df['elmo_word'] = embeddings_words.numpy().tolist()
df['elmo_word'] = df.progress_apply(lambda x: x['elmo_word'][x['idx']-1], axis = 1)

In [None]:
df = df.drop(['elmo_words', 'idx', 'shape'], axis = 1)

In [None]:
df

In [None]:
df.to_pickle('elmobert.pkl')