In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import string
import re
from collections import Counter
from tqdm import tqdm
import time
# Register tqdm's `progress_apply` on pandas objects; later cells call it.
tqdm.pandas()


# Download the NLTK resources named below (stop-word list, punkt tokenizer data, WordNet).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

  from pandas import Panel


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
cwd = os.getcwd()

# Stop words: NLTK's English list, every ASCII punctuation character,
# and "rt" (the retweet marker).
stop_words = set(stopwords.words('english')) | set(string.punctuation)
stop_words.add('rt')

# Read the labelled dataset exactly once (the cell previously parsed the same
# CSV twice, once via the joined path and once via the relative path).
df = pd.read_csv(os.path.join(cwd, 'full_dataset_all_labels.csv'))
#df = df.sample(10000)

In [None]:
def print_some_texts(columns, df, text_idxs=None):
    """Print the given columns for a handful of rows, for eyeballing the data.

    Args:
        columns (list): Column names to print for every selected row.
        df (pandas.DataFrame): Frame to read the rows from.
        text_idxs (list, optional): Positional row indices to show. Defaults
            to the fixed spot-check positions used throughout this notebook.

    Positions that fall outside ``df`` are skipped, so the helper also works
    on a sampled or filtered (smaller) frame instead of raising IndexError.
    """
    if text_idxs is None:
        text_idxs = [47, 7240, 7241, 8013, 14500, 16500, 16304, 18300,  21750, 34036, 45159, 71920]
    n_rows = len(df)
    for i in text_idxs:
        # `.iloc` accepts positions in [-n_rows, n_rows); anything else would raise.
        if not -n_rows <= i < n_rows:
            continue
        for column in columns:
            print(df[column].iloc[i])
#print_some_texts(['text'])

def tokenize(text):
    """Clean ``text`` and split it into content tokens.

    The text is first passed through ``preprocess_text`` and then through
    NLTK's ``word_tokenize``. A token is kept only when it contains at least
    one letter, is not in ``stop_words`` and is longer than two characters
    (filter adapted from the lab example).

    Returns:
        list: The surviving tokens, in their original order.
    """
    cleaned = preprocess_text(text)
    return [
        tok
        for tok in word_tokenize(cleaned)
        if re.search('[a-zA-Z]', tok) and tok not in stop_words and len(tok) > 2
    ]
    

def preprocess_text(text):
    """Normalise a raw tweet/comment into lowercase, letters-only text.

    Steps, in order: drop URLs, ``@handles``, the ``&amp;`` entity and the
    ``<USER>`` / ``<URL>`` masking tokens; lowercase; turn every non-letter
    character into a space; collapse whitespace runs into a single space.

    Args:
        text (str): Raw text. Non-string values (e.g. the NaN pandas uses
            for an empty CSV cell) are treated as empty text.

    Returns:
        str: Text made only of ASCII letters and single spaces. Leading or
        trailing whitespace is collapsed to one space, not stripped.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+", " ", text)            # remove urls
    # '_' is a legal handle character; without it "@user_name" left "name" behind.
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)      # remove twitter handle
    text = re.sub("&amp;", "", text)                # &amp; is the HTML entity for ampersand
    text = re.sub('<USER>', '', text)               # masked user mentions
    text = re.sub('<URL>', '', text)                # masked urls
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)           # punctuation, digits, symbols -> space
    # Only letters and spaces remain here, so the former tag-stripping regex
    # (it looked for '&' and ';') could never match and was dropped; the former
    # digit/non-word collapse reduces to collapsing whitespace.
    text = re.sub(r"\s+", " ", text)
    return text
    
    
def stemming(tokens):
    """Return the English Snowball stem of every token, in order."""
    stem = SnowballStemmer("english").stem
    return list(map(stem, tokens))

def lemmatizing(tokens):
    """Return the WordNet lemma of every token, in order."""
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))

In [None]:
# Cleaned version of every raw text, produced by preprocess_text.
df['preprocessed_text'] = df['text'].map(preprocess_text)

In [None]:
# Append ", this is <label>" to each cleaned text. The BERT cells below embed this
# string and then read back the embedding of the trailing label tokens.
df['appended'] = df['preprocessed_text']+', this is '+df['label']

# BERT

## DEFAULT BERT

In [None]:
from transformers import BertTokenizer, BertModel
import torch

In [None]:
# Pretrained uncased BERT base model. output_hidden_states=True is requested because
# get_bert_embeddings reads the hidden states from the model output (index 2).
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)
# Matching tokenizer for the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Return the final-layer embedding of every token of one text.

    Args:
        tokens_tensor (torch.Tensor): Token ids for a batch of one text,
            shape [1, n_tokens] (as built by ``bert_text_preparation``).
        segments_tensors (torch.Tensor): Tensor of the same shape, forwarded
            as the model's second positional argument.
        model (obj): Callable embedding model whose output at index 2 is the
            sequence of hidden states (input embeddings first, then one entry
            per layer).

    Returns:
        numpy.ndarray: float64 array of shape
            [n_tokens, n_embedding_dimensions] with one row per token.
    """
    # NOTE(review): Hugging Face's BertModel.forward takes
    # (input_ids, attention_mask, token_type_ids, ...) positionally, so the
    # all-ones segments tensor appears to act as the attention mask here while
    # the segment ids keep their default — confirm before changing the call.

    # Gradient calculation is disabled: we only run inference.
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)

    # Drop the first hidden state (the input embeddings), then take the
    # output of the final layer.
    hidden_states = outputs[2][1:]
    token_embeddings = hidden_states[-1]
    # Remove the batch dimension: [1, n_tokens, dim] -> [n_tokens, dim].
    token_embeddings = torch.squeeze(token_embeddings, dim=0)

    # Convert in one step instead of going through per-token Python lists.
    # float64 matches what np.array() produced from the former list of floats.
    return token_embeddings.detach().cpu().numpy().astype(np.float64)

In [None]:
def bert_text_preparation(text, tokenizer):
    """Turn one text into the inputs expected by BERT.

    The text is wrapped in the ``[CLS]`` / ``[SEP]`` special tokens,
    tokenized, and mapped to token ids. Every token receives segment id 1.

    Args:
        text (str): Text to be converted.
        tokenizer (obj): Tokenizer exposing ``tokenize`` and
            ``convert_tokens_to_ids``.

    Returns:
        pandas.Series: Indexed by ``'tokenized_text'`` (list of tokens),
            ``'tokens_tensor'`` (tensor of token ids, batch of one) and
            ``'segments_tensors'`` (tensor of segment ids, same shape).
    """
    field_names = ['tokenized_text', 'tokens_tensor', 'segments_tensors']

    wordpieces = tokenizer.tokenize(" ".join(("[CLS]", text, "[SEP]")))
    token_ids = tokenizer.convert_tokens_to_ids(wordpieces)

    # Wrap both id lists in an outer list so the tensors carry a batch dimension.
    ids_tensor = torch.tensor([token_ids])
    segment_tensor = torch.tensor([[1] * len(token_ids)])

    return pd.Series([wordpieces, ids_tensor, segment_tensor], index = field_names)

In [None]:
# For every label, the number of tokens the tokenizer splits it into,
# not counting the [CLS] and [SEP] markers (hence the "- 2").
labels = df['label'].unique()
dict_labels_len = {
    # Access the field by name: integer keys on a string-indexed Series rely on
    # pandas' deprecated positional fallback.
    label: len(bert_text_preparation(label, tokenizer)['tokenized_text']) - 2
    for label in labels
}

In [None]:
# Tokenize every appended text once; each row yields the three BERT input fields.
bert_inputs = df['appended'].progress_apply(bert_text_preparation, tokenizer=tokenizer)
df[['tokenized_text', 'tokens_tensor', 'segments_tensors']] = bert_inputs

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77668/77668 [02:27<00:00, 526.68it/s]


In [None]:
# Keep only texts of at most 512 tokens (including [CLS]/[SEP]); presumably
# this is the model's maximum input length — confirm against the checkpoint config.
token_counts = df['tokens_tensor'].apply(lambda t: t.size()[1])

# Filter with a mask instead of a temporary column, and take an explicit copy
# so later column assignments do not operate on a slice of the original frame.
df = df[token_counts <= 512].copy()

In [None]:
# Run the BERT model row by row and store each text's per-token embedding array.
# The recorded run of this cell took about 6.5 hours for 77,194 rows.
df['bert_emmbeding'] = df.progress_apply(lambda x: get_bert_embeddings(x['tokens_tensor'], x['segments_tensors'], model), axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77194/77194 [6:31:33<00:00,  3.29it/s]


In [None]:
# Sanity check: slice out the last dict_labels_len[label] tokens before the final
# token ([SEP]); this should show exactly the label's tokens for every row.
df.progress_apply(lambda x: x['tokenized_text'][- dict_labels_len[x['label']]-1:-1], axis = 1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77194/77194 [00:09<00:00, 8254.39it/s]


0         [abuse]
1         [abuse]
2         [abuse]
3         [abuse]
4         [abuse]
           ...   
77663    [vulgar]
77664    [vulgar]
77665    [vulgar]
77666    [vulgar]
77667    [vulgar]
Length: 77194, dtype: object

In [None]:
# Replace each row's per-token embedding array with the average of the rows that
# belong to the label tokens (same slice as in the sanity check: the last
# dict_labels_len[label] tokens before the final one).
# NOTE(review): the column is overwritten in place, so running this cell a second
# time would slice the already averaged vector — run it once per embedding run.
df['bert_emmbeding'] = df.progress_apply(lambda x: np.average(x['bert_emmbeding'][- dict_labels_len[x['label']]-1:-1], axis = 0), axis = 1)
df.sample()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 77194/77194 [01:51<00:00, 692.84it/s]


Unnamed: 0,text,label,preprocessed_text,appended,tokenized_text,tokens_tensor,segments_tensors,bert_emmbeding
23753,"""\n\n Stop your fucking spamming \n\nI have to...",insult,stop your fucking spamming i have told you an...,stop your fucking spamming i have told you an...,"[[CLS], stop, your, fucking, spa, ##mming, i, ...","[[tensor(101), tensor(2644), tensor(2115), ten...","[[tensor(1), tensor(1), tensor(1), tensor(1), ...","[0.14338752627372742, -0.4059569239616394, -0...."


## BERT WITH SENTENCE EMBEDDINGS

In [None]:
from sentence_transformers import SentenceTransformer
# NOTE(review): this rebinds the global `model`, which the earlier BERT embedding
# cell passes to get_bert_embeddings; re-running that cell after this one would
# hand it the SentenceTransformer instead of the BertModel.
model = SentenceTransformer('stsb-mpnet-base-v2')

In [None]:
# Encode the cleaned texts (without the appended label) with the sentence model.
sentences = df['preprocessed_text'].to_numpy()
sentence_embeddings = model.encode(sentences)

In [None]:
# Store the sentence embeddings as plain Python lists, one entry per row.
df['bert_sentence_emm'] = sentence_embeddings.tolist()

# ELMO

In [2]:
import tensorflow as tf
import tensorflow_hub as hub

In [3]:
# Load ELMo (version 3) from TF Hub and keep its 'default' signature, which the
# loop below calls with a batch of strings.
elmo = hub.load("https://tfhub.dev/google/elmo/3").signatures['default']

In [4]:
# Reload `df` from a CSV on Google Drive, replacing the frame built in the cells above.
# NOTE(review): presumably data.csv is an export of the BERT-stage frame — confirm.
# The drive.mount cell appears later in the notebook, so on a fresh kernel Drive
# must be mounted before this line runs.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data.csv')

In [5]:
# Drop the raw text and the BERT input columns; the ELMo loop only reads 'preprocessed_text'.
df = df.drop(['tokens_tensor', 'segments_tensors', 'tokenized_text', 'text'], axis = 1)

In [6]:
# Split the frame into 1000 consecutive chunks of near-equal size.
# The row positions are split rather than the DataFrame itself, because
# np.array_split on a DataFrame goes through DataFrame.swapaxes, which recent
# pandas versions deprecate.
dfs = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), 1000)]

In [19]:
from google.colab import drive
# Mount Google Drive at /content/drive; the cells that read from or write to
# /content/drive/MyDrive need this mount.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df.sample(10)

In [13]:
from numba import cuda

# Select GPU 0 and close the CUDA context numba holds for it.
# NOTE(review): presumably used to release GPU memory between runs — confirm;
# other libraries using the GPU in this kernel may need a restart afterwards.
cuda.select_device(0)
cuda.close()

In [9]:
!nvidia-smi

Wed May 12 14:35:40 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    76W / 149W |  10902MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
# Embed every chunk with ELMo in ten sub-batches and pickle each sub-batch to Drive.
# Only sentence-level vectors (the "default" output) are stored; the word-level
# variant that used to be sketched here in comments is disabled.
for i, small in enumerate(dfs):
    # Hand-maintained resume ranges — presumably chunks already processed in an
    # earlier session; confirm before re-running on a fresh Drive.
    if i < 268 or (i > 500 and i < 788) or (i > 800 and i < 907):
        continue

    # Time the whole chunk. The timer used to be restarted for every sub-batch,
    # so the message below reported only the last sub-batch.
    start = time.time()

    # Split row positions instead of the DataFrame (np.array_split on a
    # DataFrame relies on the deprecated DataFrame.swapaxes).
    for j, rows in enumerate(np.array_split(np.arange(len(small)), 10)):
        if len(rows) == 0:
            # Fewer than ten rows in the chunk: nothing to embed for this part.
            continue
        # Explicit copy so the new column is not written onto a slice of `df`.
        sm = small.iloc[rows].copy()
        lst = sm['preprocessed_text'].tolist()
        embeddings_sent = elmo(tf.constant(lst))["default"]
        sm['elmo_sentence'] = embeddings_sent.numpy().tolist()
        sm.to_pickle("/content/drive/MyDrive/small"+str(i)+'_'+str(j)+'.pkl')

    print(f"Čas za to iteracijo, {time.time()-start}, smo na: {(i+1)/len(dfs)*100}%")

Čas za to iteracijo, 0.9205386638641357, smo na: 26.900000000000002%
Čas za to iteracijo, 1.1117866039276123, smo na: 27.0%
Čas za to iteracijo, 0.9936702251434326, smo na: 27.1%
Čas za to iteracijo, 0.8657057285308838, smo na: 27.200000000000003%
Čas za to iteracijo, 1.2392606735229492, smo na: 27.3%
Čas za to iteracijo, 1.3586018085479736, smo na: 27.400000000000002%
Čas za to iteracijo, 0.8889343738555908, smo na: 27.500000000000004%
Čas za to iteracijo, 3.846491575241089, smo na: 27.6%
Čas za to iteracijo, 0.8743689060211182, smo na: 27.700000000000003%
Čas za to iteracijo, 2.3347771167755127, smo na: 27.800000000000004%
Čas za to iteracijo, 0.3335139751434326, smo na: 27.900000000000002%
Čas za to iteracijo, 2.2854647636413574, smo na: 28.000000000000004%
Čas za to iteracijo, 0.35138583183288574, smo na: 28.1%
Čas za to iteracijo, 1.7053346633911133, smo na: 28.199999999999996%
Čas za to iteracijo, 0.6368484497070312, smo na: 28.299999999999997%
Čas za to iteracijo, 0.590544223785

In [10]:
import os
import glob

In [11]:
# Empty frame that will hold the merged per-chunk results.
big_df = pd.DataFrame()

In [12]:
# Merge every pickled sub-batch on Drive into one frame.
# NOTE(review): the pattern matches any .pkl directly under MyDrive, not only the
# files written by the ELMo loop — confirm nothing else is stored there.
# pd.read_pickle executes pickle data, so only load files this notebook produced.
pkl_paths = glob.glob('/content/drive/MyDrive/*.pkl')  # list the files once, not per progress print

frames = []
for i, f in enumerate(pkl_paths, start=1):
    frames.append(pd.read_pickle(f))
    if i % 100 == 0:
        # Progress in percent of files read.
        print(100*i/len(pkl_paths))

# Concatenate once: DataFrame.append was removed in pandas 2.0 and copied the
# whole accumulated frame on every call.
big_df = pd.concat(frames) if frames else pd.DataFrame()
big_df

2.4691358024691357
4.938271604938271
7.407407407407407
9.876543209876543
12.345679012345679
14.814814814814815
17.28395061728395
19.753086419753085
22.22222222222222
24.691358024691358
27.160493827160494
29.62962962962963
32.098765432098766
34.5679012345679
37.03703703703704
39.50617283950617
41.97530864197531
44.44444444444444
46.91358024691358
49.382716049382715
51.851851851851855
54.32098765432099
56.79012345679013
59.25925925925926
61.72839506172839
64.19753086419753
66.66666666666667
69.1358024691358
71.60493827160494
74.07407407407408
76.54320987654322
79.01234567901234
81.48148148148148
83.95061728395062
86.41975308641975
88.88888888888889
91.35802469135803
93.82716049382717
96.29629629629629
98.76543209876543


Unnamed: 0.1,Unnamed: 0,label,preprocessed_text,appended,bert_emmbeding,bert_sentence_emm,elmo_sentence
61340,61773,severe_toxic,suck my cock motherfuckers why don t you fuck...,suck my cock motherfuckers why don t you fuck...,[ 9.30619538e-01 3.85009815e-01 2.40492993e-...,"[0.2155110239982605, 0.20219053328037262, -0.0...","[0.2443406879901886, -0.5525169372558594, 0.01..."
61341,61774,severe_toxic,fap so i herd u were still a fucking wanker is...,fap so i herd u were still a fucking wanker is...,[ 7.34545668e-01 1.64129293e-01 2.00675428e-...,"[0.07144057750701904, 0.10810859501361847, -0....","[0.03207191079854965, -0.38199320435523987, 0...."
61342,61775,severe_toxic,josef mengele never worked on a scientific ba...,josef mengele never worked on a scientific ba...,[ 8.32445830e-01 5.54360906e-01 -1.58358643e-...,"[0.007855305448174477, 0.2709958851337433, 0.0...","[-0.021751997992396355, -0.32511425018310547, ..."
61343,61776,severe_toxic,my cock you failed again to block me wikipedia...,my cock you failed again to block me wikipedia...,[ 5.33574671e-01 1.86936411e-01 6.40570079e-...,"[0.18784163892269135, 0.05127450078725815, -0....","[-0.007243484258651733, -0.4541970193386078, 0..."
61344,61777,severe_toxic,oh fuck you you partisan asshole you never ste...,oh fuck you you partisan asshole you never ste...,[ 8.83090933e-01 4.76757526e-01 4.16040421e-...,"[0.05299034342169762, -0.22730588912963867, 0....","[0.07033226639032364, -0.326628714799881, 0.04..."
...,...,...,...,...,...,...,...
3973,3976,abuse,how can you expect anyone to take you seriousl...,how can you expect anyone to take you seriousl...,[ 9.41075504e-01 3.37572306e-01 -8.61335248e-...,"[0.0325782410800457, 0.044886041432619095, -0....","[0.036360640078783035, -0.2946488559246063, -0..."
3974,3977,abuse,testing the mothefucker new and improved capab...,testing the mothefucker new and improved capab...,[ 4.67681468e-01 -3.06223109e-02 1.85525734e-...,"[-0.10289394110441208, 0.1475052386522293, 0.0...","[0.2204449623823166, -0.04802954941987991, -0...."
3975,3978,abuse,op saw the word manbabies in the url and said ...,op saw the word manbabies in the url and said ...,[ 2.20974147e-01 3.83333176e-01 2.78720379e-...,"[-0.01329519972205162, 0.027182534337043762, 0...","[-0.1446760594844818, -0.2553308308124542, 0.1..."
3976,3979,abuse,yea fuck you gay people and brown people i was...,yea fuck you gay people and brown people i was...,[ 8.51975501e-01 7.69343555e-01 2.46158354e-...,"[-0.006593125406652689, 0.19076307117938995, 0...","[0.0014787140535190701, -0.3578803539276123, 0..."


In [13]:
# Save the merged frame in the current working directory (this is the file offered for download at the end).
big_df.to_pickle("final_df.pkl")

In [20]:
big_df.sample(2)

Unnamed: 0.1,Unnamed: 0,label,preprocessed_text,appended,bert_emmbeding,bert_sentence_emm,elmo_sentence
18919,18935,hostile,it is awful that i do not feel safe as an immi...,it is awful that i do not feel safe as an immi...,[-2.20606923e-01 -2.42669612e-01 -5.48121512e-...,"[-0.09696795791387558, 0.17563417553901672, 0....","[-0.26823073625564575, -0.23854795098304749, -..."
46127,46484,offensive,i lost all my respect for my brother cause you...,i lost all my respect for my brother cause you...,[-1.42929375e-01 4.66040432e-01 -6.47009015e-...,"[-0.008973411284387112, 0.04470272734761238, -...","[-0.1051262766122818, -0.21873603761196136, -0..."


In [21]:
# Second copy of the merged frame, stored on Google Drive.
big_df.to_pickle("/content/drive/MyDrive/Android/final_df.pkl")

In [None]:
# NOTE(review): `embeddings_words` is not assigned by any active code visible in
# this notebook, so this cell raises NameError on a fresh kernel.
embeddings_words.numpy().shape

In [22]:
pd.read_pickle("/content/drive/MyDrive/Android/final_df.pkl")

Unnamed: 0.1,Unnamed: 0,label,preprocessed_text,appended,bert_emmbeding,bert_sentence_emm,elmo_sentence
61340,61773,severe_toxic,suck my cock motherfuckers why don t you fuck...,suck my cock motherfuckers why don t you fuck...,[ 9.30619538e-01 3.85009815e-01 2.40492993e-...,"[0.2155110239982605, 0.20219053328037262, -0.0...","[0.2443406879901886, -0.5525169372558594, 0.01..."
61341,61774,severe_toxic,fap so i herd u were still a fucking wanker is...,fap so i herd u were still a fucking wanker is...,[ 7.34545668e-01 1.64129293e-01 2.00675428e-...,"[0.07144057750701904, 0.10810859501361847, -0....","[0.03207191079854965, -0.38199320435523987, 0...."
61342,61775,severe_toxic,josef mengele never worked on a scientific ba...,josef mengele never worked on a scientific ba...,[ 8.32445830e-01 5.54360906e-01 -1.58358643e-...,"[0.007855305448174477, 0.2709958851337433, 0.0...","[-0.021751997992396355, -0.32511425018310547, ..."
61343,61776,severe_toxic,my cock you failed again to block me wikipedia...,my cock you failed again to block me wikipedia...,[ 5.33574671e-01 1.86936411e-01 6.40570079e-...,"[0.18784163892269135, 0.05127450078725815, -0....","[-0.007243484258651733, -0.4541970193386078, 0..."
61344,61777,severe_toxic,oh fuck you you partisan asshole you never ste...,oh fuck you you partisan asshole you never ste...,[ 8.83090933e-01 4.76757526e-01 4.16040421e-...,"[0.05299034342169762, -0.22730588912963867, 0....","[0.07033226639032364, -0.326628714799881, 0.04..."
...,...,...,...,...,...,...,...
3973,3976,abuse,how can you expect anyone to take you seriousl...,how can you expect anyone to take you seriousl...,[ 9.41075504e-01 3.37572306e-01 -8.61335248e-...,"[0.0325782410800457, 0.044886041432619095, -0....","[0.036360640078783035, -0.2946488559246063, -0..."
3974,3977,abuse,testing the mothefucker new and improved capab...,testing the mothefucker new and improved capab...,[ 4.67681468e-01 -3.06223109e-02 1.85525734e-...,"[-0.10289394110441208, 0.1475052386522293, 0.0...","[0.2204449623823166, -0.04802954941987991, -0...."
3975,3978,abuse,op saw the word manbabies in the url and said ...,op saw the word manbabies in the url and said ...,[ 2.20974147e-01 3.83333176e-01 2.78720379e-...,"[-0.01329519972205162, 0.027182534337043762, 0...","[-0.1446760594844818, -0.2553308308124542, 0.1..."
3976,3979,abuse,yea fuck you gay people and brown people i was...,yea fuck you gay people and brown people i was...,[ 8.51975501e-01 7.69343555e-01 2.46158354e-...,"[-0.006593125406652689, 0.19076307117938995, 0...","[0.0014787140535190701, -0.3578803539276123, 0..."


In [None]:
# Shape of the most recent ELMo sentence-embedding batch (whatever the loop computed last).
embeddings_sent.numpy().shape

Keep only the embedding of the last word of the appended text, i.e. the label in "…, this is racist" and similar.

In [None]:
# NOTE(review): `embeddings_words` is not assigned by any active code visible in
# this notebook; it must hold ELMo's word-level output for all rows of `df`.
# Per-token ELMo vectors for every row.
df['elmo_word'] = embeddings_words.numpy().tolist()

# Keep only the vector of the last word of the appended text. The position is
# computed inline (word count, clamped to the number of available vectors)
# instead of reading an 'idx' column that is only created in a later cell.
df['elmo_word'] = df.progress_apply(
    lambda x: x['elmo_word'][min(len(x['appended'].split()), len(x['elmo_word'])) - 1],
    axis = 1,
)

In [None]:
# NOTE(review): the ELMo loop assigns `embeddings_sent` once per sub-batch, so after
# the loop it holds only the last sub-batch; its length will not match `df` unless
# it was recomputed for the whole frame — confirm before running.
df['elmo_sentence'] = embeddings_sent.numpy().tolist()

In [None]:
# Number of whitespace-separated words in the appended text (1-based position of its last word).
df['idx'] = df.progress_apply(lambda x: len(x['appended'].split()), axis = 1)
# Length of the row's 'elmo_word' entry.
# NOTE(review): this is only the token count while 'elmo_word' still holds the
# full per-token list, i.e. before it is reduced to a single vector.
df['shape'] = df.progress_apply(lambda x: len(x['elmo_word']), axis = 1)
# Clamp the position so it never exceeds the number of available entries.
df['idx'] = df.progress_apply(lambda x: min(x['idx'], x['shape']), axis = 1)

In [None]:
df.tail(2)

In [None]:
# Remove the helper columns. No visible cell creates an 'elmo_words' column (the
# embedding column is named 'elmo_word'), so a strict drop raised KeyError;
# errors='ignore' drops whichever of these columns actually exist.
df = df.drop(columns=['elmo_words', 'idx', 'shape'], errors='ignore')

In [None]:
df

In [None]:
# Persist the current `df` to the working directory.
df.to_pickle('elmobert.pkl')

In [23]:
from google.colab import files

In [24]:
# Offer the locally saved merged frame as a browser download (Colab helper).
files.download('final_df.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>