In [1]:
import json
import numpy as np
import pandas as pd
import pickle
import time
import torch
import torch.nn as nn

from BertForSequenceClassificationOutputPooled import *
from BertTM import *
from sentence_transformers import SentenceTransformer
from sentence_transformers import models, losses
from sklearn.cluster import KMeans

### Load pretrained

In [7]:
# Baseline setup: the stock 'bert-base-uncased' checkpoint, asked to return
# attentions and hidden states alongside its normal outputs.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassificationOutputPooled.from_pretrained(
    'bert-base-uncased',
    output_attentions=True,
    output_hidden_states=True,
)
labels = torch.tensor([1]).unsqueeze(0)

cls_, sep_ = '[CLS]', '[SEP]'
sentences = ['Hello, my dog is cute and cutest.', 'I am too']

# Encode every demo sentence separately: the id tensor gets a leading axis via
# unsqueeze(0), and the token strings are wrapped in the special markers.
input_list, token_list = [], []
for sent in sentences:
    inputs = tokenizer.encode_plus(sent, add_special_tokens=True)
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0)
    tokens = [cls_, *tokenizer.tokenize(sent), sep_]
    input_list.append(input_ids)
    token_list.append(tokens)

### Load fine-tuned

In [2]:
# Directory holding the fine-tuned checkpoint and its vocabulary.
output_dir = "../bert-classifier-pytorch/model_save_attention_1epoch"

# Restore the fine-tuned model (attentions and hidden states enabled) and the
# tokenizer that was saved next to it.
model = BertForSequenceClassificationOutputPooled.from_pretrained(
    output_dir,
    output_attentions=True,
    output_hidden_states=True,
)
tokenizer = BertTokenizer.from_pretrained(output_dir)
labels = torch.tensor([1]).unsqueeze(0)

cls_, sep_ = '[CLS]', '[SEP]'
sentences = ['Hello, my dog is cute and cutest.', 'I am too']

# Encode every demo sentence separately: the id tensor gets a leading axis via
# unsqueeze(0), and the token strings are wrapped in the special markers.
input_list, token_list = [], []
for sent in sentences:
    inputs = tokenizer.encode_plus(sent, add_special_tokens=True)
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0)
    tokens = [cls_, *tokenizer.tokenize(sent), sep_]
    input_list.append(input_ids)
    token_list.append(tokens)

In [35]:
# Word-embedding module built from the fine-tuned checkpoint in `output_dir`.
word_embedding_model = models.BERT(output_dir, max_seq_length=240)

# Mean pooling only (CLS and max pooling are switched off) to get one
# fixed-size vector per sentence.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Chain the two modules into a single encoder.
st_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
    #device=torch.device("cuda")
)

### Test that attention and vectorization work

In [111]:
# Attention scores for the demo sentences.
attentions = get_attention(sentences, model, tokenizer, method='first')
# Total attention over the second sentence. This is not the cell's last
# expression, so the notebook never displays the value.
np.sum([pair[1] for pair in attentions[1]])

# One pooled vector per sentence; the displayed shape is (n_sentences, dim).
vectorized = vectorize(sentences, model, tokenizer)
stacked = torch.stack(vectorized)
stacked.detach().numpy().shape

(2, 768)

In [120]:
# Single example containing a URL, to inspect how its pieces are scored.
sample_texts = ["this movie was the cutest. read more at http://worstever.com"]
get_attention(sample_texts, model, tokenizer, method='first')

[[('this', 0.0613682),
  ('movie', 0.030429687),
  ('was', 0.035641603),
  ('the', 0.21911351),
  ('cutest', 0.06270852),
  ('.', 0.09440403),
  ('read', 0.0423442),
  ('more', 0.06480052),
  ('at', 0.06262489),
  ('http', 0.027938599),
  (':', 0.043154325),
  ('/', 0.038011733),
  ('/', 0.041409045),
  ('worstever', 0.04187157),
  ('.', 0.08471608),
  ('com', 0.049463503)]]

## Topic Model

In [33]:
# Read the predictions CSV; `data` is its 'text' column.
predictions_csv = "nlwx_2020_hashtags_no_rt_predictions.csv"
df = pd.read_csv(predictions_csv)
#df = preprocess(df)
data = df['text']

In [3]:
# Settings used by the topic-model cells below:
#   batch_size - texts handed to vectorize/get_attention per iteration
#   n_topics   - number of KMeans clusters
#   ngram      - range passed to generate_ngram
batch_size, n_topics = 20, 10
ngram = (1, 3)

#### Pooled

In [7]:
# Vectorise and score the corpus in slices of `batch_size` texts.
# `rows` receives one entry per batch (whatever `vectorize` returns for that
# slice), while `attentions` is flattened to one entry per text.
rows, attentions = [], []
n_texts = len(data)
for counter, i in enumerate(range(0, n_texts, batch_size)):
    # Report before working on the batch: at this point `i` is exactly the
    # number of rows already finished. The previous message printed the batch
    # counter while calling it "rows".
    if counter % 50 == 0:
        print(f"Processed {i} rows out of {n_texts}.")
    # Slicing clamps at the end of the data, so no explicit min() is needed.
    batch = data[i:i + batch_size]
    rows.append(vectorize(batch, model, tokenizer))
    attentions.extend(get_attention(batch, model, tokenizer))

Processed 0 rows out of 100.


#### Sentence embeddings

In [40]:
%%time

# Encode and score the corpus one text at a time, logging elapsed time every
# 500 rows.
rows, attentions = [], []
start_time = time.time()
for i in range(len(data)):
    text = data[i]
    rows.extend(st_model.encode([text]))
    attentions.extend(get_attention([text], model, tokenizer))
    if i % 500 != 0:
        continue
    print(f'Processed {(i)} rows in {round(time.time() - start_time, 2)} seconds.')

Processed 0 rows in 0.15 seconds.
Processed 500 rows in 98.27 seconds.
Processed 1000 rows in 193.77 seconds.
Processed 1500 rows in 290.81 seconds.
Processed 2000 rows in 386.74 seconds.
Processed 2500 rows in 484.5 seconds.
Processed 3000 rows in 584.24 seconds.
Processed 3500 rows in 679.84 seconds.
Processed 4000 rows in 776.88 seconds.
Processed 4500 rows in 873.04 seconds.
Processed 5000 rows in 969.55 seconds.
Processed 5500 rows in 1067.45 seconds.
Processed 6000 rows in 1168.92 seconds.
Processed 6500 rows in 1268.35 seconds.
Processed 7000 rows in 1366.79 seconds.
Processed 7500 rows in 1468.06 seconds.
Processed 8000 rows in 1570.23 seconds.
Processed 8500 rows in 1671.15 seconds.
Processed 9000 rows in 1772.01 seconds.
Processed 9500 rows in 1872.32 seconds.
Processed 10000 rows in 1970.01 seconds.
Processed 10500 rows in 2071.37 seconds.
Processed 11000 rows in 2172.99 seconds.
Processed 11500 rows in 2273.16 seconds.
Processed 12000 rows in 2370.85 seconds.
Processed 1250

In [20]:
# Load the base English stopword list. The encoding is pinned to UTF-8 so the
# read does not depend on the platform's locale default.
with open('stopwords-en.json', encoding='utf-8') as fopen:
    stopwords = json.load(fopen)

# Extra tokens to drop: punctuation and tokenizer artefacts, then the storm
# hashtags.
stopwords.extend(['#', '@', '…', "'", "’", "[UNK]", "\"", ";", "*", "_", "amp", "&",
                 'nlwhiteout', 'nlweather', 'newfoundland', 'nlblizzard2020', 'nlstorm2020',
                  'snowmaggedon2020', 'stormageddon2020', 'snowpocalypse2020', 'snowmageddon',
                  'nlstorm', 'nltraffic', 'nlwx', 'nlblizzard'])

print(len(stopwords))
print(stopwords[:5])

1323
["'ll", "'tis", "'twas", "'ve", '10']


In [193]:
#concat = np.concatenate(rows, axis = 0)
#concat = [item.detach().numpy() for item in concat]
#concat = np.asarray(concat, dtype=np.float32)

In [42]:
# One record per encoded row: (text, prediction, attention pairs, embedding).
all_model_data = [
    (data[i], df.prediction[i], attentions[i], rows[i])
    for i in range(len(rows))
]

#pickle.dump(all_model_data, open(f"attentions_sent_embeddings.pkl", "wb" ))

In [24]:
# Reload the cached (text, prediction, attentions, embedding) records.
# NOTE: unpickling can execute arbitrary code; only load a file you produced.
# The context manager closes the handle, which the bare
# pickle.load(open(...)) form left open.
with open("attentions_sent_embeddings.pkl", "rb") as cache_file:
    all_model_data = pickle.load(cache_file)

# Transpose the records into parallel tuples; the prediction column is unused.
texts, _, attentions, rows = zip(*all_model_data)

In [25]:
%%time

print("Fitting kmeans model.")

# Keep only the first 5000 records of each parallel sequence.
head = slice(None, 5000)
rows, texts, attentions = rows[head], texts[head], attentions[head]

# Cluster the embeddings into `n_topics` groups with a fixed seed.
kmeans = KMeans(n_clusters=n_topics, random_state=0)
kmeans = kmeans.fit(rows)
labels = kmeans.labels_

Fitting kmeans model.
CPU times: user 6.34 s, sys: 840 ms, total: 7.18 s
Wall time: 5.03 s


In [42]:
%%time

from textblob import TextBlob, Word

# Outputs of this cell:
#   overall        - every kept (lemma, score) pair across all texts
#   filtered_a     - kept pairs, one list per surviving text
#   filtered_texts - kept lemmas only, one list per surviving text
#   filtered_l     - cluster label of each surviving text
overall, filtered_a, filtered_texts, filtered_l = [], [], [], []

# Raw string: the pattern holds \/ \. \s, which are invalid escapes in a
# normal literal and trigger a warning on recent Python versions. The string
# value is unchanged.
url_re = r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
# NOTE(review): `tok not in url_re` below is a substring test against the
# pattern TEXT, not a regex match. It drops tokens such as 'http', 'www', ':',
# '/' and '.', but also any token that happens to occur in the pattern, and it
# keeps hostnames. The behaviour is left as-is because the topic tables depend
# on it; the proper fix is to strip URLs from the text before tokenisation.

# Hash-based membership instead of scanning the stopword list for every token.
stopword_set = set(stopwords)

print("Filtering attentions.")
for idx, a in enumerate(attentions):
    # Lemmatise each surviving token and keep its attention score with it.
    f = [
        (Word(pair[0]).lemmatize(), pair[1])
        for pair in a
        if pair[0] not in stopword_set and pair[0] not in url_re
    ]
    f_txt = [w[0] for w in f]
    # Texts with nothing left after filtering are dropped entirely.
    if len(f) > 0:
        overall.extend(f)
        filtered_a.append(f)
        filtered_texts.append(f_txt)
        filtered_l.append(labels[idx])

Filtering attentions.
CPU times: user 4.19 s, sys: 40 ms, total: 4.23 s
Wall time: 4.23 s


In [43]:
%%time

print("Generating ngrams.")
# N-grams over the concatenated (lemma, score) pairs of every text.
o_ngram = generate_ngram(overall, ngram)
# Join the words of each n-gram with spaces and de-duplicate the phrases.
features = list({' '.join([w[0] for w in gram]) for gram in o_ngram})


Generating ngrams.
CPU times: user 120 ms, sys: 0 ns, total: 120 ms
Wall time: 120 ms


In [44]:
%%time

print(
"""
Determining cluster components. This will take awhile. 
Progress will be printed for every 500th processed property.
""")

# components[label][ngram] -> sum of the n-gram's mean attention in that cluster
# words_label[label]       -> every accepted n-gram occurrence in that cluster
components = {}
words_label = {}
# Set copy of the vocabulary: testing against the `features` list scanned it
# once per n-gram.
feature_set = set(features)
start_time = time.time()
for idx, label in enumerate(filtered_l):
    # First sighting of a cluster only creates its containers. The text is
    # still processed below; the earlier `else:` branch silently discarded the
    # first text of every cluster.
    if label not in components:
        components[label] = {}
        words_label[label] = []
    for gram in generate_ngram(filtered_a[idx], ngram):
        word = ' '.join([r[0] for r in gram])
        if word not in feature_set:
            continue
        # An n-gram is scored by the mean attention of its member tokens.
        score = np.mean([r[1] for r in gram])
        if word in components[label]:
            components[label][word] += score
        else:
            components[label][word] = score
        words_label[label].append(word)
    if (idx + 1) % 500 == 0:
        print(f'Processed {(idx + 1)} texts in {round(time.time() - start_time, 2)} seconds.')

# len() instead of the loop variable, so an empty input does not raise NameError.
print(f"Finished determining a total of {len(filtered_l)} cluster components. Total time {round(time.time() - start_time, 2)} seconds.")


Determining cluster components. This will take awhile. 
Progress will be printed for every 500th processed property.

Processed 500 texts in 8.88 seconds.
Processed 1000 texts in 17.97 seconds.
Processed 1500 texts in 29.17 seconds.
Processed 2000 texts in 38.65 seconds.
Processed 2500 texts in 48.81 seconds.
Processed 3000 texts in 58.62 seconds.
Processed 3500 texts in 68.35 seconds.
Processed 4000 texts in 78.25 seconds.
Processed 4500 texts in 89.35 seconds.
Finished determining a total of 4989 cluster components. Total time 99.83 seconds.
CPU times: user 1min 39s, sys: 20 ms, total: 1min 39s
Wall time: 1min 39s


In [45]:
# Weight the per-cluster n-gram lists with `tf_icf`, which is not defined in
# this notebook (presumably it comes from one of the star imports).
# NOTE(review): a later cell reads the result as fully_indexed[label][ngram];
# confirm that this is the helper's return layout.
fully_indexed = tf_icf(words_label)

In [215]:
#pickle.dump(components, open("components.pickle", "wb"))

In [46]:
# Two weight tables with the same keys as `components`:
#   components_tfidf_attn - tf_icf weight multiplied by the summed attention
#   components_tfidf      - tf_icf weight alone
components_tfidf_attn = {
    label: {
        term: fully_indexed[label][term] * attn_sum
        for term, attn_sum in term_scores.items()
    }
    for label, term_scores in components.items()
}
components_tfidf = {
    label: {term: fully_indexed[label][term] for term in term_scores}
    for label, term_scores in components.items()
}

In [47]:
# Topic table built from the raw attention sums in `components`.
# NOTE(review): the leading 10 presumably is the topic count and duplicates
# n_topics; confirm against topics_df before replacing it.
topics_attn = topics_df(10, components, n_words=10)

#pickle.dump(topics_attn, open("topics_sent_embed.pickle", "wb"))

In [48]:
# Display the attention-based topic table.
topics_attn

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,view,power,snow,hope,stay,outage,snow,storm,storm,storm
1,window,closed,wind,stay,stay safe,power,storm,power,snow,starr
2,view window,snow,emergency,safe,safe,power outage,blizzard,snow,wind,snow
3,paradise,emergency,storm,thinking,storm,closed,snowday,stay,blizzard,ken starr
4,door,road,blizzard,stay safe,warning,close,winter,buried,weather,ken
5,storm,storm,snowfall,hoping,friend,business close,snowstorm,house,emergency,saveng ken starr
6,wow,stay,john,friend,warm,lost,wind,missing,stormchips,eminem
7,breaking,plow,declared,storm,stay safe warm,emergency,weather,shovel,snowstorm,eminem saveng ken
8,car,weather,gust,prayer,snow,emergency business close,day,stuck,monster,saveng ken
9,road,lost,airport,hope stay,emergency,business,view,car,day,ken starr sexeducation


In [39]:
# Topic table built from the tf_icf-only weights in `components_tfidf`.
# NOTE(review): the leading 10 presumably is the topic count and duplicates
# n_topics; confirm against topics_df before replacing it.
topics_tfidf = topics_df(10, components_tfidf, n_words=10)

topics_tfidf

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,view,power,snow,safe,safe,power,snow,power,snow,learn
1,window,snow,winds,hope,stay,outages,storm,snow,storm,eminem
2,view window,closed,john,stay,stay safe,businesses close,blizzard,storm,blizzard,sexeducation
3,sharing view,emergency,storm,stay safe,storm,power outages,snowday,house,wind,ken
4,sharing view window,roads,emergency,thinking,safe stay,emergency businesses close,snowmageddon2020,car,weather,starr
5,onthegocbc,john,airport,friends,emergency,businesses,winter,window,winds,даниила
6,sharing,update,blizzard,prayers,warm,outage,day,stay,john,егорова
7,cbcnl,road,declared,family,stay safe warm,declared emergency,snowstorm,door,day,photo learn
8,john,conditions,snowfall,hoping,snow,emergency businesses,view,hospital,snowiest,learn michelleobama
9,door,storm,wind,warm,safe warm,declared emergency businesses,window,john,emergency,michelleobama eminem


In [40]:
# Topic table built from the combined tf_icf x attention weights in
# `components_tfidf_attn`.
# NOTE(review): the leading 10 presumably is the topic count and duplicates
# n_topics; confirm against topics_df before replacing it.
topics_tfidf_attn = topics_df(10, components_tfidf_attn, n_words=10)

topics_tfidf_attn

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8,topic 9
0,view,power,snow,safe,stay,power,snow,power,snow,starr
1,window,closed,storm,hope,safe,outages,storm,snow,storm,ken starr
2,view window,snow,winds,stay,stay safe,outage,blizzard,storm,blizzard,ken
3,sharing view window,emergency,emergency,thinking,storm,power outages,snowday,stay,wind,eminem
4,sharing view,roads,john,stay safe,warm,businesses close,winter,house,weather,saveng ken starr
5,door,storm,blizzard,friends,stay safe warm,close,snowstorm,buried,winds,eminem saveng ken
6,john,weather,snowfall,hoping,emergency,closed,snowmageddon2020,car,emergency,saveng ken
7,paradise,road,wind,prayers,safe stay,power outage,day,shovel,stormchips,ken starr sexeducation
8,car,clearing,declared,storm,friends,emergency businesses close,weather,stuck,day,eminem saveng
9,road,conditions,airport,family,snow,emergency,wind,emergency,snowstorm,starr sexeducation
