In [289]:
import pickle

from BertTM import *

## BERT models

### Load pretrained

In [7]:
# Name of the stock (not fine-tuned) checkpoint shared by tokenizer and model.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# Attentions and hidden states are both requested from the model.
model = BertForSequenceClassificationOutputPooled.from_pretrained(
    PRETRAINED_CHECKPOINT,
    output_attentions=True,
    output_hidden_states=True,
)

### Load fine-tuned

In [2]:
# Directory holding the fine-tuned model and its vocabulary.
output_dir = "../bert-classifier-pytorch/model_save_attention_1epoch"

# Restore the tokenizer and the classifier from that directory. Note that this
# rebinds `tokenizer` and `model`, replacing whatever a previous cell loaded.
tokenizer = BertTokenizer.from_pretrained(output_dir)
model = BertForSequenceClassificationOutputPooled.from_pretrained(
    output_dir,
    output_attentions=True,
    output_hidden_states=True,
)

### Test model

In [None]:
# Build id tensors and token lists for a couple of toy sentences so the model
# can be exercised by hand.
cls_ = '[CLS]'
sep_ = '[SEP]'
sentences = ['Hello, my dog is cute and cutest.', 'I am too']

input_list = []  # one id tensor per sentence, with a leading dimension added
token_list = []  # one token list per sentence, wrapped in [CLS] ... [SEP]
for sent in sentences:
    # Iterate over the sentences directly; the index was only ever used to
    # look the same sentence up again.
    inputs = tokenizer.encode_plus(sent, add_special_tokens=True)
    tokens = [cls_] + tokenizer.tokenize(sent) + [sep_]
    input_ids = torch.tensor(inputs['input_ids']).unsqueeze(0)
    input_list.append(input_ids)
    token_list.append(tokens)

### Load sentence embedder

In [35]:
# Word-level embedding module loaded from the fine-tuned checkpoint directory.
word_embedding_model = models.BERT(output_dir, max_seq_length=240)

# Mean pooling only (CLS and max pooling disabled) to get one fixed-size
# sentence vector.
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    embedding_dim,
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Chain embedding and pooling into one sentence encoder.
# (device=torch.device("cuda") is intentionally left disabled here.)
st_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

### Test that attention and vectorization work

In [111]:
# Per-token attention for each toy sentence.
attentions = get_attention(sentences, model, tokenizer, method='first')
# Sum of the attention weights of the second sentence.
# NOTE: this is not the last expression of the cell, so its value is computed
# but never rendered in the notebook output.
np.sum([pair[1] for pair in attentions[1]])

# One vector per sentence; the cell displays the shape of the stacked array.
vectorized = vectorize(sentences, model, tokenizer)
stacked = torch.stack(vectorized)
stacked.detach().numpy().shape

(2, 768)

In [120]:
# Inspect how a sentence containing punctuation and a URL is split into tokens
# and how attention is spread over them.
get_attention(
    ["this movie was the cutest. read more at http://worstever.com"],
    model,
    tokenizer,
    method='first',
)

[[('this', 0.0613682),
  ('movie', 0.030429687),
  ('was', 0.035641603),
  ('the', 0.21911351),
  ('cutest', 0.06270852),
  ('.', 0.09440403),
  ('read', 0.0423442),
  ('more', 0.06480052),
  ('at', 0.06262489),
  ('http', 0.027938599),
  (':', 0.043154325),
  ('/', 0.038011733),
  ('/', 0.041409045),
  ('worstever', 0.04187157),
  ('.', 0.08471608),
  ('com', 0.049463503)]]

## Topic Model from BERT

### Load data and set params

In [33]:
# Input table; the `text` column is what gets embedded, and the `prediction`
# column is bundled with the embeddings in a later cell.
df = pd.read_csv("nlwx_2020_hashtags_no_rt_predictions.csv")
data = df['text']

# Topic-model parameters.
n_topics = 9    # number of clusters / topics requested downstream
ngram = (1, 3)  # n-gram range handed to determine_cluster_components

### Create embeddings

In [40]:
%%time

# Produce one entry in `rows` and one in `attentions` per text in `data`
# (they are indexed in parallel with `data` when the results are bundled).
# This is the slow step: the logged progress below shows roughly 100 seconds
# per 500 rows.
rows, attentions = get_embeddings(data, model, tokenizer)

Processed 0 rows in 0.15 seconds.
Processed 500 rows in 98.27 seconds.
Processed 1000 rows in 193.77 seconds.
Processed 1500 rows in 290.81 seconds.
Processed 2000 rows in 386.74 seconds.
Processed 2500 rows in 484.5 seconds.
Processed 3000 rows in 584.24 seconds.
Processed 3500 rows in 679.84 seconds.
Processed 4000 rows in 776.88 seconds.
Processed 4500 rows in 873.04 seconds.
Processed 5000 rows in 969.55 seconds.
Processed 5500 rows in 1067.45 seconds.
Processed 6000 rows in 1168.92 seconds.
Processed 6500 rows in 1268.35 seconds.
Processed 7000 rows in 1366.79 seconds.
Processed 7500 rows in 1468.06 seconds.
Processed 8000 rows in 1570.23 seconds.
Processed 8500 rows in 1671.15 seconds.
Processed 9000 rows in 1772.01 seconds.
Processed 9500 rows in 1872.32 seconds.
Processed 10000 rows in 1970.01 seconds.
Processed 10500 rows in 2071.37 seconds.
Processed 11000 rows in 2172.99 seconds.
Processed 11500 rows in 2273.16 seconds.
Processed 12000 rows in 2370.85 seconds.
Processed 1250

Save the data after creating the embeddings.

In [None]:
# Bundle, per text, the tuple (text, prediction, attention, embedding row).
# The inputs are walked positionally with zip() rather than via `data[i]` /
# `df.prediction[i]`, which are label-based Series lookups and only line up
# with `rows` when the DataFrame still has its default 0..n-1 index.
# zip() stops at the shortest input, so at most len(rows) records are built.
all_model_data = list(zip(data, df.prediction, attentions, rows))

# Uncomment to cache the bundle for the "Load pickled embedding data" cell:
# with open("attentions_sent_embeddings.pkl", "wb") as pickle_file:
#     pickle.dump(all_model_data, pickle_file)

Define stopwords.

In [281]:
# Punctuation, markup debris and tokenizer artefacts to drop.
noise_tokens = ['#', '@', '…', "'", "’", "[UNK]", "\"", ";", "*", "_", "amp", "&", "“", "”"]

# Dataset-specific tags that would otherwise show up in the topics.
dataset_tags = [
    'nlwhiteout', 'nlweather', 'newfoundland', 'nlblizzard2020', 'nlstorm2020',
    'snowmaggedon2020', 'stormageddon2020', 'snowpocalypse2020', 'snowmageddon',
    'nlstorm', 'nltraffic', 'nlwx', 'nlblizzard',
]

# Same order as before: noise tokens first, then the tags.
extended_stopwords = noise_tokens + dataset_tags
stopwords = get_stopwords(extended_stopwords)
print(len(stopwords))

1325


Load pickled embedding data.

In [255]:
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load a cache file that this notebook (or a trusted source) produced.
# The context manager guarantees the file handle is closed after loading.
with open("attentions_sent_embeddings.pkl", "rb") as pickle_file:
    all_model_data = pickle.load(pickle_file)

# Each record is (text, prediction, attention, embedding row); transpose the
# records into four parallel tuples. The predictions are not needed below, but
# they get a real name instead of `_` so IPython's last-output variable `_`
# is not shadowed.
texts, _predictions, attentions, rows = zip(*all_model_data)

### Kmeans model

Train kmeans model.

In [283]:
%%time

# Cluster the embedding rows into `n_topics` groups; `labels` is later passed
# to filter_data together with the attentions.
# NOTE(review): no random seed is passed here — confirm whether get_clusters
# fixes one; otherwise cluster ids may differ between runs.
labels = get_clusters(rows, n_topics)

Fitting kmeans model.
The number of texts per label are:
{0: 2632, 1: 4438, 2: 1034, 3: 826, 4: 670, 5: 798, 6: 2858, 7: 4467, 8: 4074}
CPU times: user 37.4 s, sys: 2 s, total: 39.4 s
Wall time: 33.8 s


#### Cluster components

Drop stopwords and empty documents.

In [270]:
%%time
  
# Filter the attentions with the stopword list, carrying the cluster labels
# along. `filtered_t` feeds the phraser cell; `filtered_l` and `filtered_a`
# feed determine_cluster_components.
filtered_a, filtered_t, filtered_l = filter_data(attentions, stopwords, labels)

Filtering attentions.
CPU times: user 11.8 s, sys: 0 ns, total: 11.8 s
Wall time: 11.8 s


Use Gensim's phraser to determine which n-gram each word should be included in.

In [285]:
%%time

# Run the phraser over the filtered texts.
# NOTE(review): `features` is not referenced by any later cell visible in this
# notebook — confirm whether it is still needed.
features = get_phrases(filtered_t, min_count=10, threshold=0.5)

CPU times: user 4.66 s, sys: 0 ns, total: 4.66 s
Wall time: 4.66 s


Use BERT's attention mechanism to determine which words characterize each kmeans cluster.

In [273]:
%%time
    
# Derive per-cluster components from the filtered labels and attentions.
# `ngram` is the (1, 3) range set in the parameters cell. `components` feeds
# the topic tables and `words_label` feeds tf_icf.
components, words_label = determine_cluster_components(filtered_l, filtered_a, ngram)


Determining cluster components. This will take awhile. 
Progress will be printed for every 500th processed property.
    
Processed 5000 texts in 1.65 seconds.
Processed 10000 texts in 3.31 seconds.
Processed 15000 texts in 5.05 seconds.
Processed 20000 texts in 6.89 seconds.
Finished determining a total of 21755 cluster components. Total time 7.46 seconds.
CPU times: user 7.45 s, sys: 8 ms, total: 7.46 s
Wall time: 7.46 s


Use term frequency–inverse cluster frequency (TF-ICF), with and without BERT's attention mechanism, to determine which words characterize each kmeans cluster.

In [284]:
%%time

# Score the per-label words across the `n_topics` clusters ...
tfidf_indexed = tf_icf(words_label, n_topics)
# ... then derive two alternative component sets from those scores: one
# without and one with the attention weights (per the variable names).
components_tfidf, components_tfidf_attn = get_tfidf_components(components, tfidf_indexed)

CPU times: user 352 ms, sys: 0 ns, total: 352 ms
Wall time: 350 ms


#### Topics

In [None]:
# Topic table (10 words per topic) built from the attention-based components.
topics_attn = topics_df(topics=n_topics, components=components, n_words=10)

#pickle.dump(topics_attn, open("topics_sent_embed.pickle", "wb"))

In [277]:
# Display the attention-based topic table.
topics_attn

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8
0,snow,snowmageddon2020,power,assistance,hope,stay safe,snow,snow,snow
1,storm,snow,closed,helping,thinking,safe,storm,storm,storm
2,emergency,snowstorm,power outage,people,hoping,warning,people,blizzard,shovel
3,blizzard,blizzard,road,support,stay safe,storm,power,snowstorm,love
4,wind,nlsnowstorm2020,snow,emergency,prayer,stay,car,weather,people
5,snowfall,canada,storm,supply,safe,emergency,digging,winter,day
6,weather,love,emergency,community,storm,envcanada advisory blowingsnow,buried,blizzard2020,hope
7,update,storm,lost power,food,friend,stay safe warm,missing,snowmageddon2020,dog
8,john,wow,outage,snow,god,alert,stuck,newfoundlandstorm,neighbour
9,forecast,crazy,damage,service,people,snow,emergency,wind,newfoundlanders


In [278]:
# Topic table (10 words per topic) built from the TF-ICF components.
topics_tfidf = topics_df(topics=n_topics, components=components_tfidf, n_words=10)

# Last expression of the cell, so the table is rendered.
topics_tfidf

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8
0,snow,snowmageddon2020,power,assistance,prayer,stay safe,snow,snow,snow
1,storm,snow,road,people,hope,safe,car,storm,day
2,wind,cbcnl,john,emergency,stay safe,storm,people,blizzard,time
3,john,canada,power outage,helping,thinking,blizzard warning,john,snowmageddon2020,people
4,emergency,snowstorm,snow,support,hope safe,envcanada advisory blowingsnow,road,snowstorm,love
5,blizzard,nlsnowstorm2020,emergency,john,safe,warning,house,winter,dog
6,update,yytsoe,closed,community,people,emergency,door,day,snowmageddon2020
7,day,john,storm,food,friend,stay safe warm,street,john,john
8,weather,day,street,supply,hoping,snow,power,blizzard2020,morning
9,canada,snowmegeddon2020,outage,snow,storm,stay,storm,weather,даниила егорова pin


In [279]:
# Topic table (10 words per topic) built from the TF-ICF components that also
# use attention.
topics_tfidf_attn = topics_df(topics=n_topics, components=components_tfidf_attn, n_words=10)

# Last expression of the cell, so the table is rendered.
topics_tfidf_attn

Unnamed: 0,topic 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6,topic 7,topic 8
0,snow,snowmageddon2020,power,assistance,hope,stay safe,snow,snow,snow
1,storm,snow,road,people,thinking,safe,people,storm,people
2,emergency,snowstorm,closed,helping,prayer,storm,car,blizzard,love
3,wind,canada,power outage,emergency,stay safe,warning,storm,snowstorm,day
4,blizzard,nlsnowstorm2020,snow,support,hoping,stay,power,snowmageddon2020,time
5,john,love,storm,community,safe,envcanada advisory blowingsnow,road,winter,storm
6,update,day,emergency,food,hope safe,emergency,door,weather,shovel
7,snowfall,blizzard,outage,supply,friend,blizzard warning,house,blizzard2020,dog
8,weather,john,john,snow,people,stay safe warm,street,day,neighbour
9,day,wow,lost power,service,storm,snow,john,newfoundlandstorm,friend
