In [63]:
import hdbscan
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import transformers
import umap

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from plotnine import *

In [2]:
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    TFAutoModelForSequenceClassification,
    AdamW,
    glue_convert_examples_to_features,
    pipeline
)

In [34]:
model = AutoModel.from_pretrained('./huggingface_model/', from_tf=True)

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [35]:
tokenizer = AutoTokenizer.from_pretrained('digitalepidemiologylab/covid-twitter-bert')

In [36]:
pipe = pipeline("feature-extraction", model=model, tokenizer=tokenizer)

In [37]:
df = pd.read_csv('labeled/labeled.tsv', sep = '\t')
df

Unnamed: 0,id,label,text
0,1344688894038695939,0,We are excited to receive news that we will be...
1,1344689204899565571,0,Chris Rock excited to get the COVID vaccine: ‘...
2,1344689375079247872,0,"If you're still excited about the new year, yo..."
3,1344689636615073793,1,"A new strain, more contagious....yet the same ..."
4,1344689691052838915,0,Covid vaccine first dose: done! Excited to be ...
...,...,...,...
12687,1413118303283810311,0,Ooh! Potential #CoronaVac by side-effect: I'm ...
12688,1413142807024336909,0,"This rocks, very excited to get the vaccine as..."
12689,1413143486568681478,0,We're excited to announce a COVID-19 vaccinati...
12690,1413129569662554117,0,Dec 2020: MOH: Vaccines are here! FPs: Yay! We...


In [38]:
X_train, X_test, y_train, y_test = train_test_split(df['text'].to_list(), df['label'], test_size=0.2, random_state=42)

In [39]:
stop_words = stopwords.words('english')

In [41]:
text = []

for sent in X_train:
    s = ""
    for w in sent.lower().split():
        if w not in stop_words:
            s = s + " " + w
    s = re.sub(pattern='^ +', repl='', string=s)
    text.append(s)

In [42]:
embeddings = pipe(text, batch_size=128, truncation="only_first")

In [43]:
emb = [np.mean(e[0], axis=0) for e in embeddings]

In [45]:
umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5, 
                            metric='cosine').fit_transform(emb)

In [46]:
cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(umap_embeddings)

In [47]:
umap_data = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine').fit_transform(emb)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

In [48]:
result['antivax'] = df['label']

In [66]:
result.to_csv('bertopic_result.csv', index=False)

In [51]:
docs_df = pd.DataFrame(X_train, columns=["Doc"])
docs_df['Topic'] = cluster.labels_
docs_df['Doc_ID'] = range(len(docs_df))
docs_per_topic = docs_df.groupby(['Topic'], as_index = False).agg({'Doc': ' '.join})

In [52]:
def c_tf_idf(documents, m, ngram_range=(1, 1)):
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
  
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(X_train))

In [53]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names()
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['Topic'])
                     .Doc
                     .count()
                     .reset_index()
                     .rename({"Topic": "Topic", "Doc": "Size"}, axis='columns')
                     .sort_values("Size", ascending=False))
    return topic_sizes

top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); topic_sizes.head(10)



Unnamed: 0,Topic,Size
5,4,6533
4,3,3514
0,-1,41
2,1,24
3,2,22
1,0,19


In [54]:
top_n_words[0][:10]

[('free', 0.32796258150643554),
 ('nomandates', 0.24095186202919094),
 ('noforcedcovidvaccines', 0.24095186202919094),
 ('belong', 0.24095186202919094),
 ('unconstitutional', 0.23879733201748557),
 ('unethical', 0.23482587133858748),
 ('tracked', 0.22493630926331618),
 ('mandates', 0.22214288237503843),
 ('bodies', 0.2208201496992256),
 ('choose', 0.21954226415934402)]

In [55]:
top_n_words[1][:10]

[('incarceration', 0.28126645924007565),
 ('licenses', 0.28126645924007565),
 ('ubiquitous', 0.28126645924007565),
 ('isolation', 0.2756041029988635),
 ('pointless', 0.27386107580286856),
 ('universal', 0.27386107580286856),
 ('movement', 0.2689950278909609),
 ('worst', 0.2568778028251933),
 ('passports', 0.2492536376479497),
 ('completely', 0.2463357365690436)]

In [56]:
top_n_words[2][:10]

[('excuse', 0.6483243386689037),
 ('herd', 0.6261577089892524),
 ('using', 0.5360632085501423),
 ('immunity', 0.519847734572914),
 ('kids', 0.5149098787563685),
 ('therapy', 0.27972714812633453),
 ('gene', 0.2763384846640605),
 ('voodoo', 0.04714020622508993),
 ('growth', 0.04331066379105709),
 ('tea', 0.042077826490978576)]

In [57]:
top_n_words[3][:10]

[('experimental', 0.04287535124971336),
 ('gene', 0.03702586690966361),
 ('therapy', 0.0362531361161765),
 ('depopulation', 0.02636350389124544),
 ('people', 0.026053136693860016),
 ('covid', 0.024432943000661463),
 ('vaccines', 0.023302585525732195),
 ('mrna', 0.02132436977416879),
 ('virus', 0.01941517869809748),
 ('untested', 0.018342466594300503)]

In [58]:
top_n_words[4][:10]

[('got', 0.04081448407990466),
 ('dose', 0.0368480942854966),
 ('vaccinated', 0.03203865787898253),
 ('worry', 0.0308963590418855),
 ('today', 0.029836231125362517),
 ('covid', 0.029479925221600305),
 ('second', 0.027756822383957536),
 ('just', 0.027213762325878788),
 ('don', 0.026754458065144093),
 ('grateful', 0.026472424097737878)]

In [83]:
mom_df = pd.read_csv('labeled/mom_post_label.csv', encoding= 'unicode_escape')

In [89]:
mom_df = mom_df[mom_df['antivax'] != -1]

In [147]:
mom_df['antivax'].value_counts()

0.0    27
1.0     8
Name: antivax, dtype: int64

In [157]:
# Small function only used for formatting the output
def format_prediction(preds, label_mapping, label_name):
    preds = tf.nn.softmax(preds.detach(), axis=1)
    formatted_preds = []
    for pred in preds.numpy():
        # convert to Python types and sort
        pred = {label: float(probability) for label, probability in zip(label_mapping.values(), pred)}
        pred = {k: v for k, v in sorted(pred.items(), key=lambda item: item[1], reverse=True)}
        formatted_preds.append({label_name: list(pred.keys())[0], f'{label_name}_probabilities': pred})
    return formatted_preds

In [153]:
tf.nn.softmax(preds[0][0].detach(), axis=1)

<tf.Tensor: shape=(96, 1024), dtype=float32, numpy=
array([[0.00014324, 0.0006547 , 0.00025379, ..., 0.00201751, 0.00134345,
        0.00074256],
       [0.00030995, 0.00067512, 0.00086431, ..., 0.00237262, 0.00112358,
        0.00120067],
       [0.00053847, 0.00178776, 0.00171518, ..., 0.00288228, 0.00117129,
        0.00028236],
       ...,
       [0.00034234, 0.00216193, 0.00021207, ..., 0.00135204, 0.0006121 ,
        0.00072489],
       [0.00018248, 0.00033556, 0.00022555, ..., 0.0005103 , 0.00141485,
        0.00126999],
       [0.00014324, 0.00065471, 0.00025379, ..., 0.00201749, 0.00134341,
        0.00074256]], dtype=float32)>

In [93]:
# Map the labels for printing
label_mapping = {
    "0": 0,
    "1": 1
}

In [96]:
max_seq_length = 96 #@param {type: "integer"}

In [120]:
def encode_fn(text_list):
  #將text_list embedding成bert模型可用的輸入形式
  #text_list:['我愛你','貓不是狗']
    t_tokenizer = tokenizer(
        text_list,
        padding = True,
        truncation = True,
        max_length = max_seq_length,
        return_tensors='tf'  # 返回的型別為pytorch tensor
    )
    input_ids = t_tokenizer['input_ids']
    token_type_ids = t_tokenizer['token_type_ids']
    attention_mask = t_tokenizer['attention_mask']
    return input_ids, token_type_ids, attention_mask

In [115]:
input_ids, _, _ = encode_fn(mom_df['Message'].to_list())

In [121]:
preds = model(input_ids)

In [117]:
label_name = 'antivax'

In [162]:
formatted_preds = format_prediction(preds[0], label_mapping, 'antivax')

TypeError: only size-1 arrays can be converted to Python scalars

In [163]:
tf.nn.softmax(preds[0].detach(), axis=1)

<tf.Tensor: shape=(36, 96, 1024), dtype=float32, numpy=
array([[[0.00415969, 0.00924361, 0.00364814, ..., 0.02224816,
         0.01136238, 0.01898837],
        [0.00791513, 0.0083819 , 0.01092547, ..., 0.0230077 ,
         0.00835636, 0.02699903],
        [0.01345911, 0.02172535, 0.02122122, ..., 0.02735737,
         0.00852646, 0.00621477],
        ...,
        [0.00879401, 0.02700099, 0.00269667, ..., 0.01318887,
         0.0045794 , 0.01639715],
        [0.00463676, 0.00414553, 0.00283692, ..., 0.00492395,
         0.01047049, 0.0284161 ],
        [0.00415973, 0.00924374, 0.00364814, ..., 0.02224805,
         0.0113621 , 0.01898838]],

       [[0.00498838, 0.02577468, 0.00342259, ..., 0.01112231,
         0.01365386, 0.01257421],
        [0.01687934, 0.01438432, 0.02723755, ..., 0.00639525,
         0.01161274, 0.00954325],
        [0.00628709, 0.02311921, 0.00563783, ..., 0.02287043,
         0.01258226, 0.00873444],
        ...,
        [0.00498838, 0.02577453, 0.00342259, ..., 0.