In [6]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
import string
import numpy as np
import random
from bertopic import BERTopic
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
import pandas as pd
import keras
from pathlib import Path

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\steem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\steem\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Cleaning article content

In [7]:
custom_words_to_filter = ['Reuters', 'Reuters.com', 'CNET']

lemmatizer = nltk.stem.WordNetLemmatizer()
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
stop_words = nltk.corpus.stopwords.words('english')
words_to_filter = np.concatenate((stop_words, custom_words_to_filter))

def clean_text(text):
    text = "".join([char for char in text if char not in string.punctuation])
    text = lemmatizer.lemmatize(text)
    text = tokenizer.tokenize(text)
    text = [word for word in text if word not in words_to_filter]
    text = ' '.join(text)
    text = text.lower()

    return text

In [12]:
docs_StarWars = ["Count Dooku Voice Actor Corey Burton Tried Something New on Star Wars: Tales of the Jedi", "'Andor' Episode 8 Explained: 'Rogue One' Cameos and a 'Star Wars' Hell Prison - CNET", "Andor Gave Us the Gayest Screen Fade in Star Wars History", "Ahsoka Tano Herself, Ashley Eckstein, Breaks Down Star Wars: Tales of the Jedi", "Star Wars: The Deckbuilding Game could rule the galaxy of 2-player card games", "There's a new Star Wars project from Damon Lindelof in the works", "Star Wars Fatigue Shouldn't Stop You From Watching 'Andor' - CNET", "'Andor' is the best 'Star Wars' show since 'The Mandalorian' — but the least popular", "Star Wars characters take over Mexico City - Reuters"]

docs_China = ["Alarmed by suicide attack, China and Pakistan join hands in probe - Reuters", "Death of boy in lockdown fuels backlash against China's zero-Covid policy", "UPDATE 2-Hong Kong stocks tumble as Xi appointments fan economic fears; yuan weakens", "China supports central SOEs to issue tech innovation bonds - Reuters"]

docs_WorldCup = ["World Cup: FIFA head comments on beer ban - CP24", "T20 World Cup: Bangladesh bowlers impress to secure victory over Netherlands", "Canada's goalkeeper Crepeau to miss World Cup with broken leg - Reuters", "World Cup stadium alcohol ban emblematic of contradictions", "T20 World Cup: Pakistan beat Netherlands to avoid elimination"]

docs_Ukraine = ["Ukraine war: Wagner chief Prigozhin defends brutal killing video", "Chance for peace in Ukraine, says France's Macron - Reuters", "Ukraine nuclear agency thickens alleged dirty bomb plot - CTV News", "Ukraine war round-up: Missile blast in Poland and Zambian family's grief", "Ukraine round-up: Refugees urged to stay away and 'dirty bomb' claims", "Russia says Ukraine hands over 50 prisoners of war - Reuters.com"]

docs = [docs_StarWars, docs_China, docs_WorldCup, docs_Ukraine]
docs = list(np.concatenate(docs).flat)

# data_path = ( Path.cwd() / "data/RoundedTestDataset").resolve()
# testdata = pd.DataFrame(docs, columns=['title'])
# testdata.to_csv(data_path, encoding='utf-8', index=False)

docs = [clean_text(doc) for doc in docs]
print(docs)

['count dooku voice actor corey burton tried something new star wars tales jedi', 'andor episode 8 explained rogue one cameos star wars hell prison', 'andor gave us gayest screen fade star wars history', 'ahsoka tano herself ashley eckstein breaks down star wars tales jedi', 'star wars the deckbuilding game could rule galaxy 2player card games', 'theres new star wars project damon lindelof works', 'star wars fatigue shouldnt stop you from watching andor', 'andor best star wars show since the mandalorian least popular', 'star wars characters take mexico city', 'alarmed suicide attack china pakistan join hands probe', 'death boy lockdown fuels backlash chinas zerocovid policy', 'update 2hong kong stocks tumble xi appointments fan economic fears yuan weakens', 'china supports central soes issue tech innovation bonds', 'world cup fifa head comments beer ban cp24', 't20 world cup bangladesh bowlers impress secure victory netherlands', 'canadas goalkeeper crepeau miss world cup broken leg', 

In [10]:
USE_DATASET = False

if USE_DATASET:
    data = pd.read_csv('stance_data.csv')
    data = data.rename(columns={'post': 'title', 'new_topic': 'subjects', 'label': 'stance'})
    data['stance'] = data['stance'].astype('category')
    # convert from 0=con, 1=pro, 2=neutral to 0=neg, 1=neu, 2=pro
    neu = data['stance'] == 1
    data.loc[data['stance'] == 2, 'stance'] = 1
    data.loc[neu, 'stance'] = 2
else:
    data = pd.DataFrame(docs, columns=['title'])
    label_options = ['world cup', 'Andor', 'China', 'Ukraine'] # Ukraine subjects reflecting positive outcome of war (pos = could end 'well')
    subjects = len(docs_StarWars)*['Andor'] + len(docs_China)*['China'] + len(docs_WorldCup)*['world_cup'] + len(docs_Ukraine)*['Ukraine']
    stance = ['neu', 'neg', 'pos', 'neg', 'neu', 'pos', 'neg', 'pos', 'neu', 'neg', 'neu', 'neg', 'pos', 'neg', 'neu', 'neu', 'pos', 'neu', 'neg', 'neu', 'pos', 'pos', 'neg', 'neg']
    data['stance'] = stance
    data['subjects'] = subjects

    data = data.sample(frac=1).reset_index(drop=True)

    # Note that the categorical index has to start at 0 (so -1 for negative stance isn't allowed)
    data.loc[ data['stance'] == 'neg', 'stance' ] = 0
    data.loc[ data['stance'] == 'neu', 'stance' ] = 1
    data.loc[ data['stance'] == 'pos', 'stance' ] = 2

    data['title'] = data['title'].astype('str')
    data['subjects'] = data['subjects'].astype('str')
    data['stance'] = data['stance'].astype('category')

    # data_path = ( Path.cwd() / "data/RoundedDataset").resolve()
    # data.to_csv(data_path, encoding='utf-8', index=False)

pd.set_option('display.max_colwidth', None)
display(data)
print(data.dtypes)

Unnamed: 0,title,stance,subjects
0,death boy lockdown fuels backlash chinas zerocovid policy,1,China
1,canadas goalkeeper crepeau miss world cup broken leg,1,world_cup
2,star wars characters take mexico city,1,Andor
3,star wars fatigue shouldnt stop you from watching andor,0,Andor
4,alarmed suicide attack china pakistan join hands probe,0,China
5,t20 world cup bangladesh bowlers impress secure victory netherlands,1,world_cup
6,ukraine roundup refugees urged stay away dirty bomb claims,0,Ukraine
7,russia says ukraine hands 50 prisoners war reuterscom,0,Ukraine
8,world cup stadium alcohol ban emblematic contradictions,2,world_cup
9,count dooku voice actor corey burton tried something new star wars tales jedi,1,Andor


title         object
stance      category
subjects      object
dtype: object


Label topics

In [40]:
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
embeddings = sentence_model.encode(data['title'])

# TODO: try n_gram_range=(a, b) inside BERTopic
cluster_model = AgglomerativeClustering(linkage='ward', distance_threshold=1.5, n_clusters=None)
topic_model = BERTopic(hdbscan_model=cluster_model).fit(data['title'], embeddings)
topics, probs = topic_model.fit_transform(data['title'])

topic_labels = topic_model.generate_topic_labels(nr_words=3,
                                                 topic_prefix=False,
                                                 word_length=15,
                                                 separator=", ")
topic_model.set_topic_labels(topic_labels)
# topic_model.save('topic_model')

topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName
0,0,9,0_wars_star_andor_jedi,"wars, star, andor"
1,1,6,1_ukraine_war_says_roundup,"ukraine, war, says"
2,2,5,2_cup_world_netherlands_t20,"cup, world, netherlands"
3,3,4,3_china_2hong_yuan_fuels,"china, 2hong, yuan"


In [41]:
topic_labels_series = pd.Series(topic_labels)
docs_topic = topic_labels_series[topics].tolist()
data['predicted_topic'] = docs_topic
display(data)

Unnamed: 0,title,stance,subjects,predicted_topic
0,andor gave us gayest screen fade star wars history,2,Andor,"wars, star, andor"
1,star wars characters take mexico city,1,Andor,"wars, star, andor"
2,t20 world cup bangladesh bowlers impress secure victory netherlands,1,world_cup,"cup, world, netherlands"
3,china supports central soes issue tech innovation bonds,2,China,"china, 2hong, yuan"
4,count dooku voice actor corey burton tried something new star wars tales jedi,1,Andor,"wars, star, andor"
5,andor episode 8 explained rogue one cameos star wars hell prison,0,Andor,"wars, star, andor"
6,ukraine war roundup missile blast poland zambian familys grief,2,Ukraine,"ukraine, war, says"
7,theres new star wars project damon lindelof works,2,Andor,"wars, star, andor"
8,ukraine war wagner chief prigozhin defends brutal killing video,0,Ukraine,"ukraine, war, says"
9,ahsoka tano herself ashley eckstein breaks down star wars tales jedi,0,Andor,"wars, star, andor"


In [42]:
topic_model.visualize_documents(docs, embeddings=embeddings, custom_labels=True)

Stance

In [43]:
# load model
model_path = ( Path.cwd() / "models/stance_model_v2").resolve()
model = keras.models.load_model(model_path)

# inference
probs = model.predict(
    x=[
        data['title'],
        data['predicted_topic']
    ]
)
predicted_class = np.argmax(probs, axis=1)



In [13]:
display(probs)
display(predicted_class)

NameError: name 'probs' is not defined