In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Hyperparameters 

threat_path = '.../nuclear_threat.csv'

by_column = "text" #choose a column on which the data will be trained: "text", "text-medium", "text-short"
side = 'r' # w -- West, r -- Russia
assume = ('W', 'E') #must be a tuple of (label_to_change, suggested_label_instead). "W" -- Warning, "E" -- Escalatory, "D" -- De-escalatory

to_expand = True #if True -- trained sentence-wise
n_epochs = 10 #no. of epochs that model will be trained on
test_on = by_column

#Expand Data

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Tokenize the text into words
    tokens = word_tokenize(text.lower())
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Remove stop words
    stop_words = set(stopwords.words('english')).difference({"not", "no"})
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return " ".join(tokens)

# Example usage
text = "Hello, how not are you doing today? I hope everything is going well!"
preprocessed_text = preprocess_text(text)
print(preprocessed_text)


hello not today hope everything going well


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def expand(data, type, side=None, binary_assumption=None, sent_level = to_expand):
  expanded_data = pd.DataFrame(columns=data.columns)

  if sent_level:
    #if sent_level is True, then we will split the text in a given cell
    #into sentences and split one row into multiple rows of corresponding values
    
    for index, row in data.iterrows():
        text = row[type]
        label = row["label"]
        
        # Split the text into sentences
        sentences = text.split(".")
        
        # Create a new row for each sentence
        for sentence in sentences:
            if sentence.strip() != "":
                sentence = preprocess_text(sentence)
                expanded_data = pd.concat([expanded_data, pd.DataFrame({**row, type: sentence.strip()}, index=[0])], ignore_index=True)
  else:
    expanded_data = data

  if side:
    expanded_data = expanded_data[expanded_data['side'] == side]
  if binary_assumption:
    prev_label, new_label = binary_assumption[0], binary_assumption[1]
    expanded_data.loc[expanded_data['label'] == prev_label, 'label'] = new_label
  return expanded_data

# Text Classification with SpaCy

In [None]:
from tkinter.constants import N
import pandas as pd

nuclear_threat = pd.read_csv(threat_path)
nuclear_threat = expand(nuclear_threat, by_column, side, binary_assumption=assume, sent_level=to_expand)
len(nuclear_threat)

3188

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Splitting the DataFrame into train and test sets
nuclear_threat_train, nuclear_threat_test = train_test_split(nuclear_threat, test_size=0.1, random_state=40)

print("Train DataFrame:")
print(len(nuclear_threat_train))
print("\nTest DataFrame:")
print(len(nuclear_threat_test))
nuclear_threat_test

Train DataFrame:
2869

Test DataFrame:
319


Unnamed: 0,date,text,text-medium,text-short,side,label,comment
4967,2022-04-25,better time comprehensive project within russi...,"In a lengthy April 25 interview, Lavrov somewh...",Lavrov: The risks of nuclear war are “very sig...,r,E,
686,2022-02-22,later began assure u accession nato central ea...,"In a televised address on February 21, Putin a...",Putin: Ukraine is seeking to acquire tactical ...,r,E,
3256,2022-03-16,focus protecting mother child supporting famil...,"Russian rhetoric, meanwhile, remained aggressi...","Putin: The West wants to “finish off” Russia,...",r,E,
2,2022-01-27,no need review russia ’ nuclear doctrine docum...,"A month before the invasion, the Russian leade...",Medvedev: Russia has “the right” to use nuclea...,r,E,
3140,2022-03-16,rendered aid simply could not otherwise,"Russian rhetoric, meanwhile, remained aggressi...","Putin: The West wants to “finish off” Russia,...",r,E,
...,...,...,...,...,...,...,...
5691,2022-04-29,`` massacre civilian bucha lavrov reply `` mar...,"The same day, April 29, Lavrov qualified his e...",Lavrov: Russian policymakers are “champions o...,r,D,
6588,2022-08-05,26 october 2020 president russian federation v...,"On August 5, Russia reiterated the defensive n...",Russia’s nuclear policy is “aimed exclusively ...,r,D,
5692,2022-04-29,mayor bucha declared victory saying bucha retu...,"The same day, April 29, Lavrov qualified his e...",Lavrov: Russian policymakers are “champions o...,r,D,
1968,2022-03-03,cia extensive presence time,"On March 2, Russian Foreign Ministry spokeswom...","Lavrov: Thought of nuclear war is on Western,...",r,D,


# Building a Bag of Words model


In [None]:
import spacy
# Create an empty model
nlp = spacy.blank("en")

In [None]:
if assume:
  #binary classification
  # Add the TextCategorizer to the empty model
  textcat = nlp.add_pipe("textcat")
  # Add labels to text classifier
  textcat.add_label("E")
  textcat.add_label("D")
else:
  #multiple labels
  textcat = nlp.add_pipe("textcat_multilabel")
  textcat.add_label("E")
  textcat.add_label("D")
  textcat.add_label("W")

# Training a Text Categorizer Model

In [None]:
train_texts = nuclear_threat_train[by_column].values
if assume:
  train_labels = [{'cats': {'E': label == 'E',
                            'D': label == 'D'}} 
                  for label in nuclear_threat_train['label']]
else:
  train_labels = [{'cats': {'E': label == 'E',
                          'D': label == 'D', 'W':label == 'W'}} 
                for label in nuclear_threat_train['label']]

Then we combine the texts and labels into a single list.

In [None]:
train_data = list(zip(train_texts, train_labels))
train_data[:10]

[('regard question rhetorical', {'cats': {'E': False, 'D': True}}),
 ('question short still chose read piece paper',
  {'cats': {'E': False, 'D': True}}),
 ('always remember', {'cats': {'E': True, 'D': False}}),
 ('policy confrontation', {'cats': {'E': True, 'D': False}}),
 ('enticing ukraine idea one day country become part west defeat putin',
  {'cats': {'E': False, 'D': True}}),
 ('would like point situation not considered separately development past 30 year full various event relation russia west west rest world particular united state',
  {'cats': {'E': False, 'D': True}}),
 ('experience serve good lesson u shown u paralysis power first step towards complete degradation oblivion',
  {'cats': {'E': True, 'D': False}}),
 ('ten year ago', {'cats': {'E': True, 'D': False}}),
 ('domestic sourcing not complete achieve mean',
  {'cats': {'E': True, 'D': False}}),
 ('not ignore thing either especially know so-called west act regard russia',
  {'cats': {'E': True, 'D': False}})]

In [None]:
from spacy.util import minibatch
from spacy.training.example import Example

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# Create the batch generator with batch size = 8
batches = minibatch(train_data, size=8)
# Iterate through minibatches
for batch in batches:
    # Each batch is a list of (text, label) 
    for text, labels in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, labels)
        nlp.update([example], sgd=optimizer)

This is just one training loop (or epoch) through the data. The model will typically need multiple epochs. Use another loop for more epochs, and optionally re-shuffle the training data at the begining of each loop. 

In [None]:
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(n_epochs):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        for text, labels in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, labels)
            nlp.update([example], sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 619.4779936905276}
{'textcat': 1176.695700145588}
{'textcat': 1561.2454800420169}
{'textcat': 1813.8168988087452}
{'textcat': 1970.361529583255}
{'textcat': 2076.159996243773}
{'textcat': 2167.188737259952}
{'textcat': 2252.380002380052}
{'textcat': 2309.943610894378}
{'textcat': 2374.368548135213}


# Making Predictions

In [None]:
texts = nuclear_threat_test[test_on]
docs = [nlp.tokenizer(text) for text in texts]

# Use textcat to get the scores for each doc
if assume:
  textcat = nlp.get_pipe('textcat')
else:
  textcat = nlp.get_pipe('textcat_multilabel')
scores = textcat.predict(docs)

The scores are used to predict a single class or label by choosing the label with the highest probability. You get the index of the highest probability with `scores.argmax`, then use the index to get the label string from `textcat.labels`.

In [None]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
predicted_labels = [textcat.labels[label] for label in predicted_labels]

Examples

In [None]:
# texts = ["russia has a right for defensive nuclear alert"]
# docs = [nlp.tokenizer(text) for text in texts]
    
# # Use textcat to get the scores for each doc
# textcat = nlp.get_pipe('textcat')
# scores = textcat.predict(docs)
# # From the scores, find the label with the highest score/probability
# predicted_labels = scores.argmax(axis=1)
# predicted_labels = [textcat.labels[label] for label in predicted_labels]
# predicted_labels

#Evaluation. Confusion matrix

In [None]:
labels_test = list(nuclear_threat_test['label'])
len(labels_test)

319

In [None]:
tp = 0
tn = 0
fp = 0
fn = 0

for i in range(len(labels_test)):
  print(40*"-")
  print("Event ", i, predicted_labels[i], labels_test[i])
  if predicted_labels[i] == "E" and labels_test[i] == "E":
    tp += 1
    print("TP")
  if predicted_labels[i] != "E" and labels_test[i] != "E":
    tn += 1
    print("TN")
  if predicted_labels[i] == "E" and labels_test[i] != "E":
    fp += 1
    print("FP")
  if predicted_labels[i] != "E" and labels_test[i] == "E":
    fn += 1
    print("FN")
  print(docs[i])
tp, tn, fp, fn

----------------------------------------
Event  0 E E
TP
better time comprehensive project within russia-nato council work together promote settlement afghanistan
----------------------------------------
Event  1 E E
TP
later began assure u accession nato central eastern european country would improve relation moscow relieve country fear steeped bitter historical legacy even create belt country friendly towards russia
----------------------------------------
Event  2 E E
TP
focus protecting mother child supporting family child
----------------------------------------
Event  3 E E
TP
no need review russia ’ nuclear doctrine document make possible achieve various goal deputy chairman russian security council dmitry medvedev said interview russian medium outlet including ta
----------------------------------------
Event  4 E E
TP
rendered aid simply could not otherwise
----------------------------------------
Event  5 D D
TN
far could putin go concern mass mobilization could accompanied i

(202, 33, 55, 29)

In [None]:
def calculate_f1_score(tp, tn, fp, fn):
    if tp+fp == 0:
      precision = 0.0
    else:
      precision = tp / (tp + fp)
    
    if tp+fn == 0:
      recall = 0.0
    else:
      recall = tp / (tp + fn)
    if precision + recall == 0:
      f1_score = 0.0
    else:
      f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

def calculate_accuracy(tp, tn, fp, fn):
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    return accuracy

print("F1: ", calculate_f1_score(tp, tn, fp, fn))
print("Accuracy: ", calculate_accuracy(tp, tn, fp, fn))

F1:  0.8278688524590164
Accuracy:  0.7366771159874608


#Top words associated with each class

In [None]:
import pandas as pd
import re

df = nuclear_threat
# Assuming 'df' is your dataframe and 'text_column' is the column containing the text data
text_data = ' '.join(df['text'].astype(str).tolist())

# Tokenize the text into individual words
tokens = nltk.word_tokenize(text_data)

# Generate ngrams from the tokens
bigrams = nltk.ngrams(tokens, 5)

# Get unique bigrams using a set
unique_bigrams = set(bigrams)

# Convert the set back to a list
unique_bigrams_list = list(unique_bigrams)
unique_bigrams_list = [" ".join(b) for b in unique_bigrams]

texts = unique_bigrams_list
docs = [nlp.tokenizer(text) for text in texts]

# Use textcat to get the scores for each doc
if assume:
  textcat = nlp.get_pipe('textcat')
else:
  textcat = nlp.get_pipe('textcat_multilabel')
scores = textcat.predict(docs)

dictionary = {k: v[0] for k, v in zip(texts, scores)} #v[0] -- escalation; v[1] -- de-escalation

sorted_dict = dict(sorted(dictionary.items(), key=lambda x: x[1],reverse=True))

top_k = 50
i = 0
for key, value in sorted_dict.items():
    print(key, value)
    i+=1
    if i == top_k:
      break

regime outwardly looked wonderful attractive 1.0
also touched upon topical international 1.0
also find new joint solution 1.0
u guarantee not happen also 1.0
also generally accepted norm morality 1.0
statement unit involved also practiced 1.0
must also find new joint 1.0
west would provided endless support 1.0
account russian company individual also 1.0
immediate responsibility especially training also 1.0
mind russia belarus largest supplier 1.0
people russia said wednesday force 1.0
minsk agreement signed approved un 1.0
economy global trade whole suffered 1.0
inherited not soviet era also 1.0
everyone russia everyone blaming russia 1.0
principle committed risk quite high 1.0
according statement unit involved also 1.0
gorbachev reagan said 1987 accept 1.0
lost industrial technological potential – 0.9999999
whelan brittney griner realistic proposal 0.9999999
done blatantly stated united state 0.9999999
important people business people aware 0.9999999
everyone ’ mantra third world 0.99