In [2]:
import pandas as pd
import json
import re
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")

# Fetch NLTK's stopword corpus and build the English stopword list,
# extended with a few extra tokens that should also be filtered out.
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Load the processed UN speeches. The encoding is named explicitly because
# open() otherwise falls back to the platform's locale encoding, which is not
# UTF-8 everywhere and would garble or reject non-ASCII characters.
with open('../data/processed/UN_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Convert JSON data to DataFrame
articles = pd.DataFrame(data)

def preprocess_text(text):
    """Strip basic punctuation from ``text`` and lowercase it.

    Every comma, period, exclamation mark and question mark is deleted
    (not replaced by a space), then the result is lowercased.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw string: the previous '[,\.!?]' literal relied on the invalid escape
    # sequence '\.', which newer Python versions warn about. Inside a
    # character class the dot is literal and needs no escaping.
    text = re.sub(r'[,.!?]', '', text)
    return text.lower()

# Clean every raw speech and keep the result next to the original text.
cleaned_text = articles['text'].apply(preprocess_text)
articles['text_processed'] = cleaned_text

# Tokenize and remove stopwords
def remove_stopwords(texts, custom_stop_words=None):
    """Tokenize each document on word characters and drop stopwords.

    Args:
        texts: Iterable of document strings.
        custom_stop_words: Optional iterable of words to drop. When omitted,
            the module-level ``stop_words`` list is used.

    Returns:
        A list with one string per input document: the surviving tokens,
        in their original order, joined by single spaces.
    """
    # Build the lookup set once per call; testing each token against the
    # stopword list would rescan the whole list for every token.
    banned = set(stop_words if custom_stop_words is None else custom_stop_words)
    return [
        " ".join(token for token in re.findall(r'\b\w+\b', doc) if token not in banned)
        for doc in texts
    ]

# Pull the cleaned speeches out as a plain list and filter the stopwords.
data = articles['text_processed'].tolist()
data_processed = remove_stopwords(texts=data)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ritutoshniwal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Overwrite the punctuation-only cleanup with the stopword-filtered text,
# then preview the first rows.
processed_column = 'text_processed'
articles[processed_column] = data_processed
articles.head(n=5)

Unnamed: 0,session,year,country,text,label,text_processed
0,57,2002,PAN,﻿Allow me\nto begin my statement by expressing...,135,allow begin statement expressing pleasure gove...
1,57,2002,IND,﻿I\ncongratulate Mr. Kavan on his election as ...,77,congratulate mr kavan election president gener...
2,57,2002,MDV,"﻿Mr. President, it gives\nme great pleasure to...",110,mr president gives great pleasure join others ...
3,57,2002,DJI,﻿In a year\nof frightful tragedy and uncertain...,45,year frightful tragedy uncertainty least fortu...
4,57,2002,NLD,"﻿The smoke\nat ground zero, only a few blocks ...",128,smoke ground zero blocks away cleared empty sp...


In [4]:
import pycountry

# Coalition-of-the-willing members, keyed by ISO 3166-1 alpha-3 code.
# Membership is decided on these codes (the same codes stored in
# articles['country']) rather than on pycountry display names: the display
# names differ between pycountry releases (for example, older releases list
# "Turkey" rather than "Türkiye"), and a name mismatch would silently label a
# member country as 0.
coalition_of_willing_by_alpha3 = {
    "USA": "United States",
    "AFG": "Afghanistan",
    "ALB": "Albania",
    "AUS": "Australia",
    "AZE": "Azerbaijan",
    "BGR": "Bulgaria",
    "COL": "Colombia",
    "CZE": "Czechia",
    "DNK": "Denmark",
    "SLV": "El Salvador",
    "ERI": "Eritrea",
    "EST": "Estonia",
    "ETH": "Ethiopia",
    "GEO": "Georgia",
    "HUN": "Hungary",
    "ISL": "Iceland",
    "ITA": "Italy",
    "JPN": "Japan",
    "KOR": "Korea, Republic of",
    "LVA": "Latvia",
    "LTU": "Lithuania",
    "MKD": "North Macedonia",
    "NLD": "Netherlands",
    "NIC": "Nicaragua",
    "PHL": "Philippines",
    "POL": "Poland",
    "ROU": "Romania",
    "SVK": "Slovakia",
    "ESP": "Spain",
    "TUR": "Türkiye",
    "GBR": "United Kingdom",
    "UZB": "Uzbekistan",
    "CRI": "Costa Rica",
    "DOM": "Dominican Republic",
    "HND": "Honduras",
    "KWT": "Kuwait",
    "MHL": "Marshall Islands",
    "FSM": "Micronesia, Federated States of",
    "MNG": "Mongolia",
    "PLW": "Palau",
    "PRT": "Portugal",
    "RWA": "Rwanda",
    "SGP": "Singapore",
    "SLB": "Solomon Islands",
    "UGA": "Uganda",
    "PAN": "Panama",
    "AGO": "Angola",
    "TON": "Tonga",
    "UKR": "Ukraine",
}
# Same names, in the same order, as the original hand-written list.
coalition_of_willing_countries_list = list(coalition_of_willing_by_alpha3.values())

countries = articles['country'].unique()
country_names = {country.alpha_3: country.name for country in pycountry.countries}
# Add readable country names; codes missing from pycountry map to NaN.
articles['country_name'] = articles['country'].map(country_names)
# 1 when the speech's country code belongs to the coalition, otherwise 0.
articles['coalition_of_willing'] = (
    articles['country'].isin(set(coalition_of_willing_by_alpha3)).astype(int)
)
articles.head()

Unnamed: 0,session,year,country,text,label,text_processed,country_name,coalition_of_willing
0,57,2002,PAN,﻿Allow me\nto begin my statement by expressing...,135,allow begin statement expressing pleasure gove...,Panama,1
1,57,2002,IND,﻿I\ncongratulate Mr. Kavan on his election as ...,77,congratulate mr kavan election president gener...,India,0
2,57,2002,MDV,"﻿Mr. President, it gives\nme great pleasure to...",110,mr president gives great pleasure join others ...,Maldives,0
3,57,2002,DJI,﻿In a year\nof frightful tragedy and uncertain...,45,year frightful tragedy uncertainty least fortu...,Djibouti,0
4,57,2002,NLD,"﻿The smoke\nat ground zero, only a few blocks ...",128,smoke ground zero blocks away cleared empty sp...,Netherlands,1


In [5]:
# Model inputs: the processed speech text and the binary coalition label,
# both extracted as plain Python lists.
X_text, y = (
    articles[column].tolist()
    for column in ('text_processed', 'coalition_of_willing')
)

In [6]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# The tokenizer and the pre-trained encoder are loaded from the same checkpoint.
checkpoint_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)
model = RobertaModel.from_pretrained(checkpoint_name)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def get_embeddings(texts, batch_size=16):
    """Embed texts with the module-level RoBERTa ``tokenizer`` and ``model``.

    The texts are encoded in mini-batches. Tokenizing and running the whole
    corpus through the model in a single forward pass needs memory
    proportional to the corpus size and is impractical for more than a
    handful of documents.

    Args:
        texts: A string or a sequence of strings to embed.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A NumPy array with one row per input text. Each row is the last
        hidden state at position 0, i.e. RoBERTa's ``<s>`` token (the
        counterpart of BERT's ``[CLS]``).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string is a single document, not a sequence of characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Nothing to encode: return an empty matrix with the right width.
    if not texts:
        return torch.empty((0, model.config.hidden_size)).numpy()

    batch_embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Pad only to the longest text in this batch; over-long texts are
            # truncated by the tokenizer.
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            inputs = {key: val.to(device) for key, val in inputs.items()}
            outputs = model(**inputs)
            # Keep only the first-token vector and move it off the device
            # right away so accelerator memory is freed per batch.
            batch_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu())

    return torch.cat(batch_embeddings, dim=0).numpy()

In [8]:
# Encode every processed speech with RoBERTa and report the matrix size.
embeddings = get_embeddings(texts=X_text)
print(embeddings.shape)  # expected: (num_samples, embedding_dim)

KeyboardInterrupt: 