In [1]:
import numpy as np
import pandas as pd

In [2]:
des_path = "UN_kfold/"

In [3]:
articles = pd.read_csv("UN_preprocessd_data.csv")
articles

Unnamed: 0,session,year,country,text,label,text_processed,country_name,coalition_of_willing
0,57,2002,PAN,﻿Allow me\nto begin my statement by expressing...,135,﻿allow me\nto begin my statement by expressing...,Panama,1
1,57,2002,IND,﻿I\ncongratulate Mr. Kavan on his election as ...,77,﻿i\ncongratulate mr kavan on his election as p...,India,0
2,57,2002,MDV,"﻿Mr. President, it gives\nme great pleasure to...",110,﻿mr president it gives\nme great pleasure to j...,Maldives,0
3,57,2002,DJI,﻿In a year\nof frightful tragedy and uncertain...,45,﻿in a year\nof frightful tragedy and uncertain...,Djibouti,0
4,57,2002,NLD,"﻿The smoke\nat ground zero, only a few blocks ...",128,﻿the smoke\nat ground zero only a few blocks a...,Netherlands,1
...,...,...,...,...,...,...,...,...
1133,62,2007,AGO,"At the outset, \nI would like to salute the in...",1,at the outset \ni would like to salute the int...,Angola,1
1134,62,2007,AUT,Let me address at the \noutset a burning issue...,9,let me address at the \noutset a burning issue...,Austria,0
1135,62,2007,MNE,"Allow me, Mr. President, to join others in w...",117,allow me mr president to join others in welc...,Montenegro,0
1136,62,2007,FJI,"On behalf of the people of Fiji, I extend to ...",57,on behalf of the people of fiji i extend to y...,Fiji,0


In [4]:
texts = articles["text_processed"].values

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_df=0.5, min_df=2, stop_words="english")
# docs = torch.from_numpy(vectorizer.fit_transform(news["data"]).toarray())
docs = vectorizer.fit_transform(texts).toarray()

vocab = pd.DataFrame(columns=["word", "index"])
vocab["word"] = vectorizer.get_feature_names_out()
vocab["index"] = vocab.index
vocab["word"]

0                    015
1                     02
2                    020
3                    033
4                    039
              ...       
14873           ìfrozenî
14874                ìin
14875    ìresponsibility
14876         ìstandards
14877                ìwe
Name: word, Length: 14878, dtype: object

In [6]:
print("Dictionary size: %d" % len(vocab))
print("Corpus size: {}".format(docs.shape))

Dictionary size: 14878
Corpus size: (1138, 14878)


In [7]:
docs

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [8]:
label_docs = articles["coalition_of_willing"].values
label_docs

array([1, 0, 0, ..., 0, 0, 0])

In [9]:
# save docs to file, format as non-zero count of columns, then followed by sum of word index and count
with open(des_path + "UN_kfold.dat", "w") as f:
    for i in range(len(docs)):
        # number of non-zero columns for docs[i]
        f.write(str(len(np.nonzero(docs[i])[0])) + " ")
        for j in range(len(vocab)):
            if docs[i][j] > 0:
                f.write(str(j) + ":" + str(docs[i][j]) + " ")
        f.write("\n")

In [10]:
with open(des_path + "label_UN_kfold.dat", "w") as f:
    for i in range(len(label_docs)):
        # only write the label if it is not -1
        if label_docs[i] != -1:
            f.write(str(label_docs[i]) + "\n")

In [11]:
# ceate a function that can copy the lines from one file to another by a list of indices
def copy_lines(file, indices, output):
    with open(file, "r") as f:
        lines = f.readlines()
        with open(output, "w") as out:
            for i in indices:
                out.write(lines[i])

In [12]:
from sklearn.model_selection import StratifiedKFold

# remove the -1 labels
label_docs = label_docs[label_docs != -1]

# split the data into 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# initialize the fold label with all zeros
kfold = np.zeros(len(label_docs))

# create the folds for the data
for i, (train_index, test_index) in enumerate(skf.split(texts, label_docs)):
    copy_lines(
        des_path + "UN_kfold.dat",
        train_index,
        des_path + "UN_kfold_train_" + str(i) + ".dat",
    )
    copy_lines(
        des_path + "UN_kfold.dat",
        test_index,
        des_path + "UN_kfold_test_" + str(i) + ".dat",
    )
    copy_lines(
        des_path + "label_UN_kfold.dat",
        train_index,
        des_path + "label_UN_kfold_train_" + str(i) + ".dat",
    )
    copy_lines(
        des_path + "label_UN_kfold.dat",
        test_index,
        des_path + "label_UN_kfold_test_" + str(i) + ".dat",
    )
    print(i, len(train_index), len(test_index))
    kfold[test_index] = i

# save the fold label
np.savetxt(des_path + "UN_kfold_label.csv", kfold, fmt="%d")

0 910 228
1 910 228
2 910 228
3 911 227
4 911 227


In [13]:
num_topics = 2

In [14]:
import subprocess


def train_test(kfold):
    accur = []
    for k in range(kfold):
        train_data = des_path + f"UN_kfold_train_{k}.dat"
        train_label = des_path + f"label_UN_kfold_train_{k}.dat"
        train_tmp = des_path + f"UN_tmp_{k}"
        test_data = des_path + f"UN_kfold_test_{k}.dat"
        test_label = des_path + f"label_UN_kfold_test_{k}.dat"
        model_path = train_tmp + "/final.model"
        test_tmp = des_path + f"UN_tmp_inf_{k}"
        # run the following command in the terminal
        subprocess.run(
            f"./slda est {train_data} {train_label} ./settings.txt 1.0 {num_topics} random {train_tmp}",
            shell=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        result = subprocess.run(
            f"./slda inf {test_data} {test_label} ./settings.txt {model_path} {test_tmp}",
            shell=True,
            capture_output=True,
            text=True,
        )
        # Split the output into lines
        output_lines = result.stdout.splitlines()

        # Get the last line of the output
        last_line = output_lines[-1] if output_lines else "No output"

        # Extract the accuracy value from the last line
        if "average accuracy:" in last_line:
            accuracy = last_line.split("average accuracy:")[1].strip()
            print(f"Fold {k} accuracy: {accuracy}")
            accur.append(float(accuracy))
        else:
            print(f"Fold {k} failed")

    print(f"Average accuracy: {np.mean(accur)}")

In [15]:
train_test(5)

Fold 0 accuracy: 0.741
Fold 1 accuracy: 0.741
Fold 2 accuracy: 0.741
Fold 3 accuracy: 0.744
Fold 4 accuracy: 0.744
Average accuracy: 0.7421999999999999
