# Bag of Words model


In [1]:
# Environment snapshot: runs `nvidia-smi` through the shell to print the GPU/driver status of the runtime.
# NOTE(review): the visible cells only use pandas and scikit-learn, so this looks purely informational — confirm it is still needed.
!nvidia-smi

Tue Mar  4 16:34:08 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:06.0 Off |                    0 |
| N/A   40C    P8              10W /  70W |      2MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

Load Dataset

In [3]:
def convert_to_binary(label) -> int:
    """
    Collapse an original PCL annotation score into a binary class.

    Original labels 0 and 1 are treated as negative examples of PCL and map to 0.
    Every other value (2, 3 or 4 in the original scheme) is treated as a positive
    example of PCL and maps to 1.
    """
    if label in (0, 1):
        return 0
    return 1

In [4]:
# Locations of the raw paragraph file and of the official train/dev allocations.
TRAIN_DATA_PATH = "data/dontpatronizeme_pcl.tsv"
TRAIN_ALLOC_PATH = "data/train_semeval_parids-labels.csv"
DEV_ALLOC_PATH = "data/dev_semeval_parids-labels.csv"

# The TSV is read without a header row, so the column names are supplied explicitly.
column_names = ["par_id", "art_id", "keyword", "country", "text", "orig_label"]
text_data = pd.read_csv(TRAIN_DATA_PATH, sep="\t", names=column_names, header=None)
# Derive the binary target from the original annotation score.
text_data = text_data.assign(label=text_data["orig_label"].apply(convert_to_binary))

print("Text Data shape:", text_data.shape)

# The allocation files' own "label" column is discarded; the binary label derived above is used instead.
train_alloc = pd.read_csv(TRAIN_ALLOC_PATH).drop(columns="label")
official_dev_alloc = pd.read_csv(DEV_ALLOC_PATH).drop(columns="label")
print("Train Allocation shape:", train_alloc.shape)
print("Official Dev Allocation shape:", official_dev_alloc.shape)

# Attach text and labels to each allocation via par_id.
# train_data: our training data (and internal dev data, if split further).
train_data = train_alloc.merge(text_data, on="par_id")
# official_dev_data: used as our held-out test data.
official_dev_data = official_dev_alloc.merge(text_data, on="par_id")
print("Merged Train Data shape:", train_data.shape)
print("Merged Official Dev Data shape:", official_dev_data.shape)


Text Data shape: (10469, 7)
Train Allocation shape: (8375, 1)
Official Dev Allocation shape: (2094, 1)
Merged Train Data shape: (8375, 7)
Merged Official Dev Data shape: (2094, 7)


Handle the NaN text present in the dev set (the row is kept and its text is replaced with a placeholder string)

In [5]:
# Locate the dev rows whose 'text' is missing (the mask is computed once and reused).
missing_text_mask = official_dev_data["text"].isna()
nan_rows = official_dev_data.loc[missing_text_mask]

# Show the affected rows before they are patched.
print("Rows with NaN values in 'text' column:")
print(nan_rows)

# Keep the rows and substitute a placeholder string so the dev set size is unchanged.
# (Alternative, not used: drop them with official_dev_data.dropna(subset=["text"]).)
official_dev_data["text"] = official_dev_data["text"].fillna(value="missing text")

# Shape after patching.
print("Cleaned Official Dev Data shape:", official_dev_data.shape)

Rows with NaN values in 'text' column:
     par_id      art_id  keyword country text  orig_label  label
434    8640  @@16852855  migrant      ke  NaN           0      0
Cleaned Official Dev Data shape: (2094, 7)


Extract text and labels

In [6]:
# Raw paragraph text (model input) and binary labels (target) of the training split.
X, y_train = train_data["text"], train_data["label"]

Convert to bag of words features

In [7]:
# Fit the vectorizer on the training text and encode that same text in one step.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(raw_documents=X)

Internal data split (if necessary)

In [8]:
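# Optional internal train-test split of the official training set.
# Disabled: the model below is fit on the full official training set and evaluated on the official dev set.
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=16, stratify=y_train)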


Train Naive Bayes classifier

In [9]:
# Multinomial Naive Bayes with default settings, fit on the training features and labels.
model = MultinomialNB()
model.fit(X=X_train, y=y_train)


Evaluate on the Official DevSet

In [10]:
# Encode the dev text with the vectorizer already fitted on the training text
# (transform only, never re-fit) and pull out the reference labels.
dev_texts = official_dev_data["text"]
X_official_dev = vectorizer.transform(dev_texts)
y_official_dev = official_dev_data["label"]

# Predict, then score the predictions against the reference labels.
y_pred_official = model.predict(X_official_dev)
accuracy_official = accuracy_score(y_true=y_official_dev, y_pred=y_pred_official)
f1_official = f1_score(y_true=y_official_dev, y_pred=y_pred_official)

print(f"Official Dev Accuracy: {accuracy_official:.4f}")
print(f"Official Dev F1 Score: {f1_official:.4f}")

Official Dev Accuracy: 0.9054
Official Dev F1 Score: 0.0660


Finding misclassified examples for error analysis

In [11]:
import numpy as np

# Destination of the error-analysis dump.
MISCLASSIFIED_PATH = "misclassified_examples.txt"

# Work on plain NumPy arrays so that every lookup below is positional.
# np.where returns positions, whereas indexing a pandas Series with an integer
# is label-based; the two only agree while the frame has a default 0..n-1 index
# (e.g. they would diverge if rows were dropped from official_dev_data).
y_true_official = y_official_dev.to_numpy()
dev_texts_array = official_dev_data["text"].to_numpy()

# Positions where the prediction disagrees with the reference label.
misclassified_indices = np.where(y_true_official != y_pred_official)[0]

# Write one record per misclassified example.
with open(MISCLASSIFIED_PATH, "w", encoding="utf-8") as f:
    for idx in misclassified_indices:
        text = dev_texts_array[idx]
        true_label = y_true_official[idx]
        predicted_label = y_pred_official[idx]

        f.write(f"Text: {text}\n")
        f.write(f"True Label: {true_label}\n")
        f.write(f"Predicted Label: {predicted_label}\n")
        f.write("-" * 50 + "\n")  # Separator for readability

print(f"Misclassified examples written to {MISCLASSIFIED_PATH}")


Misclassified examples written to misclassified_examples.txt
