

---


# Right Classifier


---



In [1]:
# Import Libraries
import pandas as pd, numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sentence_transformers import SentenceTransformer
from collections import defaultdict
import joblib

# Import data
from google.colab import files
uploaded = files.upload()

RANDOM_STATE = 42

# Load and prepare data
df = pd.read_excel("MANUALLY_ANNOTATED_DATA.xlsx")[['text','right']].dropna()
df['text'] = df['text'].astype(str)

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['right'])

X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df['text'], df['label'],
    stratify=df['label'], test_size=0.2, random_state=RANDOM_STATE
)

# SBERT embeddings
print("Encoding SBERT embeddings...")
sbert = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
X_train = sbert.encode(X_train_texts.tolist(), show_progress_bar=True)
X_test  = sbert.encode(X_test_texts.tolist(),  show_progress_bar=True)

# Train linear SVM
print("Training linear SVM (C=1, no class weights, no probabilities)...")
svm = SVC(kernel='linear', C=1, class_weight=None, probability=False, random_state=RANDOM_STATE)
svm.fit(X_train, y_train)

# Save model and encoder for pipeline reuse
joblib.dump(svm, "right_sbert_model.joblib")
joblib.dump(label_encoder, "right_label_encoder.joblib")
joblib.dump({"features": "SBERT", "C": 1, "class_weight": None, "probabilities": False},
            "right_model_config.joblib")
print("Saved: right_sbert_model.joblib, right_label_encoder.joblib, right_model_config.joblib")

# Predict & evaluate (single label)
y_pred = svm.predict(X_test)

top1_acc = accuracy_score(y_test, y_pred)
macro_prf = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)
weighted_prf = precision_recall_fscore_support(y_test, y_pred, average="weighted", zero_division=0)

print(f"\nTop-1 Accuracy: {top1_acc:.3f}")
print(f" Macro P/R/F1: {macro_prf[0]:.3f} / {macro_prf[1]:.3f} / {macro_prf[2]:.3f}")
print(f" Weighted P/R/F1: {weighted_prf[0]:.3f} / {weighted_prf[1]:.3f} / {weighted_prf[2]:.3f}")

print("\n Classification report (Top-1):\n",
      classification_report(y_test, y_pred, target_names=label_encoder.classes_, digits=3, zero_division=0))

# Save detailed predictions and per-class accuracy
results, per_class = [], defaultdict(lambda: [0, 0])
for i in range(len(X_test_texts)):
    true = y_test.iloc[i]
    pred = y_pred[i]
    results.append({
        "text": X_test_texts.iloc[i],
        "true_label": label_encoder.inverse_transform([true])[0],
        "prediction": label_encoder.inverse_transform([pred])[0]
    })
    lab = label_encoder.inverse_transform([true])[0]
    per_class[lab][1] += 1
    if true == pred:
        per_class[lab][0] += 1

pd.DataFrame(results).to_csv("right_svm_sbert_predictions.csv", index=False)
print("Predictions saved to right_svm_sbert_predictions.csv")

print("\n Per-Category Top-1 Accuracy:")
print("Right\tTop-1 Acc\tSupport")
for lab in sorted(per_class.keys()):
    correct, total = per_class[lab]
    acc = correct / total if total else 0.0
    print(f"{lab}\t{acc:.2f}\t\t{total}")


Saving MANUALLY_ANNOTATED_DATA.xlsx to MANUALLY_ANNOTATED_DATA.xlsx
Encoding SBERT embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Training linear SVM (C=1, no class weights, no probabilities)...
Saved: right_sbert_model.joblib, right_label_encoder.joblib, right_model_config.joblib

Top-1 Accuracy: 0.711
 Macro P/R/F1: 0.536 / 0.469 / 0.477
 Weighted P/R/F1: 0.699 / 0.711 / 0.698

 Classification report (Top-1):
               precision    recall  f1-score   support

       EQUAL      0.250     0.143     0.182         7
     GENERAL      0.756     0.775     0.765        80
    SECURITY      0.333     0.500     0.400         6
      STATUS      0.500     0.143     0.222         7
      STRIKE      0.750     0.836     0.791        61
       WAGES      0.625     0.417     0.500        12

    accuracy                          0.711       173
   macro avg      0.536     0.469     0.477       173
weighted avg      0.699     0.711     0.698       173

Predictions saved to right_svm_sbert_predictions.csv

 Per-Category Top-1 Accuracy:
Right	Top-1 Acc	Support
EQUAL	0.14		7
GENERAL	0.78		80
SECURITY	0.50		6
STATUS	0.14		7




---
# Same model but with POS tags
---


In [3]:
# Import Libraries
import pandas as pd, numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sentence_transformers import SentenceTransformer
from sklearn.base import BaseEstimator, TransformerMixin
import spacy, joblib

RANDOM_STATE = 42

# Load and prepare data
df = pd.read_excel("MANUALLY_ANNOTATED_DATA.xlsx")[['text','right']].dropna()
df['text'] = df['text'].astype(str)

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['right'])

X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df['text'], df['label'], stratify=df['label'], test_size=0.2, random_state=RANDOM_STATE
)

# SBERT embeddings
print("Encoding SBERT embeddings...")
sbert = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
X_train_sbert = sbert.encode(X_train_texts.tolist(), show_progress_bar=True)
X_test_sbert  = sbert.encode(X_test_texts.tolist(),  show_progress_bar=True)

# POS tag frequency features
print("Extracting POS features...")
nlp = spacy.load("en_core_web_sm")

class SpacyFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.pos_tags = ['NOUN','VERB','ADJ','ADV','AUX','PRON','ADP','CCONJ','DET','NUM','PROPN']
    def fit(self, X, y=None): return self
    def transform(self, texts):
        feats = []
        for doc in nlp.pipe(texts, disable=["ner","parser"]):
            counts = {t:0 for t in self.pos_tags}
            for tok in doc:
                if tok.pos_ in counts: counts[tok.pos_] += 1
            total = sum(counts.values()) + 1e-6
            feats.append([counts[t]/total for t in self.pos_tags])
        return np.array(feats)

X_train_pos = SpacyFeatures().fit_transform(X_train_texts)
X_test_pos  = SpacyFeatures().fit_transform(X_test_texts)

# Concatenate SBERT and POS
X_train = np.hstack([X_train_sbert, X_train_pos])
X_test  = np.hstack([X_test_sbert,  X_test_pos])

# Train SVM
print("Training linear SVM with POS...")
svm = SVC(kernel='linear', C=1, class_weight=None, probability=False, random_state=RANDOM_STATE)
svm.fit(X_train, y_train)

# Save model and encoder
joblib.dump(svm, "right_svm.joblib")
joblib.dump(label_encoder, "right_label_encoder.joblib")
joblib.dump({"features":"SBERT+POS", "C":1, "class_weight":None, "probability":False}, "right_model_config.joblib")
print("Saved: right_svm.joblib, right_label_encoder.joblib, right_model_config.joblib")

# Predict and evaluate
scores = svm.decision_function(X_test)
top1 = np.argmax(scores, axis=1)

acc = accuracy_score(y_test, top1)
macro_prf = precision_recall_fscore_support(y_test, top1, average="macro", zero_division=0)
weighted_prf = precision_recall_fscore_support(y_test, top1, average="weighted", zero_division=0)

print(f"\n Top-1 Accuracy: {acc:.3f}")
print(f"Macro P/R/F1: {macro_prf[0]:.3f} / {macro_prf[1]:.3f} / {macro_prf[2]:.3f}")
print(f"Weighted P/R/F1: {weighted_prf[0]:.3f} / {weighted_prf[1]:.3f} / {weighted_prf[2]:.3f}")

print("\n Classification report (Top-1):\n",
      classification_report(y_test, top1, target_names=label_encoder.classes_, digits=3, zero_division=0))

# Save predictions
out = pd.DataFrame({
    "text": X_test_texts.reset_index(drop=True),
    "true_label": label_encoder.inverse_transform(y_test),
    "pred_label": label_encoder.inverse_transform(top1)
})
out.to_csv("right_svm_sbert_pos_predictions.csv", index=False)
print("Predictions saved to right_svm_sbert_pos_predictions.csv")


Encoding SBERT embeddings...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Extracting POS features...
Training linear SVM with POS...
Saved: right_svm.joblib, right_label_encoder.joblib, right_model_config.joblib

 Top-1 Accuracy: 0.711
Macro P/R/F1: 0.548 / 0.470 / 0.479
Weighted P/R/F1: 0.699 / 0.711 / 0.696

 Classification report (Top-1):
               precision    recall  f1-score   support

       EQUAL      0.333     0.143     0.200         7
     GENERAL      0.753     0.762     0.758        80
    SECURITY      0.333     0.500     0.400         6
      STATUS      0.500     0.143     0.222         7
      STRIKE      0.743     0.852     0.794        61
       WAGES      0.625     0.417     0.500        12

    accuracy                          0.711       173
   macro avg      0.548     0.470     0.479       173
weighted avg      0.699     0.711     0.696       173

Predictions saved to right_svm_sbert_pos_predictions.csv


In [4]:
print("\n Per-Category Top-1 Accuracy:")
print("Right\tTop-1 Acc\tSupport")
for lab in sorted(per_class.keys()):
    correct, total = per_class[lab]
    acc = correct / total if total else 0.0
    print(f"{lab}\t{acc:.2f}\t\t{total}")



 Per-Category Top-1 Accuracy:
Right	Top-1 Acc	Support
EQUAL	0.14		7
GENERAL	0.78		80
SECURITY	0.50		6
STATUS	0.14		7
STRIKE	0.84		61
WAGES	0.42		12
