# **Basic SVM**

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

# Load data
from google.colab import files
uploaded = files.upload()
df = pd.read_excel("MANUALLY_ANNOTATED_DATA.xlsx")

# Prepare data
texts = df['text'].astype(str).tolist()
labels = df['concept'].astype(str).tolist()

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Load SBERT and compute embeddings
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
X = model.encode(texts)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train SVM
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

# Predict
y_pred = svm.predict(X_test)
predicted_labels = label_encoder.inverse_transform(y_pred)
true_labels = label_encoder.inverse_transform(y_test)

# Evaluate
print(f"Accuracy: {accuracy_score(true_labels, predicted_labels):.2f}")
print(classification_report(true_labels, predicted_labels, zero_division=0))


Saving MANUALLY_ANNOTATED_DATA.xlsx to MANUALLY_ANNOTATED_DATA (2).xlsx
Accuracy: 0.32
              precision    recall  f1-score   support

      ACCESS       0.60      0.75      0.67         4
        ADEQ       0.17      0.18      0.17        11
         ADR       1.00      1.00      1.00         1
    AUTONOMY       0.42      0.56      0.48        18
      BALANC       0.32      0.54      0.40        13
     COLLECT       0.22      0.33      0.27         6
      COMPAR       0.50      0.25      0.33         4
       COMPL       0.25      0.33      0.29         3
      COURTS       0.00      0.00      0.00         1
      CREATE       0.00      0.00      0.00         5
       DETER       0.00      0.00      0.00         1
    ECONOMIC       0.68      0.62      0.65        21
     ENFORCE       1.00      0.33      0.50         3
        JUST       0.08      0.07      0.07        15
       LEGAL       0.43      0.60      0.50         5
    PHYSICAL       0.00      0.00      0.00     

# **SVM on grouped categories**

In [None]:
# Grouping Categories together

# Define the mapping dictionary
concept_mapping = {
    # POLITICS_INSTITUTIONS
    'COLLECT': 'POLITICS_INSTITUTIONS',
    'POLIT': 'POLITICS_INSTITUTIONS',

    # JUSTICE_RIGHTS
    'JUST': 'JUSTICE_RIGHTS',
    'RIGHTS': 'JUSTICE_RIGHTS',
    'LEGAL': 'JUSTICE_RIGHTS',
    'AUTONOMY': 'JUSTICE_RIGHTS',
    'RELATE': 'JUSTICE_RIGHTS',
    'POWER': 'JUSTICE_RIGHTS',

    # REMEDY_DESIGN
    'ADEQ': 'REMEDY_DESIGN',
    'CREATE': 'REMEDY_DESIGN',
    'REPLIC': 'REMEDY_DESIGN',
    'BALANC': 'REMEDY_DESIGN',
    'COMPAR': 'REMEDY_DESIGN',

    # ENFORCEMENT
    'COMPL': 'ENFORCEMENT',
    'DETER': 'ENFORCEMENT',
    'ENFORCE': 'ENFORCEMENT',
    'COURTS': 'ENFORCEMENT',
    'ACCESS': 'ENFORCEMENT',
    'ADR': 'ENFORCEMENT',

    # MATERIAL_HARM
    'ECONOMIC': 'MATERIAL_HARM',
    'PHYSICAL': 'MATERIAL_HARM',
}

# Apply mapping
df['meta_concept'] = df['concept'].map(concept_mapping)

# Check for any unmapped concepts (optional)
unmapped = df[df['meta_concept'].isnull()]['concept'].unique()
if len(unmapped) > 0:
    print("Unmapped concepts found:", unmapped)

df = df[['text', 'meta_concept']]

In [None]:
# Prepare data
texts = df['text'].astype(str).tolist()
labels = df['meta_concept'].astype(str).tolist()

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Load SBERT and compute embeddings
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
X = model.encode(texts)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train SVM
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)

# Predict
y_pred = svm.predict(X_test)
predicted_labels = label_encoder.inverse_transform(y_pred)
true_labels = label_encoder.inverse_transform(y_test)

# Evaluate
print(f"Accuracy: {accuracy_score(true_labels, predicted_labels):.2f}")
print(classification_report(true_labels, predicted_labels, zero_division=0))

Accuracy: 0.51
                       precision    recall  f1-score   support

          ENFORCEMENT       0.35      0.46      0.40        13
       JUSTICE_RIGHTS       0.53      0.61      0.57        67
        MATERIAL_HARM       0.65      0.54      0.59        24
POLITICS_INSTITUTIONS       0.50      0.42      0.45        24
        REMEDY_DESIGN       0.47      0.40      0.43        45

             accuracy                           0.51       173
            macro avg       0.50      0.49      0.49       173
         weighted avg       0.51      0.51      0.51       173



# **Multi Layer Perceptron (MLP)**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer

In [None]:
# 1. Load and prepare data
df = pd.read_excel("MANUALLY_ANNOTATED_DATA.xlsx")
df = df[['text', 'concept']]

# 2. Load SBERT model
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# 3. Encode text
X = model.encode(df['text'].tolist(), show_progress_bar=True)

# 4. Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['concept'])

# 5. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train classifier
clf = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=500, random_state=42)
clf.fit(X_train, y_train)

# 7. Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

              precision    recall  f1-score   support

      ACCESS       0.25      0.50      0.33         2
        ADEQ       0.23      0.21      0.22        14
         ADR       0.00      0.00      0.00         2
    AUTONOMY       0.47      0.47      0.47        17
      BALANC       0.58      0.64      0.61        11
     COLLECT       1.00      0.18      0.31        11
      COMPAR       0.33      0.40      0.36         5
       COMPL       0.00      0.00      0.00         6
      COURTS       0.00      0.00      0.00         0
      CREATE       0.12      0.20      0.15         5
       DETER       0.00      0.00      0.00         1
    ECONOMIC       0.67      0.90      0.77        20
     ENFORCE       0.50      0.33      0.40         3
        JUST       0.18      0.13      0.15        15
       LEGAL       0.62      0.62      0.62         8
    PHYSICAL       0.00      0.00      0.00         3
       POLIT       0.41      0.56      0.47        16
       POWER       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Grouping Categories together

# Define the mapping dictionary
concept_mapping = {
    # POLITICS_INSTITUTIONS
    'COLLECT': 'POLITICS_INSTITUTIONS',
    'POLIT': 'POLITICS_INSTITUTIONS',

    # JUSTICE_RIGHTS
    'JUST': 'JUSTICE_RIGHTS',
    'RIGHTS': 'JUSTICE_RIGHTS',
    'LEGAL': 'JUSTICE_RIGHTS',
    'AUTONOMY': 'JUSTICE_RIGHTS',
    'RELATE': 'JUSTICE_RIGHTS',
    'POWER': 'JUSTICE_RIGHTS',

    # REMEDY_DESIGN
    'ADEQ': 'REMEDY_DESIGN',
    'CREATE': 'REMEDY_DESIGN',
    'REPLIC': 'REMEDY_DESIGN',
    'BALANC': 'REMEDY_DESIGN',
    'COMPAR': 'REMEDY_DESIGN',

    # ENFORCEMENT
    'COMPL': 'ENFORCEMENT',
    'DETER': 'ENFORCEMENT',
    'ENFORCE': 'ENFORCEMENT',
    'COURTS': 'ENFORCEMENT',
    'ACCESS': 'ENFORCEMENT',
    'ADR': 'ENFORCEMENT',

    # MATERIAL_HARM
    'ECONOMIC': 'MATERIAL_HARM',
    'PHYSICAL': 'MATERIAL_HARM',
}

# Apply mapping
df['meta_concept'] = df['concept'].map(concept_mapping)

# Check for any unmapped concepts (optional)
unmapped = df[df['meta_concept'].isnull()]['concept'].unique()
if len(unmapped) > 0:
    print("Unmapped concepts found:", unmapped)

df = df[['text', 'meta_concept']]

In [None]:
df = df[['text', 'meta_concept']]

# 2. Load SBERT model
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

# 3. Encode text
X = model.encode(df['text'].tolist(), show_progress_bar=True)

# 4. Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['meta_concept'])

# 5. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Train classifier
clf = MLPClassifier(hidden_layer_sizes=(256, 128), max_iter=500, random_state=42)
clf.fit(X_train, y_train)

# 7. Evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# only on categories observed
labels = np.unique(y)  # all possible class indices
print(classification_report(y_test, y_pred, labels=labels, target_names=label_encoder.classes_))

print(confusion_matrix(y_test, y_pred))

Batches:   0%|          | 0/27 [00:00<?, ?it/s]

                       precision    recall  f1-score   support

          ENFORCEMENT       0.27      0.21      0.24        14
       JUSTICE_RIGHTS       0.54      0.68      0.60        62
        MATERIAL_HARM       0.68      0.65      0.67        23
POLITICS_INSTITUTIONS       0.56      0.37      0.44        27
        REMEDY_DESIGN       0.55      0.51      0.53        47

             accuracy                           0.54       173
            macro avg       0.52      0.48      0.50       173
         weighted avg       0.54      0.54      0.54       173

                       precision    recall  f1-score   support

          ENFORCEMENT       0.27      0.21      0.24        14
       JUSTICE_RIGHTS       0.54      0.68      0.60        62
        MATERIAL_HARM       0.68      0.65      0.67        23
POLITICS_INSTITUTIONS       0.56      0.37      0.44        27
        REMEDY_DESIGN       0.55      0.51      0.53        47

             accuracy                           0.

In [None]:
from google.colab import files
uploaded = files.upload()

Saving MANUALLY_ANNOTATED_DATA.xlsx to MANUALLY_ANNOTATED_DATA (1).xlsx


# **Ensemble method: SVM and seed sentence cosine**

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer, util

# Load your data
df = pd.read_excel("MANUALLY_ANNOTATED_DATA.xlsx")
df = df[['text', 'concept']].dropna()
df['text'] = df['text'].astype(str)

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['concept'])

# Train/test split
from sklearn.model_selection import train_test_split
texts = df['text'].tolist()
labels = df['label'].tolist()
X_train_texts, X_test_texts, y_train, y_test = train_test_split(texts, labels, stratify=labels, test_size=0.2, random_state=42)

# Load SBERT model and embed texts
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
X_train = model.encode(X_train_texts)
X_test = model.encode(X_test_texts)

# Train SVM with probability enabled
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train, y_train)

# Cosine similarity seeds (average 4 statements per class from training set)
from collections import defaultdict
seed_embeddings = {}
max_seeds = 4
df_train = pd.DataFrame({'text': X_train_texts, 'label': y_train})

for label in np.unique(y_train):
    examples = df_train[df_train['label'] == label]['text'].tolist()
    sample = examples[:max_seeds]
    emb = model.encode(sample)
    seed_embeddings[label] = np.mean(emb, axis=0)

# Ensemble prediction
predictions = []
confidence_threshold = 0.7

for i, test_vec in enumerate(X_test):
    probs = svm.predict_proba([test_vec])[0]
    top_class = np.argmax(probs)
    top_conf = probs[top_class]

    if top_conf >= confidence_threshold:
        predictions.append(top_class)
    else:
        # Fall back to cosine similarity
        sims = {label: util.cos_sim(test_vec, centroid)[0][0].item()
                for label, centroid in seed_embeddings.items()}
        best_cosine_label = max(sims, key=sims.get)
        predictions.append(best_cosine_label)

# Evaluate
accuracy = accuracy_score(y_test, predictions)
print(f"Ensemble accuracy: {accuracy:.3f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Ensemble accuracy: 0.173


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate
accuracy = accuracy_score(y_test, predictions)
print(f"Ensemble accuracy: {accuracy:.3f}")

# Classification report
report = classification_report(y_test, predictions, target_names=label_encoder.classes_)
print("\nClassification Report:\n", report)

# Confusion matrix (optional, for inspection)
cm = confusion_matrix(y_test, predictions)
print("\nConfusion Matrix:\n", cm)


Ensemble accuracy: 0.173

Classification Report:
               precision    recall  f1-score   support

      ACCESS       0.00      0.00      0.00         4
        ADEQ       0.00      0.00      0.00        11
         ADR       0.50      1.00      0.67         1
    AUTONOMY       0.43      0.17      0.24        18
      BALANC       0.30      0.23      0.26        13
     COLLECT       0.17      0.33      0.22         6
      COMPAR       0.25      0.25      0.25         4
       COMPL       0.00      0.00      0.00         3
      COURTS       0.14      1.00      0.25         1
      CREATE       0.12      0.40      0.19         5
       DETER       0.00      0.00      0.00         1
    ECONOMIC       0.12      0.05      0.07        21
     ENFORCE       0.29      0.67      0.40         3
        JUST       0.20      0.13      0.16        15
       LEGAL       0.27      0.60      0.38         5
    PHYSICAL       0.00      0.00      0.00         3
       POLIT       0.26      0.

# **Adding a class weighting, and falls back to cosine similarity only when SVM’s top prediction confidence is below a threshold**

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sentence_transformers import SentenceTransformer, util
import spacy
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter

# Load and prepare data
df = pd.read_excel("MANUALLY_ANNOTATED_DATA.xlsx")[['text', 'concept']].dropna()
df['text'] = df['text'].astype(str)

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['concept'])

# Train/test split
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df['text'], df['label'], stratify=df['label'], test_size=0.2, random_state=42
)

# Load SBERT model
sbert = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
X_train_sbert = sbert.encode(X_train_texts.tolist(), show_progress_bar=True)
X_test_sbert = sbert.encode(X_test_texts.tolist(), show_progress_bar=True)

# POS tag feature engineering
nlp = spacy.load("en_core_web_sm")

class SpacyFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.pos_tags = ['NOUN', 'VERB', 'ADJ', 'ADV', 'AUX', 'PRON', 'ADP', 'CCONJ', 'DET', 'NUM', 'PROPN']
    def fit(self, X, y=None): return self
    def transform(self, texts):
        features = []
        for doc in nlp.pipe(texts, disable=["ner", "parser"]):
            counts = {tag: 0 for tag in self.pos_tags}
            for token in doc:
                if token.pos_ in counts:
                    counts[token.pos_] += 1
            total = sum(counts.values()) + 1e-6
            norm = [counts[tag] / total for tag in self.pos_tags]
            features.append(norm)
        return np.array(features)

# Combine SBERT + POS features
X_train_feats = np.hstack([X_train_sbert, SpacyFeatures().fit_transform(X_train_texts)])
X_test_feats = np.hstack([X_test_sbert, SpacyFeatures().fit_transform(X_test_texts)])

# Train SVM with class weights
svm = SVC(kernel='linear', probability=True, class_weight='balanced')
svm.fit(X_train_feats, y_train)

# Create cosine similarity centroids from training data
seed_embeddings = {}
train_df = pd.DataFrame({'text': X_train_texts, 'label': y_train})
for label in np.unique(y_train):
    samples = train_df[train_df['label'] == label]['text'].tolist()[:4]
    seed_vecs = sbert.encode(samples)
    seed_embeddings[label] = np.mean(seed_vecs, axis=0)


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:

# Prediction loop with confidence-based fallback to cosine
confidence_threshold = 0.4
results = []
cosine_used = 0

for i, test_vec in enumerate(X_test_feats):
    sbert_vec = X_test_sbert[i]
    svm_probs = svm.predict_proba([test_vec])[0]
    top_idx = np.argmax(svm_probs)
    top_prob = svm_probs[top_idx]
    true_label = y_test.iloc[i]

    # Use cosine similarity fallback if SVM uncertain
    if top_prob < confidence_threshold:
        cosine_used += 1
        cosine_scores = {
            label: util.cos_sim(sbert_vec, centroid)[0][0].item()
            for label, centroid in seed_embeddings.items()
        }
        top_idx = max(cosine_scores, key=cosine_scores.get)

    results.append({
        "text": X_test_texts.iloc[i],
        "true_label": label_encoder.inverse_transform([true_label])[0],
        "predicted_label": label_encoder.inverse_transform([top_idx])[0],
        "used_cosine": top_prob < confidence_threshold
    })

# Accuracy calculation
df_results = pd.DataFrame(results)
accuracy = (df_results['true_label'] == df_results['predicted_label']).mean()

print(f"\n✅ Final Accuracy: {accuracy:.3f}")
print(f"🧠 Cosine fallback used on {cosine_used} of {len(X_test_feats)} samples "
      f"({cosine_used / len(X_test_feats):.1%})")

# Save predictions
df_results.to_csv("svm_confidence_cosine_fallback_predictions.csv", index=False)
print("\n📁 Predictions saved to 'svm_confidence_cosine_fallback_predictions.csv'")



✅ Final Accuracy: 0.231
🧠 Cosine fallback used on 149 of 173 samples (86.1%)

📁 Predictions saved to 'svm_confidence_cosine_fallback_predictions.csv'


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Accuracy calculation
df_results = pd.DataFrame(results)
accuracy = (df_results['true_label'] == df_results['predicted_label']).mean()

print(f"\n✅ Final Accuracy: {accuracy:.3f}")
print(f"🧠 Cosine fallback used on {cosine_used} of {len(X_test_feats)} samples "
      f"({cosine_used / len(X_test_feats):.1%})")

# Encode back into numeric form for sklearn metrics
y_true = label_encoder.transform(df_results['true_label'])
y_pred = label_encoder.transform(df_results['predicted_label'])

# Classification report
report = classification_report(y_true, y_pred, target_names=label_encoder.classes_)
print("\nClassification Report:\n", report)

# Optional: confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:\n", cm)

# Optional: save as Excel for appendix
report_dict = classification_report(y_true, y_pred, target_names=label_encoder.classes_, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
report_df.to_excel("ensemble_POS_classification_report.xlsx")



✅ Final Accuracy: 0.231
🧠 Cosine fallback used on 149 of 173 samples (86.1%)

Classification Report:
               precision    recall  f1-score   support

      ACCESS       0.00      0.00      0.00         4
        ADEQ       0.00      0.00      0.00        11
         ADR       0.50      1.00      0.67         1
    AUTONOMY       0.50      0.33      0.40        18
      BALANC       0.42      0.38      0.40        13
     COLLECT       0.17      0.33      0.22         6
      COMPAR       0.25      0.25      0.25         4
       COMPL       0.00      0.00      0.00         3
      COURTS       0.14      1.00      0.25         1
      CREATE       0.14      0.40      0.21         5
       DETER       0.00      0.00      0.00         1
    ECONOMIC       0.43      0.29      0.34        21
     ENFORCE       0.29      0.67      0.40         3
        JUST       0.12      0.07      0.09        15
       LEGAL       0.38      0.60      0.46         5
    PHYSICAL       0.00      0.0