

---


# Concept classifier


---



In [1]:
# Import Libraries
import pandas as pd, numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sentence_transformers import SentenceTransformer
from collections import defaultdict
import joblib

# Import data (Colab only: opens the browser upload widget)
from google.colab import files
uploaded = files.upload()

RANDOM_STATE = 42

# Load the annotated spreadsheet, keep only the two columns used below and
# drop every row where either of them is missing.
df = pd.read_excel("MANUALLY_ANNOTATED_DATA.xlsx")
df = df[['text', 'concept']].dropna()
df['text'] = df['text'].astype(str)

# Map concept names to integer ids; the fitted encoder is reused later to
# turn predicted ids back into concept names.
label_encoder = LabelEncoder().fit(df['concept'])
df['label'] = label_encoder.transform(df['concept'])

# 80/20 split, stratified on the encoded label, seeded for reproducibility.
split_options = {
    "test_size": 0.2,
    "stratify": df['label'],
    "random_state": RANDOM_STATE,
}
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df['text'], df['label'], **split_options
)

# SBERT embeddings: encode the train split first, then the test split,
# with the same sentence-transformer model.
print("Encoding SBERT embeddings...")
sbert = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
X_train, X_test = (
    sbert.encode(split_texts.tolist(), show_progress_bar=True)
    for split_texts in (X_train_texts, X_test_texts)
)

# Train linear SVM on the raw embeddings
print("Training linear SVM (C=1, no class weights, no probabilities)...")
svm_params = dict(
    kernel='linear',
    C=1,
    class_weight=None,
    probability=False,
    random_state=RANDOM_STATE,
)
svm = SVC(**svm_params).fit(X_train, y_train)

# Save model, encoder and a small config dict for pipeline reuse.
# Keys are the output file names, values the objects written to them.
artifacts = {
    "concept_svm.joblib": svm,
    "concept_label_encoder.joblib": label_encoder,
    "concept_model_config.joblib": {"use_l2": False, "C": 1, "class_weight": None},
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)
print("Saved: " + ", ".join(artifacts))

# Predict and evaluate (Top-1 via argmax of decision_function; Top-3 via ranking)
# The score matrix has one column per entry of svm.classes_ (scikit-learn's
# default one-vs-rest output), so ranked column positions are translated back
# to label ids through svm.classes_ instead of being treated as labels.
# NOTE(review): assumes more than two classes; a binary SVC returns 1-D scores.
scores = svm.decision_function(X_test)
order = np.argsort(scores, axis=1)[:, ::-1]   # columns sorted by descending score
top3 = svm.classes_[order[:, :3]]             # label ids of the three best classes
top1 = top3[:, 0]

# Overall metrics
y_true = y_test.to_numpy()
top1_acc = accuracy_score(y_test, top1)
# A row is a top-3 hit when its true label appears anywhere in its top-3 row.
top3_hit = np.mean((top3 == y_true[:, None]).any(axis=1))
macro_prf = precision_recall_fscore_support(y_test, top1, average="macro", zero_division=0)
weighted_prf = precision_recall_fscore_support(y_test, top1, average="weighted", zero_division=0)

print(f"\n Top-1 Accuracy: {top1_acc:.3f}")
print(f" Top-3 Accuracy: {top3_hit:.3f}")
print(f" Macro P/R/F1: {macro_prf[0]:.3f} / {macro_prf[1]:.3f} / {macro_prf[2]:.3f}")
print(f" Weighted P/R/F1: {weighted_prf[0]:.3f} / {weighted_prf[1]:.3f} / {weighted_prf[2]:.3f}")

# Report only the labels that occur in the test split or the predictions and
# pass exactly their names. Handing over every encoder class without `labels`
# raises ValueError as soon as one class is missing from both.
report_labels = np.union1d(y_true, top1)
print("\n Classification report (Top-1):\n",
      classification_report(y_test, top1, labels=report_labels,
                            target_names=label_encoder.classes_[report_labels],
                            digits=3, zero_division=0))

# Save detailed predictions and per-class Top-1/Top-3
results = []
# concept name -> [number of hits, number of test rows with that concept]
top1_counts = defaultdict(lambda: [0, 0])
top3_counts = defaultdict(lambda: [0, 0])
class_list = sorted(df['concept'].unique())

# Decode label ids to concept names once per row instead of once per lookup.
true_names = label_encoder.inverse_transform(y_test.to_numpy())
top3_names = [list(label_encoder.inverse_transform(ranked)) for ranked in top3]

rows = zip(X_test_texts, y_test.to_numpy(), true_names, top3, top3_names)
for text, true_id, true_name, ranked_ids, ranked_names in rows:
    hit_top1 = true_id == ranked_ids[0]
    hit_top3 = true_id in ranked_ids
    results.append({
        "text": text,
        "true_label": true_name,
        "top1_prediction": ranked_names[0],
        "top3_predictions": ranked_names,
        "correct_in_top3": hit_top3
    })
    # Support is counted for every row; hits only when the true id matched.
    top1_counts[true_name][1] += 1
    top3_counts[true_name][1] += 1
    if hit_top1:
        top1_counts[true_name][0] += 1
    if hit_top3:
        top3_counts[true_name][0] += 1

pd.DataFrame(results).to_csv("svm_sbert_predictions.csv", index=False)
print("Predictions saved to svm_sbert_predictions.csv")

print("\n Per-Category Accuracy:")
print("Category\tTop-1 Acc\tTop-3 Acc\tSupport")
for concept in class_list:
    hits1, support = top1_counts[concept]
    hits3 = top3_counts[concept][0]
    # Concepts with no test rows are printed with accuracy 0.
    acc1 = hits1 / support if support else 0
    acc3 = hits3 / support if support else 0
    print(f"{concept}\t{acc1:.2f}\t\t{acc3:.2f}\t\t{support}")


Saving MANUALLY_ANNOTATED_DATA.xlsx to MANUALLY_ANNOTATED_DATA.xlsx
Encoding SBERT embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Training linear SVM (C=1, no class weights, no probabilities)...
Saved: concept_svm.joblib, concept_label_encoder.joblib, concept_model_config.joblib

 Top-1 Accuracy: 0.347
 Top-3 Accuracy: 0.647
 Macro P/R/F1: 0.394 / 0.302 / 0.317
 Weighted P/R/F1: 0.365 / 0.347 / 0.340

 Classification report (Top-1):
               precision    recall  f1-score   support

      ACCESS      0.667     0.500     0.571         4
        ADEQ      0.167     0.182     0.174        11
         ADR      1.000     1.000     1.000         1
    AUTONOMY      0.455     0.556     0.500        18
      BALANC      0.412     0.538     0.467        13
     COLLECT      0.286     0.333     0.308         6
      COMPAR      1.000     0.250     0.400         4
       COMPL      0.000     0.000     0.000         3
      COURTS      0.000     0.000     0.000         1
      CREATE      0.000     0.000     0.000         5
       DETER      0.000     0.000     0.000         1
    ECONOMIC      0.636     0.667     0.651



---
# Same Model but with POS Tags


---






In [2]:
# Import Libraries
import pandas as pd, numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sentence_transformers import SentenceTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from collections import defaultdict
import spacy, joblib

RANDOM_STATE = 42

# Load the annotated spreadsheet, keep only the two columns used below and
# drop every row where either of them is missing.
df = pd.read_excel("MANUALLY_ANNOTATED_DATA.xlsx")
df = df[['text', 'concept']].dropna()
df['text'] = df['text'].astype(str)

# Map concept names to integer ids; the fitted encoder is reused later to
# turn predicted ids back into concept names.
label_encoder = LabelEncoder().fit(df['concept'])
df['label'] = label_encoder.transform(df['concept'])

# 80/20 split, stratified on the encoded label, seeded for reproducibility.
split_options = {
    "test_size": 0.2,
    "stratify": df['label'],
    "random_state": RANDOM_STATE,
}
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df['text'], df['label'], **split_options
)

# SBERT embeddings: encode the train split first, then the test split,
# with the same sentence-transformer model.
print("Encoding SBERT embeddings...")
sbert = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
X_train_sbert, X_test_sbert = (
    sbert.encode(split_texts.tolist(), show_progress_bar=True)
    for split_texts in (X_train_texts, X_test_texts)
)

# POS tag frequency features: load the spaCy pipeline used for tagging.
# NOTE(review): the SBERT model is multilingual while this spaCy pipeline is
# the English one - confirm that all texts are English.
print("Extracting POS features...")
nlp = spacy.load("en_core_web_sm")

class SpacyFeatures(BaseEstimator, TransformerMixin):
    """Turn texts into normalised POS-tag frequency vectors.

    Each text is run through the module-level spaCy pipeline ``nlp``. For
    every tag in ``pos_tags`` the number of tokens carrying that tag is
    counted and divided by the total count over the tracked tags, so a
    row sums to (almost exactly) 1, or to 0 when no tracked tag occurs.

    Parameters
    ----------
    pos_tags : sequence of str
        Tags to count, in output-column order. Exposed as a constructor
        parameter and stored unmodified so that ``get_params`` and
        ``sklearn.base.clone`` see it.
    """

    def __init__(self, pos_tags=('NOUN', 'VERB', 'ADJ', 'ADV', 'AUX', 'PRON',
                                 'ADP', 'CCONJ', 'DET', 'NUM', 'PROPN')):
        self.pos_tags = pos_tags

    def fit(self, X, y=None):
        """Do nothing (no state is learned) and return ``self``."""
        return self

    def transform(self, texts):
        """Return an array of shape (n_texts, n_tags) of tag frequencies.

        ``texts`` is any iterable of strings accepted by ``nlp.pipe``.
        """
        tags = list(self.pos_tags)
        feats = []
        # NER and the parser are switched off; only ``tok.pos_`` is read.
        for doc in nlp.pipe(texts, disable=["ner", "parser"]):
            counts = dict.fromkeys(tags, 0)
            for tok in doc:
                if tok.pos_ in counts:
                    counts[tok.pos_] += 1
            # The epsilon keeps the division defined when no tracked tag occurs.
            total = sum(counts.values()) + 1e-6
            feats.append([counts[t] / total for t in tags])
        if not feats:
            # Keep the result 2-D for empty input so it can still be stacked
            # horizontally with other feature matrices.
            return np.empty((0, len(tags)))
        return np.array(feats)

# Fit the POS transformer on the training texts only and merely apply it to
# the test texts. `fit` is a no-op today, so the numbers are unchanged, but
# the test split must never pass through `fit` if the transformer ever
# becomes stateful.
pos_features = SpacyFeatures()
X_train_pos = pos_features.fit_transform(X_train_texts)
X_test_pos  = pos_features.transform(X_test_texts)

# Concatenate SBERT and POS (POS columns are appended after the embedding)
X_train = np.hstack([X_train_sbert, X_train_pos])
X_test  = np.hstack([X_test_sbert,  X_test_pos])

# Train SVM on the concatenated SBERT + POS features
print("Training linear SVM with POS...")
svm_params = dict(
    kernel='linear',
    C=1,
    class_weight=None,
    probability=False,
    random_state=RANDOM_STATE,
)
svm = SVC(**svm_params).fit(X_train, y_train)

# Save model, encoder and a small config dict.
# Keys are the output file names, values the objects written to them.
artifacts = {
    "svm_sbert_pos_model.joblib": svm,
    "svm_label_encoder.joblib": label_encoder,
    "svm_model_config.joblib": {"use_l2": False, "C": 1, "class_weight": None, "features": "SBERT+POS"},
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)
print("Saved: " + ", ".join(artifacts))

# Predict and evaluate (Top-1 via argmax of decision_function; Top-3 via ranking)
# The score matrix has one column per entry of svm.classes_ (scikit-learn's
# default one-vs-rest output), so ranked column positions are translated back
# to label ids through svm.classes_ instead of being treated as labels.
# NOTE(review): assumes more than two classes; a binary SVC returns 1-D scores.
scores = svm.decision_function(X_test)
order = np.argsort(scores, axis=1)[:, ::-1]   # columns sorted by descending score
top3 = svm.classes_[order[:, :3]]             # label ids of the three best classes
top1 = top3[:, 0]

y_true = y_test.to_numpy()
top1_acc = accuracy_score(y_test, top1)
# A row is a top-3 hit when its true label appears anywhere in its top-3 row.
top3_hit = np.mean((top3 == y_true[:, None]).any(axis=1))
macro_prf = precision_recall_fscore_support(y_test, top1, average="macro", zero_division=0)
weighted_prf = precision_recall_fscore_support(y_test, top1, average="weighted", zero_division=0)

print(f"\n Top-1 Accuracy: {top1_acc:.3f}")
print(f"Top-3 Accuracy: {top3_hit:.3f}")
print(f"Macro P/R/F1: {macro_prf[0]:.3f} / {macro_prf[1]:.3f} / {macro_prf[2]:.3f}")
print(f"Weighted P/R/F1: {weighted_prf[0]:.3f} / {weighted_prf[1]:.3f} / {weighted_prf[2]:.3f}")

# Report only the labels that occur in the test split or the predictions and
# pass exactly their names. Handing over every encoder class without `labels`
# raises ValueError as soon as one class is missing from both.
report_labels = np.union1d(y_true, top1)
print("\n Classification report (Top-1):\n",
      classification_report(y_test, top1, labels=report_labels,
                            target_names=label_encoder.classes_[report_labels],
                            digits=3, zero_division=0))

# Save predictions and collect per-class Top-1/Top-3 counts
results = []
# concept name -> [number of hits, number of test rows with that concept]
top1_counts = defaultdict(lambda: [0, 0])
top3_counts = defaultdict(lambda: [0, 0])
class_list = sorted(df['concept'].unique())

# Decode label ids to concept names once per row instead of once per lookup.
true_names = label_encoder.inverse_transform(y_test.to_numpy())
top3_names = [list(label_encoder.inverse_transform(ranked)) for ranked in top3]

rows = zip(X_test_texts, y_test.to_numpy(), true_names, top3, top3_names)
for text, true_id, true_name, ranked_ids, ranked_names in rows:
    hit_top1 = true_id == ranked_ids[0]
    hit_top3 = true_id in ranked_ids
    results.append({
        "text": text,
        "true_label": true_name,
        "top1_prediction": ranked_names[0],
        "top3_predictions": ranked_names,
        "correct_in_top3": hit_top3
    })
    # Support is counted for every row; hits only when the true id matched.
    top1_counts[true_name][1] += 1
    top3_counts[true_name][1] += 1
    if hit_top1:
        top1_counts[true_name][0] += 1
    if hit_top3:
        top3_counts[true_name][0] += 1

pd.DataFrame(results).to_csv("svm_sbert_pos_predictions.csv", index=False)
print("Predictions saved to svm_sbert_pos_predictions.csv")

print("\n Per-Category Accuracy:")
print("Category\tTop-1 Acc\tTop-3 Acc\tSupport")
for concept in class_list:
    hits1, support = top1_counts[concept]
    hits3 = top3_counts[concept][0]
    # Concepts with no test rows are printed with accuracy 0.
    acc1 = hits1 / support if support else 0
    acc3 = hits3 / support if support else 0
    print(f"{concept}\t{acc1:.2f}\t\t{acc3:.2f}\t\t{support}")


Encoding SBERT embeddings...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Extracting POS features...
Training linear SVM with POS...
Saved: svm_sbert_pos_model.joblib, svm_label_encoder.joblib, svm_model_config.joblib

 Top-1 Accuracy: 0.341
Top-3 Accuracy: 0.647
Macro P/R/F1: 0.394 / 0.291 / 0.309
Weighted P/R/F1: 0.366 / 0.341 / 0.337

 Classification report (Top-1):
               precision    recall  f1-score   support

      ACCESS      0.500     0.250     0.333         4
        ADEQ      0.154     0.182     0.167        11
         ADR      1.000     1.000     1.000         1
    AUTONOMY      0.417     0.556     0.476        18
      BALANC      0.438     0.538     0.483        13
     COLLECT      0.286     0.333     0.308         6
      COMPAR      1.000     0.250     0.400         4
       COMPL      0.000     0.000     0.000         3
      COURTS      0.000     0.000     0.000         1
      CREATE      0.000     0.000     0.000         5
       DETER      0.000     0.000     0.000         1
    ECONOMIC      0.667     0.667     0.667        2