In [10]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import os
from datasets import load_dataset

In [13]:
# Frame categories used as the label vocabulary. The list is kept in
# alphabetical order; a label vector's i-th entry refers to categories[i].
categories = sorted([
    'Capacity_and_resources',
    'Crime_and_punishment',
    'Cultural_identity',
    'Economic',
    'External_regulation_and_reputation',
    'Fairness_and_equality',
    'Health_and_safety',
    'Legality_Constitutionality_and_jurisprudence',
    'Morality',
    'Policy_prescription_and_evaluation',
    'Political',
    'Public_opinion',
    'Quality_of_life',
    'Security_and_defense',
])

In [11]:
def get_features_from_files(features_filenames):
    """Load the given text files as the "train" split of a `datasets` text dataset.

    Args:
        features_filenames: Paths of the text files to load, passed through
            unchanged as ``data_files``.

    Returns:
        Whatever ``load_dataset`` returns for the "text" builder with
        ``sample_by="document"`` and ``split="train"``.
    """
    loader_options = {
        "data_files": features_filenames,
        # Per the Hugging Face text loader docs this yields one example per
        # file rather than one per line.
        "sample_by": "document",
        "split": "train",
    }
    return load_dataset("text", **loader_options)

def get_dataset(data_dir, prefix='train'):
    """Load the subtask-2 articles of one split together with multi-hot labels.

    Reads ``{data_dir}/{prefix}-labels-subtask-2.txt`` (tab-separated, no
    header: article id, comma-separated category names) and, for every id
    listed there, ``{data_dir}/{prefix}-articles-subtask-2/article{id}.txt``.

    Args:
        data_dir: Directory holding the labels file and the articles folder.
        prefix: File-name prefix selecting the split (the notebook uses
            'train' and 'dev').

    Returns:
        The dataset produced by ``get_features_from_files`` with an added
        ``labels`` column: one 0/1 flag per entry of the module-level
        ``categories`` list, in that list's order.
    """
    features_dir_path = f"{data_dir}/{prefix}-articles-subtask-2"
    labels_path = f"{data_dir}/{prefix}-labels-subtask-2.txt"
    labels = pd.read_csv(labels_path, sep="\t", header=None, names=["ids", "labels"], index_col="ids")

    def parse_label_names(cell):
        # An empty label cell is read by pandas as NaN (a float), which has no
        # .split(); treat it as "no categories" instead of crashing.
        if not isinstance(cell, str):
            return []
        # Strip so that "A, B" is understood the same way as "A,B".
        return [name.strip() for name in cell.split(",")]

    # Encode every row once up front instead of re-reading the DataFrame
    # through .iloc for each example during map().
    label_vectors = [
        [int(cat_name in names) for cat_name in categories]
        for names in map(parse_label_names, labels["labels"])
    ]

    def attach_labels(record, idx):
        # Example idx was loaded from the idx-th filename, which was built from
        # the idx-th row of the labels file, so positions line up.
        return {"labels": label_vectors[idx]}

    features_filenames = [
        os.path.join(features_dir_path, f"article{article_id}.txt")
        for article_id in labels.index
    ]
    features = get_features_from_files(features_filenames)

    return features.map(attach_labels, with_indices=True)

def get_split_dataset(data_dir, split=0.2, seed=42):
    """Load the 'train' data of ``data_dir`` and divide it into two parts.

    Args:
        data_dir: Directory passed to ``get_dataset`` (default 'train' prefix).
        split: Forwarded as the first positional argument of
            ``train_test_split`` (the test-set size in the Hugging Face
            ``datasets`` API).
        seed: Seed forwarded to ``train_test_split``.

    Returns:
        A ``(train, test)`` tuple taken from the "train" and "test" entries of
        the split result.
    """
    parts = get_dataset(data_dir).train_test_split(split, seed=seed)
    train_part = parts["train"]
    held_out_part = parts["test"]
    return train_part, held_out_part

In [14]:
# Sub-folder names under ../data, one per language corpus.
langs = ['it', 'fr', 'en', 'ru', 'ge', 'po']

# Load every 'train' set first, then every 'dev' set, each keyed by its folder name.
datasets_train = dict(
    (lang, get_dataset(f'../data/{lang}/'))
    for lang in langs
)
datasets_eval = dict(
    (lang, get_dataset(f'../data/{lang}/', prefix='dev'))
    for lang in langs
)

Resolving data files:   0%|          | 0/227 [00:00<?, ?it/s]

Using custom data configuration default-cd26c8760bcabd08
Reusing dataset text (/home/alex/.cache/huggingface/datasets/text/default-cd26c8760bcabd08/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/227 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/158 [00:00<?, ?it/s]

Using custom data configuration default-da1269d2792d2579


Downloading and preparing dataset text/default to /home/alex/.cache/huggingface/datasets/text/default-da1269d2792d2579/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /home/alex/.cache/huggingface/datasets/text/default-da1269d2792d2579/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/158 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/433 [00:00<?, ?it/s]

Using custom data configuration default-89d14f648c9b2d22
Reusing dataset text (/home/alex/.cache/huggingface/datasets/text/default-89d14f648c9b2d22/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/433 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/143 [00:00<?, ?it/s]

Using custom data configuration default-04418745a3685caf
Reusing dataset text (/home/alex/.cache/huggingface/datasets/text/default-04418745a3685caf/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/143 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/132 [00:00<?, ?it/s]

Using custom data configuration default-9cd3e09132dee3ea
Reusing dataset text (/home/alex/.cache/huggingface/datasets/text/default-9cd3e09132dee3ea/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/132 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/145 [00:00<?, ?it/s]

Using custom data configuration default-fe116811d8e4089b


Downloading and preparing dataset text/default to /home/alex/.cache/huggingface/datasets/text/default-fe116811d8e4089b/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /home/alex/.cache/huggingface/datasets/text/default-fe116811d8e4089b/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/145 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/76 [00:00<?, ?it/s]

Using custom data configuration default-43f92615f732abcf


Downloading and preparing dataset text/default to /home/alex/.cache/huggingface/datasets/text/default-43f92615f732abcf/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /home/alex/.cache/huggingface/datasets/text/default-43f92615f732abcf/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/76 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/53 [00:00<?, ?it/s]

Using custom data configuration default-9952b19a25e38db2


Downloading and preparing dataset text/default to /home/alex/.cache/huggingface/datasets/text/default-9952b19a25e38db2/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /home/alex/.cache/huggingface/datasets/text/default-9952b19a25e38db2/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/53 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Using custom data configuration default-16b3798cedb69062
Reusing dataset text (/home/alex/.cache/huggingface/datasets/text/default-16b3798cedb69062/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/83 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/48 [00:00<?, ?it/s]

Using custom data configuration default-2e7eda06a4c9eb07
Reusing dataset text (/home/alex/.cache/huggingface/datasets/text/default-2e7eda06a4c9eb07/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/48 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/45 [00:00<?, ?it/s]

Using custom data configuration default-12888632656eb392
Reusing dataset text (/home/alex/.cache/huggingface/datasets/text/default-12888632656eb392/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/45 [00:00<?, ?ex/s]

Resolving data files:   0%|          | 0/49 [00:00<?, ?it/s]

Using custom data configuration default-0e01e8c0449c7a6d


Downloading and preparing dataset text/default to /home/alex/.cache/huggingface/datasets/text/default-0e01e8c0449c7a6d/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /home/alex/.cache/huggingface/datasets/text/default-0e01e8c0449c7a6d/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08. Subsequent calls will reuse this data.


  0%|          | 0/49 [00:00<?, ?ex/s]

In [16]:
# Hold out part of the English 'train' data (default split fraction) as a local evaluation set.
dataset_train_split, dataset_eval_split = get_split_dataset(data_dir="../data/en", seed=42)

Resolving data files:   0%|          | 0/433 [00:00<?, ?it/s]

Using custom data configuration default-89d14f648c9b2d22
Reusing dataset text (/home/alex/.cache/huggingface/datasets/text/default-89d14f648c9b2d22/0.0.0/acc32f2f2ef863c93c2f30c52f7df6cc9053a1c2230b8d7da0d210404683ca08)


  0%|          | 0/433 [00:00<?, ?ex/s]

In [17]:
# Model outputs and gold label matrices, keyed by dataset name
# ('split' for the held-out English split, otherwise a language code).
predictions = {}
references = {}

# Word-level unigram + bigram counts feeding one independent linear SVM per
# category (MultiOutputClassifier fits a separate SVC for each label column).
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (1, 2), analyzer='word')),
    ('SVM_multiclass', MultiOutputClassifier(SVC(class_weight= None, C=1, kernel='linear'), n_jobs=1))
])

In [18]:
# Gold labels of the held-out English split, stored as a 2-D array for scoring.
references['split'] = np.array(dataset_eval_split['labels'])

# Train on the remaining English data and predict the held-out part.
pipeline.fit(
    dataset_train_split['text'],
    dataset_train_split['labels'],
)
predictions['split'] = pipeline.predict(dataset_eval_split['text'])

In [19]:
# Refit the same pipeline from scratch on each language's 'train' set and
# evaluate it on that language's 'dev' set.
for lang in langs:
    train_ds, eval_ds = datasets_train[lang], datasets_eval[lang]
    pipeline.fit(train_ds['text'], train_ds['labels'])
    predictions[lang] = pipeline.predict(eval_ds['text'])
    references[lang] = np.array(eval_ds['labels'])

# Scores

In [20]:
# Micro- and macro-averaged F1 per evaluated dataset, best MicroF1 first.
# Predictions and references are paired by key rather than by zipping the two
# dicts' values, so the pairing cannot drift if their insertion orders differ.
scores = pd.DataFrame({
    "Lang": list(predictions),
    "MicroF1": [
        f1_score(references[name], predictions[name], average="micro")
        for name in predictions
    ],
    "MacroF1": [
        f1_score(references[name], predictions[name], average="macro")
        for name in predictions
    ],
}).sort_values(by="MicroF1", ascending=False)
scores

Unnamed: 0,Lang,MicroF1,MacroF1
0,split,0.616949,0.412986
3,en,0.605452,0.397523
6,po,0.592233,0.478109
5,ge,0.50625,0.33505
1,it,0.430839,0.334218
2,fr,0.380531,0.272933
4,ru,0.215827,0.131752
