In [1]:
# --- Preprocessing (kept simple) ---
import re

#!pip install stopwordsiso
import stopwordsiso

#nltk.download('stopwords')

def don(text):
    """Identity preprocessor: return ``text`` exactly as it was received."""
    unchanged = text
    return unchanged
    
def lower(text):
    """Return ``text`` converted to lowercase via its ``lower()`` method."""
    lowered = text.lower()
    return lowered

# Matches "http://..." / "https://..." links as well as bare "www...." links,
# each running up to the next whitespace character.
URL_RE = re.compile(r"https?://\S+|www\.\S+")
def remove_urls(text):
    """Return ``text`` with every ``URL_RE`` match deleted.

    Whitespace surrounding a removed URL is left in place.
    """
    return re.sub(URL_RE, "", text)
    
def replace_urls(text):
    """Return ``text`` with every ``URL_RE`` match swapped for a `` <URL> `` placeholder.

    The placeholder carries one space on each side, so it stays separated
    from neighbouring characters.
    """
    placeholder = " <URL> "
    return URL_RE.sub(placeholder, text)
    
# Any single character that is neither a word character (letters, digits,
# underscore) nor whitespace.
PUNCT_RE = re.compile(r"[^\w\s]")
def remove_punct(text):
    """Return ``text`` with each ``PUNCT_RE`` match replaced by one space."""
    return re.sub(PUNCT_RE, " ", text)

# Tokenizer: a run of word characters, or one single non-word, non-space
# character (so punctuation marks survive as their own tokens).
TOKEN_PUNC = re.compile(r"\w+|[^\w\s]")

# Language codes whose stoplists are merged when ``lg="all"``.
_ALL_STOPWORD_LANGS = ('bg', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
                       'fi', 'fr', 'hu', 'it', 'lt', 'lv', 'mt', 'nl',
                       'pl', 'pt', 'ro', 'sk', 'sl', 'sv')

# Stoplists already built, keyed by the language selection, so the lookup
# into stopwordsiso happens once per selection instead of once per text.
_STOPLIST_CACHE = {}


def _get_stoplist(lg):
    """Return the cached frozenset of stopwords for the selection ``lg``."""
    key = lg if isinstance(lg, str) else tuple(lg)
    if key not in _STOPLIST_CACHE:
        if key == "all":
            langs = list(_ALL_STOPWORD_LANGS)
        elif isinstance(key, str):
            langs = [key]
        else:
            langs = list(key)
        _STOPLIST_CACHE[key] = frozenset(stopwordsiso.stopwords(langs))
    return _STOPLIST_CACHE[key]


def remove_stopwords(text, lg="all"):
    """Drop stopword tokens from ``text`` while keeping punctuation tokens.

    The text is split with ``TOKEN_PUNC``; tokens found in the stoplist are
    removed and the remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.
    lg : str or iterable of str, default "all"
        ``"all"`` merges the stoplists of every code in
        ``_ALL_STOPWORD_LANGS``. A single language code (e.g. ``"en"``) or an
        iterable of codes restricts the stoplist to those languages.
        NOTE(review): how stopwordsiso treats an unknown code is not visible
        here -- confirm before relying on it.

    Returns
    -------
    str
        The surviving tokens joined by one space.

    Notes
    -----
    Matching is an exact, case-sensitive membership test, so a capitalised
    token is only removed if that exact form is in the stoplist.
    """
    # Built once per language selection; membership tests are then O(1).
    stoplist = _get_stoplist(lg)
    return " ".join(tok for tok in TOKEN_PUNC.findall(text) if tok not in stoplist)

def compose(*funcs):
    """Build a preprocessor that applies ``funcs`` in order, left to right.

    After the last function has run, every run of whitespace in the result
    is collapsed to a single space and leading/trailing whitespace is
    stripped. With no functions given, only that whitespace clean-up happens.
    """
    def pipeline(text):
        result = text
        for step in funcs:
            result = step(result)
        squeezed = re.sub(r"\s+", " ", result)
        return squeezed.strip()
    return pipeline

# Registry of preprocessing recipes, keyed by the short label that is printed
# and stored with each result. Key components: DON = don, LOW = lower,
# URLrem = remove_urls, URLrep = replace_urls, PUN = remove_punct,
# RSW = remove_stopwords. Entries built with compose() also get whitespace
# collapsed and stripped; the bare functions do not.
# Commented-out entries are recipes that are currently disabled.
PREPROCESSORS = {
    "DON": don,
    "LOW": lower,
    #"URLrem": remove_urls,
    #"URLrep": replace_urls,
    "PUN": remove_punct,
    "RSW": remove_stopwords,
    #"LOW+URLrem": compose(lower, remove_urls),
    #"LOW+URLrep": compose(lower, replace_urls),
    #"LOW+PUN": compose(lower, remove_punct),
    "LOW+URLrem+PUN": compose(lower, remove_urls, remove_punct),
    #"LOW+URLrep+PUN": compose(lower, replace_urls, remove_punct),  # was remove_urls, which contradicted the "URLrep" label
    "LOW+URLrem+PUN+RSW": compose(lower, remove_urls, remove_punct, remove_stopwords),
    #"LOW+URLrep+PUN+RSW": compose(lower, replace_urls, remove_punct, remove_stopwords),

}
# Quick sanity check of remove_stopwords on a sentence mixing English and French.
# Stoplist matching is an exact string comparison, so capitalised tokens are
# only dropped if that exact form is listed (the recorded output keeps "I").
multi_string = "I am travelling to Nancy for an NLP course à l'Université de Lorraine"
print(multi_string)
print(remove_stopwords(multi_string))


I travelling Nancy NLP ' Université Lorraine


  import pkg_resources


In [2]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
from pathlib import Path
import numpy as np
import pandas as pd

# Seed shared by the cross-validation splitter and the classifier.
SEED = 42
# CSV file the final results table is written to.
OUTFILE = Path("results_DiagLang.csv")

# f1_score with average="macro", wrapped as a scorer object.
macro_f1 = make_scorer(f1_score, average="macro")
# Metrics requested from cross_validate; the dict keys name the result columns.
SCORING = {"acc": "accuracy", "macro_f1": macro_f1}

def mean_scores(scores):
    """Average the per-fold test scores of a cross-validation result.

    Parameters
    ----------
    scores : dict
        Mapping from result name to a sequence of per-fold values. Only keys
        that start with ``"test_"`` are kept.

    Returns
    -------
    dict
        ``{metric_name: mean}`` where ``metric_name`` is the key with its
        leading ``"test_"`` removed and ``mean`` is a plain Python float.
    """
    prefix = "test_"
    # Slice off only the leading prefix: str.replace would also delete any
    # later "test_" inside the metric name (e.g. "test_latest_f1" -> "laf1").
    return {
        key[len(prefix):]: float(np.mean(values))
        for key, values in scores.items()
        if key.startswith(prefix)
    }
    
# Classifier used as the final pipeline step in every experiment.
MODEL = LogisticRegression(max_iter=2000, random_state=SEED)

def evaluate(X, y, preprocess, vectorizer):
    """Cross-validate one (preprocessing, vectorizer) combination.

    Parameters
    ----------
    X : iterable
        Raw documents; ``preprocess`` is applied to each one up front.
    y : sequence
        Labels aligned with ``X``; also used to stratify the folds.
    preprocess : callable
        Function applied to every document before vectorization.
    vectorizer : estimator
        Feature extractor placed ahead of ``MODEL`` in the pipeline.

    Returns
    -------
    dict
        Mean test score per metric in ``SCORING``, as produced by
        ``mean_scores``.
    """
    texts = list(map(preprocess, X))
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    pipeline = Pipeline(steps=[("vect", vectorizer), ("clf", MODEL)])
    cv_results = cross_validate(
        pipeline, texts, y, scoring=SCORING, cv=folds, n_jobs=-1
    )
    return mean_scores(cv_results)

In [3]:
#! unzip corpus_multi.zip
import json
# The corpus mixes many languages (the labels include e.g. 'bg' and 'el'), so
# decode it explicitly as UTF-8 instead of relying on the platform default
# encoding, which is not UTF-8 everywhere.
with open("corpus_multi.json", encoding="utf-8") as f:
    json_data = json.load(f)

# Each record is indexed as [0] -> sample and [1] -> label.
X, y = [x[0] for x in json_data], [x[1] for x in json_data]
print("Samples:", len(X), "Labels:", sorted(set(y)))

Samples: 5984 Labels: ['bg', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi', 'fr', 'hu', 'it', 'lt', 'lv', 'mt', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sv']


In [14]:
import warnings
# NOTE(review): this silences every warning from here on, not only the ones
# raised by the experiments below.
warnings.filterwarnings('ignore')
# Feature extractors under comparison, keyed by the label stored with each
# result (<weighting>_<analyzer>_<min n>-<max n>). All of them keep the input
# casing (lowercase=False), so case handling is decided by the preprocessor
# alone, and all are capped at 1000 features.
# Commented-out entries are variants that are currently disabled.
VECTORIZERS = {
    "count_word_1-1": CountVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False, max_features= 1000),
    #"count_word_1-2": CountVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False, max_features= 1000),
    "tfidf_word_1-1": TfidfVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False, max_features= 1000),
    #"tfidf_word_1-2": TfidfVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False, max_features= 1000),
    "count_char_3-5": CountVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False, max_features= 1000),
    #"count_charwb_3-5": CountVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False, max_features= 1000),
    "tfidf_char_3-5": TfidfVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False, max_features= 1000),
    #"tfidf_charwb_3-5": TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False, max_features= 1000),
}

# Grid of experiments: every preprocessing recipe crossed with every
# vectorizer. One result record is collected per combination.
rows = []
for prep_name in PREPROCESSORS:
    prep = PREPROCESSORS[prep_name]
    for vec_name in VECTORIZERS:
        vec = VECTORIZERS[vec_name]
        print(f"Prep: {prep_name:12s} | Vec: {vec_name}")
        res = evaluate(X, y, prep, vec)
        print(res)
        # Identify the run first, then merge in its metric columns.
        row = dict(preprocessing=prep_name, vectorizer=vec_name)
        row.update(res)
        rows.append(row)

Prep: DON          | Vec: count_word_1-1


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


KeyboardInterrupt: 

In [13]:
# Results table, best macro-F1 first.
# NOTE(review): relies on `rows` having been filled by the grid-search cell;
# run that cell to completion before this one.
df = pd.DataFrame(rows).sort_values("macro_f1", ascending=False)
print(df)  # Open question: if "RSW" improves results, is it because of English?

# Persist the table (without the index column) for later comparison.
df.to_csv(OUTFILE, index=False)
print(f"Saved to: {OUTFILE}")

   preprocessing      vectorizer       acc  macro_f1
16           RSW  count_word_1-1  0.996658  0.996659
8            URL  count_word_1-1  0.995488  0.995507
0            DON  count_word_1-1  0.995321  0.995342
12           PUN  count_word_1-1  0.995321  0.995342
14           PUN  count_char_3-5  0.995154  0.995170
4            LOW  count_word_1-1  0.994987  0.995022
10           URL  count_char_3-5  0.994820  0.994829
6            LOW  count_char_3-5  0.994486  0.994507
2            DON  count_char_3-5  0.994485  0.994487
17           RSW  tfidf_word_1-1  0.994151  0.994262
1            DON  tfidf_word_1-1  0.988636  0.989112
13           PUN  tfidf_word_1-1  0.988636  0.989112
9            URL  tfidf_word_1-1  0.988469  0.988944
5            LOW  tfidf_word_1-1  0.986797  0.987434
15           PUN  tfidf_char_3-5  0.985294  0.986151
7            LOW  tfidf_char_3-5  0.984959  0.985867
11           URL  tfidf_char_3-5  0.984960  0.985861
3            DON  tfidf_char_3-5  0.984792  0.

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/ceres/.cache/kagglehub/datasets/azimulh/tweets-data-for-authorship-attribution-modelling/versions/2


['/home/ceres/.cache/kagglehub/datasets/azimulh/tweets-data-for-authorship-attribution-modelling/versions/2/tweet.csv', '/home/ceres/.cache/kagglehub/datasets/azimulh/tweets-data-for-authorship-attribution-modelling/versions/2/tweet_with_authors.csv']


                   author                                              tweet
0     Neil deGrasse Tyson  A 50-yard field goal in MetLife stadium will d...
1     Neil deGrasse Tyson  @PrintingJesus Yup. I occasionally repost afte...
2     Neil deGrasse Tyson  @slstroud1 @TylerPhernetton False as stated. M...
3     Neil deGrasse Tyson  The next time anybody asks me about my religio...
4     Neil deGrasse Tyson  As climate change reshapes the World’s coastli...
...                   ...                                                ...
9903         Barack Obama  "Progress isn’t guaranteed. It’s not inevitabl...
9904         Barack Obama  America needs a budget that builds a stronger ...
9905         Barack Obama  RT @WhiteHouse: FACT: Since @POTUS took office...
9906         Barack Obama  LIVE: President Obama is speaking at the #WHCD...
9907         Barack Obama  Of course, @MichelleObama’s my wife, so I’m a ...

[9908 rows x 2 columns]


0       Neil deGrasse Tyson
1       Neil deGrasse Tyson
2       Neil deGrasse Tyson
3       Neil deGrasse Tyson
4       Neil deGrasse Tyson
               ...         
9903           Barack Obama
9904           Barack Obama
9905           Barack Obama
9906           Barack Obama
9907           Barack Obama
Name: author, Length: 9908, dtype: object


{'Sebastian Ruder', 'Ellen DeGeneres', 'Barack Obama', 'Neil deGrasse Tyson', 'KATY PERRY'}


Prep: DON          | Vec: count_word_1-1
{'acc': 0.8767650398258734, 'macro_f1': 0.8766916963023161}
Prep: DON          | Vec: tfidf_word_1-1
{'acc': 0.8706095393625924, 'macro_f1': 0.8688717054716129}
Prep: DON          | Vec: count_char_3-5
{'acc': 0.9173392944374179, 'macro_f1': 0.9171911709273759}
Prep: DON          | Vec: tfidf_char_3-5
{'acc': 0.9065400314083695, 'macro_f1': 0.9057020681441363}
Prep: LOW          | Vec: count_word_1-1
{'acc': 0.874242997680793, 'macro_f1': 0.8741392877735237}
Prep: LOW          | Vec: tfidf_word_1-1
{'acc': 0.870710345660159, 'macro_f1': 0.8688157437746675}
Prep: LOW          | Vec: count_char_3-5
{'acc': 0.9152198152886324, 'macro_f1': 0.9150822527896427}
Prep: LOW          | Vec: tfidf_char_3-5
{'acc': 0.9053289805116315, 'macro_f1': 0.9044079103264464}
Prep: PUN          | Vec: count_word_1-1
{'acc': 0.8767650398258734, 'macro_f1': 0.8766916963023161}
Prep: PUN          | Vec: tfidf_word_1-1
{'acc': 0.8706095393625924, 'macro_f1': 0.8688717054

         preprocessing      vectorizer       acc  macro_f1
14                 RSW  count_char_3-5  0.921074  0.921045
2                  DON  count_char_3-5  0.917339  0.917191
6                  LOW  count_char_3-5  0.915220  0.915082
15                 RSW  tfidf_char_3-5  0.906843  0.906653
3                  DON  tfidf_char_3-5  0.906540  0.905702
7                  LOW  tfidf_char_3-5  0.905329  0.904408
10                 PUN  count_char_3-5  0.895135  0.895028
11                 PUN  tfidf_char_3-5  0.886152  0.885044
0                  DON  count_word_1-1  0.876765  0.876692
8                  PUN  count_word_1-1  0.876765  0.876692
19      LOW+URLrem+PUN  tfidf_char_3-5  0.876968  0.876450
4                  LOW  count_word_1-1  0.874243  0.874139
18      LOW+URLrem+PUN  count_char_3-5  0.873638  0.873579
9                  PUN  tfidf_word_1-1  0.870610  0.868872
1                  DON  tfidf_word_1-1  0.870610  0.868872
5                  LOW  tfidf_word_1-1  0.870710  0.8688