# NLP Lab â€“ Minimal Preprocessing Experiments (Single Task)

This notebook runs a tiny grid of preprocessing experiments on **one** supervised text classification task
and saves results to `SEED = 42
OUTFILE = Path("results_autorship-attribution.csv")`.


In [22]:
import re
from pathlib import Path

import numpy 
import pandas

from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score


In [23]:
# --- Preprocessing (kept simple) ---

!pip install stopwordsiso
import stopwordsiso

def don(text): 
    return text
    
def lower(text): 
    return text.lower()

URL_RE = re.compile(r"https?://\S+|www\.\S+")
def remove_urls(text): 
    return URL_RE.sub(" ", text)
    
def replace_urls(text): 
    return URL_RE.sub(" <URL> ", text)
    
PUNCT_RE = re.compile(r"[^\w\s]")
def remove_punct(text): 
    return PUNCT_RE.sub(" ", text)

TOKEN_PUNC = re.compile(r"\w+|[^\w\s]")
def remove_stopwords(text):
    #very slow but keeps punctuation
    
    return " ".join([T for T in TOKEN_PUNC.findall(text) if T not in set(stopwordsiso.stopwords("en"))])

    
def compose(*funcs):
    def f(text):
        for fn in funcs:
            text = fn(text)
        return re.sub(r"\s+", " ", text).strip()
    return f

PREPROCESSORS = {
    "DON": don,
    "LOW": lower,
    "URLrem": remove_urls,
    "URLrep": replace_urls,
    "PUN": remove_punct,
    "RSW": remove_stopwords,
    "LOW+URLrem": compose(lower, remove_urls),
    "LOW+URLrep": compose(lower, replace_urls),
    "LOW+PUN": compose(lower, remove_punct),
    "LOW+URLrem+PUN": compose(lower, remove_urls, remove_punct),
    "LOW+URLrep+PUN": compose(lower, remove_urls, remove_punct),
    "LOW+URLrem+PUN+RSW": compose(lower, remove_urls, remove_punct, remove_stopwords),
    "LOW+URLrep+PUN+RSW": compose(lower, replace_urls, remove_punct, remove_stopwords),

}
toto = "I am travelling to Nancy for an NLP course at IDMC :https://idmc.univ-lorraine.fr/" 
for prep_name, prep in PREPROCESSORS.items():
    print(f"Prep: {prep_name}")
    print(prep(toto))


Prep: DON
I am travelling to Nancy for an NLP course at IDMC :https://idmc.univ-lorraine.fr/
Prep: LOW
i am travelling to nancy for an nlp course at idmc :https://idmc.univ-lorraine.fr/
Prep: URLrem
I am travelling to Nancy for an NLP course at IDMC : 
Prep: URLrep
I am travelling to Nancy for an NLP course at IDMC : <URL> 
Prep: PUN
I am travelling to Nancy for an NLP course at IDMC  https   idmc univ lorraine fr 
Prep: RSW
I travelling Nancy NLP IDMC : https : / / idmc . univ - lorraine . /
Prep: LOW+URLrem
i am travelling to nancy for an nlp course at idmc :
Prep: LOW+URLrep
i am travelling to nancy for an nlp course at idmc : <URL>
Prep: LOW+PUN
i am travelling to nancy for an nlp course at idmc https idmc univ lorraine fr
Prep: LOW+URLrem+PUN
i am travelling to nancy for an nlp course at idmc
Prep: LOW+URLrep+PUN
i am travelling to nancy for an nlp course at idmc
Prep: LOW+URLrem+PUN+RSW
travelling nancy nlp idmc
Prep: LOW+URLrep+PUN+RSW
travelling nancy nlp idmc URL


In [24]:
#!pip install kagglehub
import kagglehub

path = kagglehub.dataset_download("azimulh/tweets-data-for-authorship-attribution-modelling")

print("Local Path to dataset files:", path)
import glob
csv_data = pandas.read_csv(f"{path}/tweet_with_authors.csv")

X = csv_data["tweet"]
y = csv_data["author"]

print("Samples:", len(X), "Classes:", set(y) )


Local Path to dataset files: /home/ceres/.cache/kagglehub/datasets/azimulh/tweets-data-for-authorship-attribution-modelling/versions/2
Samples: 9908 Classes: {'Sebastian Ruder', 'KATY PERRY', 'Ellen DeGeneres', 'Barack Obama', 'Neil deGrasse Tyson'}


In [25]:
#Evaluation part
macro_f1 = make_scorer(f1_score, average="macro")
SCORING = {"acc": "accuracy", "macro_f1": macro_f1}

def mean_scores(scores):
    return {k.replace("test_", ""): float(numpy.mean(v))
            for k, v in scores.items() if k.startswith("test_")}

def evaluate(X, y, preprocess, vectorizer):
    Xp = [preprocess(t) for t in X]
    pipe = Pipeline([
        ("vect", vectorizer),
        ("clf", MODEL),
    ])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    scores = cross_validate(pipe, Xp, y, cv=cv, scoring=SCORING, n_jobs=-1)
    return mean_scores(scores)


In [26]:
#Partie apprentissage
MODEL = LogisticRegression(max_iter=2000, random_state=SEED)
SEED = 42
OUTFILE = Path("results_autorship-attribution.csv")

import os
os.environ["PYTHONWARNINGS"] = "ignore:pkg_resources is deprecated as an API:UserWarning"


VECTORIZERS = {
    "count_word_1-1": CountVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False),
    #"count_word_1-2": CountVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False),
    "tfidf_word_1-1": TfidfVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False),
    #"tfidf_word_1-2": TfidfVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False),
    "count_char_3-5": CountVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False),
    #"count_charwb_3-5": CountVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False),
    "tfidf_char_3-5": TfidfVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False),
    #"tfidf_charwb_3-5": TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False),
}

rows = []
for prep_name, prep in PREPROCESSORS.items():
    for vec_name, vec in VECTORIZERS.items():
        print(f"Prep: {prep_name:12s} | Vec: {vec_name}")
        res = evaluate(X, y, prep, vec)
        print(res)
        rows.append({
            "preprocessing": prep_name,
            "vectorizer": vec_name,
            **res
        })




Prep: DON          | Vec: count_word_1-1
{'acc': 0.8767650398258734, 'macro_f1': 0.8766916963023161}
Prep: DON          | Vec: tfidf_word_1-1
{'acc': 0.8706095393625924, 'macro_f1': 0.8688717054716129}
Prep: DON          | Vec: count_char_3-5
{'acc': 0.9173392944374179, 'macro_f1': 0.9171911709273759}
Prep: DON          | Vec: tfidf_char_3-5
{'acc': 0.9065400314083695, 'macro_f1': 0.9057020681441363}
Prep: LOW          | Vec: count_word_1-1
{'acc': 0.874242997680793, 'macro_f1': 0.8741392877735237}
Prep: LOW          | Vec: tfidf_word_1-1
{'acc': 0.870710345660159, 'macro_f1': 0.8688157437746675}
Prep: LOW          | Vec: count_char_3-5
{'acc': 0.9152198152886324, 'macro_f1': 0.9150822527896427}
Prep: LOW          | Vec: tfidf_char_3-5
{'acc': 0.9053289805116315, 'macro_f1': 0.9044079103264464}
Prep: URLrem       | Vec: count_word_1-1
{'acc': 0.8661674403299561, 'macro_f1': 0.8663700245342051}
Prep: URLrem       | Vec: tfidf_word_1-1
{'acc': 0.8600112776727041, 'macro_f1': 0.8590353442

In [7]:
df = pd.DataFrame(rows).sort_values("macro_f1", ascending=False)
print(df)

         preprocessing      vectorizer       acc  macro_f1
49  LOW+URLrep+PUN+RSW  tfidf_word_1-1  0.912542  0.912576
45  LOW+URLrem+PUN+RSW  tfidf_word_1-1  0.912542  0.912576
21                 RSW  tfidf_word_1-1  0.904746  0.905020
47  LOW+URLrem+PUN+RSW  tfidf_char_3-5  0.901017  0.900910
51  LOW+URLrep+PUN+RSW  tfidf_char_3-5  0.898644  0.898668
5                  LOW  tfidf_word_1-1  0.892542  0.892805
33             LOW+PUN  tfidf_word_1-1  0.892542  0.892805
25          LOW+URLrem  tfidf_word_1-1  0.892542  0.892801
37      LOW+URLrem+PUN  tfidf_word_1-1  0.892542  0.892801
41      LOW+URLrep+PUN  tfidf_word_1-1  0.892542  0.892801
29          LOW+URLrep  tfidf_word_1-1  0.892203  0.892461
39      LOW+URLrem+PUN  tfidf_char_3-5  0.891864  0.892094
43      LOW+URLrep+PUN  tfidf_char_3-5  0.891864  0.892094
19                 PUN  tfidf_char_3-5  0.891864  0.892075
35             LOW+PUN  tfidf_char_3-5  0.891186  0.891393
48  LOW+URLrep+PUN+RSW  count_word_1-1  0.890169  0.8906

In [8]:
df.to_csv(OUTFILE, index=False)
print(f"Saved to: {OUTFILE}")


Saved to: results_single_task_3_classes.csv


In [5]:
MODEL = LogisticRegression(max_iter=200, random_state=SEED)

In [7]:
full_data = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)
X, y = full_data.data, full_data.target
print("Samples:", len(X), "Classes:", set(y), "Labels:", full_data.target_names)

VECTORIZERS = {
    #"count_word_1-1": CountVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False),
    #"count_word_1-2": CountVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False),
    "tfidf_word_1-1": TfidfVectorizer(analyzer="word", ngram_range=(1, 1), lowercase=False),
    #"tfidf_word_1-2": TfidfVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=False),
    #"count_char_3-5": CountVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False),
    #"count_charwb_3-5": CountVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False),
    #"tfidf_char_3-5": TfidfVectorizer(analyzer="char", ngram_range=(3, 5), lowercase=False),
    #"tfidf_charwb_3-5": TfidfVectorizer(analyzer="char_wb", ngram_range=(3, 5), lowercase=False),
}

rows_full = []
for prep_name, prep in PREPROCESSORS.items():
    for vec_name, vec in VECTORIZERS.items():
        print(f"Prep: {prep_name:12s} | Vec: {vec_name}")
        res = evaluate(X, y, prep, vec)
        print(res)
        rows_full.append({
            "preprocessing": prep_name,
            "vectorizer": vec_name,
            **res
        })

Samples: 18846 Classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19} Labels: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Prep: LOW          | Vec: tfidf_word_1-1


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)

{'acc': 0.7311891016550626, 'macro_f1': 0.7195677702933339}
Prep: URLrem       | Vec: tfidf_word_1-1
{'acc': 0.7207887886168962, 'macro_f1': 0.7101215641322157}
Prep: URLrep       | Vec: tfidf_word_1-1
{'acc': 0.7208949175635665, 'macro_f1': 0.7102590301600126}
Prep: PUN          | Vec: tfidf_word_1-1
{'acc': 0.7203642728302155, 'macro_f1': 0.7095112650794365}
Prep: RSW          | Vec: tfidf_word_1-1
{'acc': 0.7391483363161573, 'macro_f1': 0.7282771443235193}
Prep: LOW+URLrem   | Vec: tfidf_word_1-1
{'acc': 0.7310299363859716, 'macro_f1': 0.7196131318136788}
Prep: LOW+URLrep   | Vec: tfidf_word_1-1
{'acc': 0.7303400560062439, 'macro_f1': 0.7187658683953233}
Prep: LOW+PUN      | Vec: tfidf_word_1-1
{'acc': 0.7311891016550626, 'macro_f1': 0.7195677702933339}
Prep: LOW+URLrem+PUN | Vec: tfidf_word_1-1
{'acc': 0.7310299363859716, 'macro_f1': 0.7196131318136788}
Prep: LOW+URLrep+PUN | Vec: tfidf_word_1-1
{'acc': 0.7310299363859716, 'macro_f1': 0.7196131318136788}
Prep: LOW+URLrem+PUN+RSW | 

In [8]:
df = pd.DataFrame(rows_full).sort_values("macro_f1", ascending=False)
print(df)

         preprocessing      vectorizer       acc  macro_f1
10  LOW+URLrem+PUN+RSW  tfidf_word_1-1  0.745516  0.734555
11  LOW+URLrep+PUN+RSW  tfidf_word_1-1  0.745463  0.734509
4                  RSW  tfidf_word_1-1  0.739148  0.728277
5           LOW+URLrem  tfidf_word_1-1  0.731030  0.719613
8       LOW+URLrem+PUN  tfidf_word_1-1  0.731030  0.719613
9       LOW+URLrep+PUN  tfidf_word_1-1  0.731030  0.719613
0                  LOW  tfidf_word_1-1  0.731189  0.719568
7              LOW+PUN  tfidf_word_1-1  0.731189  0.719568
6           LOW+URLrep  tfidf_word_1-1  0.730340  0.718766
2               URLrep  tfidf_word_1-1  0.720895  0.710259
1               URLrem  tfidf_word_1-1  0.720789  0.710122
3                  PUN  tfidf_word_1-1  0.720364  0.709511


In [9]:
OUTFILE2 = "results_20_classes.csv"
df.to_csv(OUTFILE2, index=False)
print(f"Saved to: {OUTFILE2}")

Saved to: results_20_classes.csv
