In [14]:
import os

import nltk
from nltk.tokenize import word_tokenize
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [15]:
# Random seed handed to the train/test split so results are reproducible.
seed = 42

# Switches between the local dataset directory (False) and the evaluation one (True);
# the submission cell at the end of the notebook only runs when this is True.
EVALUATION = False
if EVALUATION:
    root_path = "/bohr/train-t05i/v2"
else:
    root_path = "/home/stefan/Downloads/apoai2025-nlp"

# Add the dataset's "punkt" directory to NLTK's resource search path.
nltk.data.path.append(root_path + "/punkt")

# Data preparation

In [16]:
# Shared label encoder: prep_df fits it on the "category" column, and the
# submission cell reuses it to turn predicted integers back into category names.
encoder = LabelEncoder()

def clean_text(text):
    """Return ``text`` unchanged.

    Currently an identity function; it is the single hook through which every
    document passes before vectorization, so preprocessing can be added here.
    """
    cleaned = text
    return cleaned

def prep_df(csv_path: str):
    """Load a news CSV and return its texts and encoded labels.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file with a "text" column and, optionally, a
        "category" column.

    Returns
    -------
    X : pandas.Series
        The "text" column after ``clean_text`` has been applied to each entry.
    y : array of int, or None
        Labels produced by fitting the module-level ``encoder`` on the
        "category" column, or ``None`` when the file has no such column.
    """
    df = pd.read_csv(csv_path)

    X = df["text"].apply(clean_text)

    # Default to None so unlabeled files no longer hit an UnboundLocalError
    # at the return statement.
    y = None
    if "category" in df.columns:
        # NOTE: fit_transform refits the shared encoder on every call.
        y = encoder.fit_transform(df["category"])

    return X, y

In [17]:
# Load the training CSV: X holds the texts, y the encoded category labels.
X, y = prep_df(f"{root_path}/train_news.csv")

# Model selection

In [18]:
# Hold out 33% of the samples for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

In [19]:
# TF-IDF over unigrams and bigrams: English stop words removed, terms appearing in
# more than 70% of documents or in fewer than 2 documents dropped, vocabulary
# capped at 20,000 features.
vectorizer = TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1,2), max_df=0.7, min_df=2)

# Fit the vocabulary on the training texts only, then reuse it for the held-out texts.
# NOTE: X_train / X_test are rebound from raw text to TF-IDF matrices here, so this
# cell must run exactly once after the split cell.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [20]:
# Train a logistic-regression classifier on the TF-IDF features; fit() returns
# the estimator itself, so the call is chained onto the constructor.
model = LogisticRegression(solver='lbfgs', C=10).fit(X_train, y_train)

# Mean accuracy on the held-out split (shown as the cell output).
model.score(X_test, y_test)

0.9787878787878788

# Submission

In [21]:
if EVALUATION:
    # Index os.environ directly so an unset DATA_PATH fails with a KeyError naming
    # the variable instead of an opaque "NoneType + str" TypeError.
    data_path = os.environ["DATA_PATH"] + "/"
    test_file_path = data_path + "test_news_nolabel.csv"

    test_df = pd.read_csv(test_file_path)
    # Fill missing texts BEFORE casting to str: casting first turns NaN into the
    # literal string "nan", which fillna can no longer detect.
    test_texts = test_df["text"].fillna('').astype(str).apply(clean_text)

    # Reuse the vectorizer and classifier fitted earlier in the notebook.
    X_test_tfidf = vectorizer.transform(test_texts)
    predicted_numerical_labels = model.predict(X_test_tfidf)

    # Map the integer predictions back to the original category names.
    predicted_string_labels = encoder.inverse_transform(predicted_numerical_labels)
    test_df["category"] = predicted_string_labels

    # Write the test frame, now including the predicted "category" column.
    output_path = "submission.csv"
    test_df.to_csv(output_path, index=False)