In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LogisticRegression

In [15]:
# Random seed passed to train_test_split so the train/hold-out partition is reproducible.
seed = 42

# Data prep

In [16]:
def clean_df(df):
    """Return a copy of ``df`` with missing ``title``/``content`` set to "".

    The input frame is left untouched, so calling this function repeatedly on
    the same raw frame always yields the same result and never alters data
    that another cell may still be displaying or reusing.

    Args:
        df: DataFrame containing at least the ``title`` and ``content`` columns.

    Returns:
        A new DataFrame with the same columns and index as ``df`` in which
        missing values of ``title`` and ``content`` are replaced by the empty
        string. All other columns are copied unchanged.

    Raises:
        KeyError: If ``title`` or ``content`` is not a column of ``df``.
    """
    text_columns = ["title", "content"]

    # Work on a copy: assigning into the caller's frame would mutate it in place.
    cleaned = df.copy()
    cleaned[text_columns] = cleaned[text_columns].fillna(value="")

    return cleaned

def prep_features(df: pd.DataFrame):
    """Build the model's text input from the title and content columns.

    Stores ``title + " " + content`` in a ``text`` column on ``df`` (the frame
    is modified in place) and returns that column.

    Args:
        df: DataFrame with ``title`` and ``content`` columns.

    Returns:
        The ``text`` column of ``df``.
    """
    title, content = df["title"], df["content"]
    df["text"] = title + " " + content
    return df["text"]

In [17]:
# Read the labelled training set and replace missing title/content with "".
df = clean_df(pd.read_csv("train.csv"))

# One text value per row (title + " " + content); this also adds a `text` column to df.
df_train = prep_features(df)

In [18]:
# Preview the first combined title + content strings.
df_train.head()

0    PSD în alertă Prăbușirea PSD de la altitudinea...
1    În amintirea Vioricăi, milioane de români beau...
2    Dramă! Când credea că nu se poate mai rău, un ...
3    Spania - România, 5-0. „Tricolorii”, îngenunch...
4    Campanie electorală, veselie generală Toate cr...
Name: text, dtype: object

# EDA

In [19]:
# Missing-value count per column, largest first. title/content were already
# filled with "" by clean_df, so they are expected to report 0 here.
df.isna().sum().sort_values(ascending=False)

id         0
title      0
content    0
class      0
text       0
dtype: int64

# Model selection

In [20]:
# Hold out 33% of the labelled rows for a final check; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    df["class"],
    test_size=0.33,
    random_state=seed,
)

In [21]:
# TF-IDF over single tokens only (ngram_range=(1, 1)).
vectorizer = TfidfVectorizer(ngram_range=(1, 1))

# Fit the vectorizer on the training split only, then apply the same fitted
# transform to the held-out split.
# NOTE(review): X_train / X_test are overwritten with their vectorized form, so
# this cell cannot be re-run on its own — re-run the split cell first.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [22]:
def evaluate(clf):
    """Score a classifier by cross-validation and on the held-out split.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. As a side effect, ``clf`` is left fitted on ``X_train``.

    NOTE(review): ``X_train`` is expected to be vectorized already when this
    runs, so the TF-IDF statistics are shared across the CV folds — confirm
    that this is acceptable for model comparison.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``.

    Returns:
        A tuple ``(cv_lower, holdout)``. ``cv_lower`` is the mean minus the
        standard deviation of the 3-fold balanced-accuracy scores on the
        training split; ``holdout`` is the balanced accuracy on the held-out
        split.
    """
    cv_scores = cross_val_score(
        clf,
        X_train,
        y_train,
        scoring="balanced_accuracy",
        cv=3,
        n_jobs=-1,
    )
    cv_lower = cv_scores.mean() - cv_scores.std()

    # Fit on the whole training split before scoring the held-out rows.
    clf.fit(X_train, y_train)
    holdout_predictions = clf.predict(X_test)
    holdout = balanced_accuracy_score(y_test, holdout_predictions)

    return cv_lower, holdout

In [23]:
# Baseline: logistic regression with the library's default settings.
lr = LogisticRegression()

# Displays (CV mean - CV std, held-out balanced accuracy); evaluate() also
# leaves `lr` fitted on X_train.
evaluate(lr)

(0.9671143452543614, 0.9712997989545638)

In [24]:
# Model used for the submission. `lr` was already fitted on the training split
# inside evaluate(), so nothing is trained here.
# NOTE(review): the model is not refit on the full labelled set before predicting.
clf = lr

# Submission

In [25]:
# Read the unlabelled test set and apply the same cleaning as for training.
df_test = clean_df(pd.read_csv("test.csv"))

# Build the text column, then project it with the vectorizer fitted on the
# training split (transform only — nothing is re-fitted on test data).
features = vectorizer.transform(prep_features(df_test))

In [26]:
# Predict a label for every test row and store it next to the row's id.
df_test["class"] = clf.predict(features)

In [28]:
# Keep only the id and the predicted class for each test row.
submission = df_test[["id", "class"]].copy()

# Write the result without the DataFrame index column.
submission.to_csv("submission.csv", index=False)