In [None]:
# work in progress, current best score: 0.86659

In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD

from xgboost import XGBClassifier

In [17]:
# Global random seed, passed as `random_state` to keep runs reproducible.
seed = 42

# Data prep

In [18]:
def clean_df(df):
    """Return a copy of ``df`` whose missing titles/contents are empty strings.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely and the raw data stays available to the caller.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains at least the "title" and "content" columns.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns and index; NaN values in "title"
        and "content" are replaced by "". Other columns are not modified.

    Raises
    ------
    KeyError
        If "title" or "content" is missing from ``df``.
    """
    cleaned = df.copy()
    cleaned[["title", "content"]] = cleaned[["title", "content"]].fillna(value="")

    return cleaned

def prep_features(df: pd.DataFrame):
    """Build the text feature by joining each row's title and content.

    The title and content are concatenated with a single space and the
    result is stripped of leading/trailing whitespace. The combined text is
    stored on ``df`` itself as a "text" column (the frame is modified in
    place) and that column is returned.
    """
    joined = df["title"] + " " + df["content"]
    df["text"] = joined.str.strip()

    return df["text"]

In [19]:
# Load the labelled data and blank out missing titles/contents.
df = clean_df(pd.read_csv("train.csv"))

# Despite its name, df_train is the Series of combined "text" strings
# returned by prep_features, not a DataFrame.
df_train = prep_features(df)

In [20]:
# Peek at the first few combined title + content strings.
df_train.head()

0    PSD în alertă Prăbușirea PSD de la altitudinea...
1    În amintirea Vioricăi, milioane de români beau...
2    Dramă! Când credea că nu se poate mai rău, un ...
3    Spania - România, 5-0. „Tricolorii”, îngenunch...
4    Campanie electorală, veselie generală Toate cr...
Name: text, dtype: object

# EDA

In [21]:
# Count missing values per column, largest count first.
df.isna().sum().sort_values(ascending=False)

id         0
title      0
content    0
class      0
text       0
dtype: int64

# Model selection

In [22]:
# Hold out 33% of the labelled rows for a final check; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(df_train, df["class"], test_size=0.33, random_state=seed)

In [23]:
def evaluate(clf):
    """Score ``clf`` by cross-validation and on the hold-out split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``, so it reflects whatever those names are bound to at call
    time. As a side effect ``clf`` is fitted on the full ``X_train``.

    Returns
    -------
    tuple
        ``(cv_score, holdout_score)`` where ``cv_score`` is the mean minus
        the standard deviation of the 3-fold balanced accuracy on the
        training split, and ``holdout_score`` is the balanced accuracy on
        the hold-out split.
    """
    cv_scores = cross_val_score(
        clf,
        X_train,
        y_train,
        cv=3,
        scoring="balanced_accuracy",
        n_jobs=-1,
    )
    # Penalise unstable models: subtract the fold-to-fold spread from the mean.
    pessimistic_cv = cv_scores.mean() - cv_scores.std()

    clf.fit(X_train, y_train)
    holdout_pred = clf.predict(X_test)
    holdout_score = balanced_accuracy_score(y_test, holdout_pred)

    return pessimistic_cv, holdout_score

In [None]:
# TF-IDF representation of the combined title + content text.
vectorizer = TfidfVectorizer(
    lowercase=True,
    min_df=10,  # drop terms that occur in fewer than 10 documents
    ngram_range=(1, 1),  # author's note: increase to (1, 4) for better score
    max_features=5000,  # vocabulary cap; needs finetuning
    use_idf=True,
    smooth_idf=True,
    norm="l2",
)

# Learn the vocabulary and IDF weights from the training split only,
# then reuse them unchanged for the hold-out split.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
print(f"{X_train_vec.shape[1]} total vec features")

# Reduce the sparse TF-IDF matrix to 1500 dense components (needs finetuning).
# random_state pins the randomized SVD solver so repeated runs give the same
# components, and therefore the same downstream scores.
svd = TruncatedSVD(n_components=1500, random_state=seed)

# NOTE: this rebinds X_train / X_test from raw text to SVD features, which is
# what evaluate() reads. Re-run the train/test split cell before re-running
# the vectorizer cell, otherwise the vectorizer receives these dense arrays.
X_train = svd.fit_transform(X_train_vec)
X_test = svd.transform(X_test_vec)

5000 total vec features


In [30]:
# Baseline: logistic regression with the library's default settings.
lr = LogisticRegression()

evaluate(lr)

(0.968222249824302, 0.9698601125854442)

In [31]:
# Small feed-forward network with two hidden layers of 64 and 8 units.
# Seeded because its weight initialisation and batch shuffling are random;
# without a fixed random_state the scores and the final submission change
# from run to run.
mlp = MLPClassifier(hidden_layer_sizes=(64, 8), random_state=seed)

evaluate(mlp)

(0.9708281740148988, 0.9742407318053881)

In [33]:
# Gradient-boosted trees with the library's default settings.
xgb = XGBClassifier()

evaluate(xgb)

(0.958501111902132, 0.9606176115802172)

In [34]:
# Model used to produce the submission. In the recorded outputs above the MLP
# has the highest CV and hold-out scores of the three candidates.
clf = mlp

# Submission

In [35]:
# Load the unlabelled data and clean it the same way as the training data.
df_test = clean_df(pd.read_csv("test.csv"))

# Push the test text through the already-fitted vectorizer and SVD
# (transform only; nothing is refitted on the test data).
features = svd.transform(vectorizer.transform(prep_features(df_test)))

In [36]:
# Predict a label for every test row with the selected model and store it next to the row's id.
df_test["class"] = clf.predict(features)

In [37]:
# Keep only the id and the predicted class, in that column order.
submission = df_test.loc[:, ["id", "class"]]

# Write without the index so the file contains just the two columns.
submission.to_csv("submission.csv", index=False)