In [967]:
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score, make_scorer

In [968]:
# Random seed passed as `random_state` to the train/test splits below.
seed = 42

# Data preparation

In [969]:
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Frame-level cleaning hook; currently returns ``df`` unchanged."""
    return df

def clean_text(text):
    """Normalise one sample string before vectorisation.

    The text is lower-cased, the ``$ne$`` placeholder is rewritten as the
    single token ``entitate_necunoscuta`` and ``â`` is mapped to ``î``.
    Replacements are applied in the listed order.
    """
    substitutions = (
        ("$ne$", "entitate_necunoscuta"),
        ("â", "î"),
        # No-op after lower-casing (str.lower already turned "Â" into "â");
        # kept as a safeguard.
        ("Â", "Î"),
    )

    normalised = text.lower()
    for old, new in substitutions:
        normalised = normalised.replace(old, new)
    return normalised

def prep_features(df: pd.DataFrame):
    """Return the cleaned ``sample`` text column of ``df`` as a Series.

    Each value is whitespace-stripped and then passed through ``clean_text``.
    The input frame itself is left untouched.
    """
    stripped = df["sample"].str.strip()
    return stripped.apply(clean_text)

In [970]:
# Load the training set. `df` keeps the label columns read later
# (`dialect`, `category`); `df_train` holds only the cleaned text samples.
df = pd.read_csv("train_data.csv")
df = clean_df(df)

df_train = prep_features(df)

In [971]:
# Preview the first cleaned samples.
df_train.head()

0    entitate_necunoscuta entitate_necunoscuta fin ...
1    asigurator săgeată, predicție cover înfrîngă a...
2    notificările asigurator importanţei entitate_n...
3    entitate_necunoscuta entitate_necunoscuta priv...
4    entitate_necunoscuta entitate_necunoscuta 1 în...
Name: sample, dtype: object

# EDA

In [972]:
# Missing values per column, largest count first.
df.isna().sum().sort_values(ascending=False)

datapointID    0
sample         0
dialect        0
category       0
dtype: int64

# Model selection - dialect

In [973]:
# Hold out 33% of the samples. Dialect labels are shifted down by 1 here and
# shifted back with +1 when the submission predictions are produced.
X_train, X_test, y_train, y_test = train_test_split(
    df_train,
    df["dialect"] - 1,
    test_size=0.33,
    random_state=seed,
)

In [974]:
# TF-IDF over 1- to 3-grams. min_df=2 drops terms seen in fewer than two
# documents; max_df=0.9 drops terms present in more than 90% of documents.
vectorizer = TfidfVectorizer(
    min_df=2, max_df=0.9, ngram_range=(1, 3)
)

# The vectorizer is fitted on the training split only; the held-out split is
# transformed with the already-fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [975]:
# `evaluate` reads X_train / X_test as globals, so point them at the matrices.
# Caution: after this cell X_train / X_test no longer hold raw text, so the
# vectorizer cell cannot be re-run without re-running the split cell first.
X_train, X_test = X_train_vec, X_test_vec

In [976]:
def evaluate(clf, is_multiclass=False):
    """Score ``clf`` with 3-fold cross-validation and on the held-out split.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    as they are bound at call time. As a side effect, ``clf`` is left fitted
    on ``X_train`` / ``y_train``.

    NOTE(review): if the globals hold features that were vectorized before
    this call (as in this notebook), the cross-validation folds share the
    same fitted vocabulary, so the CV estimate may be slightly optimistic.

    Parameters
    ----------
    clf : estimator
        scikit-learn classifier to evaluate.
    is_multiclass : bool, default False
        When True, F1 is computed with ``average='weighted'``; otherwise
        ``f1_score`` is used with its default averaging.

    Returns
    -------
    tuple of (float, float)
        ``(mean - std of the cross-validated F1 scores,
        F1 on the held-out split)``.
    """
    if is_multiclass:
        scorer = make_scorer(f1_score, average='weighted')
    else:
        scorer = make_scorer(f1_score)

    cv_scores = cross_val_score(clf, X_train, y_train, scoring=scorer, cv=3)

    clf.fit(X_train, y_train)
    holdout_score = scorer(clf, X_test, y_test)

    # float() accepts both NumPy scalars and plain Python floats, whereas
    # `.item()` exists only on the former and fails if the metric returns a
    # built-in float.
    return float(cv_scores.mean() - cv_scores.std()), float(holdout_score)

In [977]:
# Multinomial Naive Bayes with a small smoothing parameter (alpha=0.001).
nb = MultinomialNB(alpha=0.001)

# Displays (CV mean - std, held-out F1); also leaves `nb` fitted.
evaluate(nb)

(0.841817021442793, 0.8416075650118203)

In [978]:
# Linear SVM. random_state is pinned to the notebook seed because the
# underlying solver uses a pseudo-random generator while fitting, so an
# unseeded model can give slightly different scores between runs.
svc = LinearSVC(C=50, random_state=seed)

# Displays (CV mean - std, held-out F1); also leaves `svc` fitted.
evaluate(svc)

(0.8236386964248699, 0.824390243902439)

In [979]:
# Stack the two base models defined above; n_jobs=-1 lets scikit-learn use
# all available cores when fitting them.
st = StackingClassifier(
    [("nb", nb), ("svc", svc)],
    n_jobs=-1,
)

# Displays (CV mean - std, held-out F1); also leaves `st` fitted.
evaluate(st)

(0.846901105274627, 0.8441247002398081)

In [980]:
# Keep the stacked model as the dialect classifier.
clf_dialect = st

# `evaluate(st)` already fitted this object on the same training split, so
# this call refits on identical data.
clf_dialect.fit(X_train, y_train)

# Model selection - category

In [981]:
# Same samples, test size and seed as the dialect split, now paired with the
# category labels (shifted down by 1; shifted back with +1 at prediction time).
# NOTE(review): the TF-IDF vectorizer fitted in the dialect section is reused
# below. That is only leak-free if this split selects the same training rows —
# presumably true given the identical arguments; confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df_train, df["category"]-1, test_size=0.33, random_state=seed
)

In [982]:
# Transform only: the vectorizer fitted in the dialect section is reused as-is.
X_train_vec, X_test_vec = vectorizer.transform(X_train), vectorizer.transform(X_test)

# `evaluate` reads X_train / X_test as globals, so rebind them to the matrices.
X_train, X_test = X_train_vec, X_test_vec

In [983]:
# A fresh Naive Bayes model for the category task. Rebinding `nb` does not
# touch the object already stored inside the stacked dialect model.
nb = MultinomialNB(alpha=0.008)

# Weighted-average F1 is used for this task (is_multiclass=True).
evaluate(nb, is_multiclass=True)

(0.7182037124183772, 0.7520372950725049)

In [984]:
# Keep the Naive Bayes model as the category classifier.
clf_category = nb

# `evaluate(nb, ...)` already fitted this object on the same training split,
# so this call refits on identical data.
clf_category.fit(X_train, y_train)

# Submission

In [985]:
# Load the test set used for the submission and apply the same cleaning.
df_test = pd.read_csv("test_data.csv")
df_test = clean_df(df_test)

# `features` first holds the cleaned text, then is rebound to its TF-IDF
# matrix produced by the already-fitted vectorizer (transform only).
features = prep_features(df_test)
features = vectorizer.transform(features)

In [986]:
# The +1 undoes the -1 shift applied to the labels when the models were trained.
subtask1 = clf_dialect.predict(features)+1

subtask2 = clf_category.predict(features)+1

In [987]:
def build_subtask(subtask_id, answer, datapoint_ids=None):
    """Build the submission rows for one subtask.

    Parameters
    ----------
    subtask_id : scalar
        Value written to every row of the ``subtaskID`` column.
    answer : array-like
        One prediction per datapoint, in the same order as ``datapoint_ids``.
    datapoint_ids : array-like, optional
        Values for the ``datapointID`` column. When omitted, the module-level
        ``df_test["datapointID"]`` is used.

    Returns
    -------
    pd.DataFrame
        Frame with the columns ``subtaskID``, ``datapointID`` and ``answer``.
    """
    if datapoint_ids is None:
        # Fall back to the notebook-level test frame.
        datapoint_ids = df_test["datapointID"]

    return pd.DataFrame(
        {
            "subtaskID": subtask_id,
            "datapointID": datapoint_ids,
            "answer": answer,
        }
    )

# (subtask id, predictions) pairs, one per subtask.
subtasks = [
    (1, subtask1),
    (2, subtask2)
]

# Stack the per-subtask frames vertically and renumber the rows from 0.
submission = pd.concat([build_subtask(sid, subtask) for sid, subtask in subtasks], ignore_index=True)

In [988]:
# Preview the first submission rows.
submission.head()

Unnamed: 0,subtaskID,datapointID,answer
0,1,1621,2
1,1,177,2
2,1,393,2
3,1,1175,1
4,1,539,1


In [989]:
# Write the submission file without the row index column.
submission.to_csv("submission.csv", index=False)