In [13]:
pip install emoji


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.3[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
import numpy as np
import re
import emoji

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [15]:
train_df = pd.read_csv('данные/train.csv')
test_df  = pd.read_csv('данные/test.csv')

print(train_df['label'].value_counts())
train_df.head()

label
0    75817
1    74183
Name: count, dtype: int64


Unnamed: 0,text,label
0,Taano ta ako pa?,0
1,Pag - isipan an halimbawa ni Twerter na sarong...,1
2,□ Taano ta kaipuhan pa nin dakol na tawo an so...,1
3,"An mga Pag - eksamin sa Paayadya, Asin an Pag ...",1
4,Si Andrew Cybbbber an nasa itaas na parte kan ...,1


## Очистка текстов

в данных используется филиппинский язык. это low recource language, то есть язык, на котором очень мало академических работ, размеченных датасетов, словарей и тд. поэтому обычные преобразования не сделать: нет мешка стоп-слов + проблемы с лемматизацией (в этом языке очень важны суффиксы, приставки, инфкисы (ставки внути слова), поэтому вручную не выйдет, а в открытом доступе ничего нет). тексты просто очищены от лишних символов и приведены к нижнему регистру. 

In [16]:
def clean_for_tfidf(text):
    text = text.lower()

    # удаляем ссылки
    text = re.sub(r"http\S+|www\S+", " ", text)

    # упоминаная и теги
    text = re.sub(r"@\w+|#\w+", " ", text)

    # эмоджи
    text = emoji.replace_emoji(text, replace=' ')

    # оставим латиницу, филиппинские буквы, пробелы
    text = re.sub(r"[^a-záéíóúñ\s]", " ", text)

    #  лишние пробелы
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [17]:
train_df["text"] = train_df["text"].apply(clean_for_tfidf)

In [18]:
train_df.head()

Unnamed: 0,text,label
0,taano ta ako pa,0
1,pag isipan an halimbawa ni twerter na sarong m...,1
2,taano ta kaipuhan pa nin dakol na tawo an sobr...,1
3,an mga pag eksamin sa paayadya asin an pag ada...,1
4,si andrew cybbbber an nasa itaas na parte kan ...,1


In [29]:
# TF-IDF преобразование 

X_text = train_df["text"]
y = train_df["label"]

tfidf = TfidfVectorizer(
    max_features=2000000,
    min_df=2,
    max_df=0.98,
    ngram_range=(1, 3),
    sublinear_tf=True
)

X = tfidf.fit_transform(X_text)

X.shape

(150000, 241597)

## лог рег

In [20]:
logreg_params = {
    "C": [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.1, 1.2, 1.3, 1.5, 1.8, 2.0],
    "class_weight": ["balanced", None],
    "penalty": ["l2"],
    "solver": ["saga"],
    "max_iter": [40000]
}

logreg = LogisticRegression(n_jobs=-1)

logreg_grid = GridSearchCV(
    estimator=logreg,
    param_grid=logreg_params,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=2
)

logreg_grid.fit(X, y)

print("Лучшие параметры логрега:", logreg_grid.best_params_)
print("Лучший ROC-AUC логрега:", logreg_grid.best_score_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END C=0.1, class_weight=None, max_iter=40000, penalty=l2, solver=saga; total time=   2.8s
[CV] END C=0.1, class_weight=balanced, max_iter=40000, penalty=l2, solver=saga; total time=   2.8s
[CV] END C=0.1, class_weight=None, max_iter=40000, penalty=l2, solver=saga; total time=   2.8s
[CV] END C=0.1, class_weight=balanced, max_iter=40000, penalty=l2, solver=saga; total time=   2.8s
[CV] END C=0.1, class_weight=balanced, max_iter=40000, penalty=l2, solver=saga; total time=   2.9s
[CV] END C=0.1, class_weight=None, max_iter=40000, penalty=l2, solver=saga; total time=   2.9s
[CV] END C=0.1, class_weight=balanced, max_iter=40000, penalty=l2, solver=saga; total time=   2.9s
[CV] END C=0.1, class_weight=balanced, max_iter=40000, penalty=l2, solver=saga; total time=   3.0s
[CV] END C=0.3, class_weight=balanced, max_iter=40000, penalty=l2, solver=saga; total time=   2.5s
[CV] END C=0.1, class_weight=None, max_iter=40000, penalty=

In [21]:
logreg_final = LogisticRegression(
    C=1.0,
    penalty="l2",
    solver="saga",
    class_weight='balanced',
    max_iter=4000,
    n_jobs=-1
)

logreg_final.fit(X, y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'saga'
,max_iter,4000


In [22]:
X_test = tfidf.transform(test_df["text"])
logreg_test_pred = logreg_final.predict_proba(X_test)[:, 1]

## LinearSVC + CalibratedClassifierCV

In [23]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score

svm_params = {
    "C": [0.1, 0.12, 0.15, 0.17, 0.19, 0.2, 0.21, 0.23, 0.25, 0.27, 0.3],
}

best_svm_score = 0
best_svm_model = None
best_svm_params = None

kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for C_value in svm_params["C"]:
    fold_scores = []

    base = LinearSVC(C=C_value)

    model = CalibratedClassifierCV(base, cv=3)

    for train_idx, val_idx in kfold.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]

        score = roc_auc_score(y_val, preds)
        fold_scores.append(score)

    mean_score = np.mean(fold_scores)
    print(f"C={C_value}, ROC-AUC={mean_score}")

    if mean_score > best_svm_score:
        best_svm_score = mean_score
        best_svm_params = {"C": C_value}
        best_svm_model = model

print("\nЛучшие параметры SVM:", best_svm_params)
print("Лучший ROC-AUC SVM:", best_svm_score)

C=0.1, ROC-AUC=0.7816717511604899
C=0.12, ROC-AUC=0.7822979027117274
C=0.15, ROC-AUC=0.7826884263151312
C=0.17, ROC-AUC=0.7827117974411262
C=0.19, ROC-AUC=0.7826143541446557
C=0.2, ROC-AUC=0.7825348752397853
C=0.21, ROC-AUC=0.7824369295110768
C=0.23, ROC-AUC=0.7821963306101833
C=0.25, ROC-AUC=0.7819142261685521
C=0.27, ROC-AUC=0.7815997970542431
C=0.3, ROC-AUC=0.7810851321222864

Лучшие параметры SVM: {'C': 0.17}
Лучший ROC-AUC SVM: 0.7827117974411262


In [24]:
best_C = best_svm_params["C"]

svm_base = LinearSVC(C=best_C)
svm_final = CalibratedClassifierCV(svm_base, cv=3)

svm_final.fit(X, y)
svm_test_pred = svm_final.predict_proba(X_test)[:, 1]