## NBSVM

In [0]:
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack
import gc
import os

Кроме основного датасета соревнования, был использован датасет 'Jigsaw train multilingual comments (Google API)' с переводами обучающей выборки на другие языки.

In [0]:
# Loading data

# Seed for the row subsample, so that Restart & Run All rebuilds the same
# training set (the original call to .sample() was unseeded).
SEED = 42
# Number of rows kept after pooling the English and translated training sets.
N_TRAIN_SAMPLES = 300000

ORIGINAL_DIR = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"
TRANSLATED_DIR = "/kaggle/input/jigsaw-train-multilingual-coments-google-api"
TRAIN_COLS = ['comment_text', 'lang', 'toxic']


def load_train_part(path, lang):
    """Read one training CSV, tag every row with `lang`, keep only TRAIN_COLS.

    Dropping the unused columns right after reading keeps peak memory lower
    than holding seven full frames until the final concat.
    """
    return pd.read_csv(path).assign(lang=lang)[TRAIN_COLS]


# English originals first, then the translated copies (same order as before).
train_parts = [load_train_part(f"{ORIGINAL_DIR}/jigsaw-toxic-comment-train.csv", 'en')]
for lang_code in ['es', 'tr', 'fr', 'pt', 'ru', 'it']:
    train_parts.append(
        load_train_part(
            f"{TRANSLATED_DIR}/jigsaw-toxic-comment-train-google-{lang_code}-cleaned.csv",
            lang_code,
        )
    )

# Pool all languages, draw a reproducible random subsample and renumber rows 0..n-1.
train = (
    pd.concat(train_parts)
    .sample(n=N_TRAIN_SAMPLES, random_state=SEED)
    .reset_index(drop=True)
)

# The per-language frames are no longer needed; release them before vectorizing.
del train_parts
gc.collect()

0

In [0]:
# Competition test set that predictions are produced for.
test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv'
)

# Submission template; its "id" column is reused when writing the final file.
subm = pd.read_csv(
    filepath_or_buffer='/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv'
)

In [0]:
# Preview the first rows of the pooled training frame (text, language tag, label).
train.head()

Unnamed: 0,comment_text,lang,toxic
0,Lütfen tahrip etmeyin.,tr,0
1,=== Pakistan Army === \n I guess your beloved ...,en,0
2,The comment directly above this one are from a...,en,0
3,Ogstrokes and 24.239.149.9 are the same person...,en,0
4,": Da parte mia, dubito piuttosto che otterrest...",it,0


Пример данных

In [0]:
# Show the full text of the comment stored under row label 2.
train.loc[2, 'comment_text']

'The comment directly above this one are from a very strange detractor of mine at cplsanchez.infoa fetish fan site authored by a somewhat disturbed individual who has an unrequited attraction to me.  This commentator was banned from this site under several names and is quite consistent about placing the same comment anywhere I write.  The commentator above insists on comparing the standards for entering the military with the FDA standards for donating blood, a ludicrous comparison by someone who has no working knowledge of the military.  I invite the contributors to look at the amateurish Cplsanchez.info, because it shows what the Matt Sanchez should not becomea childish hate site.  \n\nI was recently the subject of a Fox News interview:  http://www.foxnews.com/video/index.html?playerId=videolandingpage&streamingFormat;=FLASH&referralObject;=8083220&referralPlaylistId;=7f634ca75753642edb5e38bcd9b77f712d735ea8'

In [0]:
# Target columns to model; the prediction loop fits one classifier per entry.
label_cols = ['toxic']
# Summary statistics of the numeric columns (here: the binary 'toxic' label).
train.describe()

Unnamed: 0,toxic
count,300000.0
mean,0.096277
std,0.294971
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


Заполнение пропусков в трейне и тесте

In [0]:
# Replace missing texts with a placeholder token so the vectorizer only ever
# sees strings. The result is assigned back to the column instead of calling
# fillna(inplace=True) on `df[col]`: that form operates on an intermediate
# object, triggers a chained-assignment warning in pandas 2.x and leaves the
# DataFrame unchanged once Copy-on-Write is enabled.
train['comment_text'] = train['comment_text'].fillna("unknown")
test['content'] = test['content'].fillna("unknown")

Обработка комментариев

In [0]:
import re, string

# Characters that must become stand-alone tokens: ASCII punctuation plus a few
# typographic symbols. string.punctuation is passed through re.escape() because
# it contains `\`, `]`, `^` and `-`, all of which are special inside a
# character class; left unescaped, the `\]` pair is parsed as an escaped
# bracket and a literal backslash is never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
  """Split `s` into whitespace-separated tokens, isolating punctuation.

  Every character matched by `re_tok` is surrounded with spaces first, so each
  punctuation mark becomes its own token; the string is then split on
  whitespace. An empty or whitespace-only string yields an empty list.
  """
  return re_tok.sub(r' \1 ', s).split()

In [0]:
n = train.shape[0]

# Word uni- and bi-gram TF-IDF features built with the custom `tokenize`.
# The on/off options are passed as real booleans: scikit-learn's parameter
# validation (1.2+) rejects the integer 1 for use_idf / smooth_idf /
# sublinear_tf with an InvalidParameterError.
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                      strip_accents='unicode', use_idf=True,
                      smooth_idf=True, sublinear_tf=True)


# Learn vocabulary and IDF weights on the training texts only, then reuse the
# fitted vectorizer for the test texts so both matrices share the same columns.
trn_term_doc = vec.fit_transform(train['comment_text'])
test_term_doc = vec.transform(test['content'])


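Уравнение наивного Байеса: сглаженная сумма значений признака $f$ по документам класса $y$, $p(f \mid y) = \dfrac{1 + \sum_{i:\, y_i = y} x_{i,f}}{1 + |\{i : y_i = y\}|}$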

In [0]:
def pr(y_i, y):
    """Return the add-one smoothed per-feature total for rows labelled `y_i`.

    Reads the module-level feature matrix `x`: the rows whose label in `y`
    equals `y_i` are summed column-wise, 1 is added to every column total, and
    the result is divided by the number of selected rows plus 1.
    """
    class_mask = (y == y_i)
    feature_totals = x[class_mask].sum(0)
    return (feature_totals + 1) / (class_mask.sum() + 1)

In [0]:
# Short aliases used by pr() / get_mdl() and by the prediction loop.
x, test_x = trn_term_doc, test_term_doc

Модель. Здесь были проведены эксперименты с LogisticRegression: разные солверы, количество итераций, параметр dual; для liblinear также была попытка использовать L1-регуляризацию. Лучший результат дала текущая комбинация. В статье написано, что L2-регуляризация показывает себя лучше; в LogisticRegression она используется по умолчанию.

(https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf)

In [0]:
def get_mdl(y):
    """Fit the Naive-Bayes-weighted logistic regression for one label column.

    `y` is a pandas object exposing `.values` with the 0/1 labels for the rows
    of the module-level training matrix `x`. Returns a tuple
    `(fitted_model, r)`, where `r` is the log of the ratio `pr(1, y) / pr(0, y)`
    that must also be applied to any matrix passed to the model later.
    """
    labels = y.values
    # Per-feature log ratio between the positive and negative class statistics.
    log_ratio = np.log(pr(1, labels) / pr(0, labels))
    # Scale every feature column by its log ratio before fitting.
    scaled_features = x.multiply(log_ratio)
    clf = LogisticRegression(C=4, solver='liblinear', dual=True, max_iter=300)
    clf.fit(scaled_features, labels)
    return clf, log_ratio

Предсказание и сабмит

In [0]:
# One column of predicted probabilities per label, one row per test comment.
preds = np.zeros((len(test), len(label_cols)))

for col_idx, label in enumerate(label_cols):
    model, log_ratio = get_mdl(train[label])
    # Test features get the same log-ratio scaling that was used for training.
    test_scaled = test_x.multiply(log_ratio)
    # Column 1 of predict_proba is the probability of the positive class.
    preds[:, col_idx] = model.predict_proba(test_scaled)[:, 1]

In [0]:
# Ids come from the submission template; predictions are attached column-wise.
submid = pd.DataFrame({'id': subm["id"]})
pred_frame = pd.DataFrame(preds, columns=label_cols)
submission = pd.concat([submid, pred_frame], axis=1)

# Write the file without the pandas index column.
submission.to_csv('submission.csv', index=False)

Public LB : 0.8701