In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_val_predict
from scipy.sparse import hstack, save_npz, load_npz
from sklearn.externals import joblib
from tqdm import tqdm_notebook, tqdm

# Label columns; a separate classifier is trained for each one further down.
class_names = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Read both splits (train first, then test), replacing every missing value
# with a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('./data/train.csv.zip', './data/test.csv.zip')
)

# Raw comment text of each split, plus their concatenation (train rows first).
train_text, test_text = train['comment_text'], test['comment_text']
all_text = pd.concat([train_text, test_text])

In [2]:
%%time
# Word-level TF-IDF over unigrams and bigrams, capped at 12000 features.
# fit() returns the vectorizer itself, so construction and fitting are chained.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=12000,
).fit(all_text)

# The vocabulary is learned from train and test text together; each split is
# then transformed separately (train first, then test).
train_word_features, test_word_features = (
    word_vectorizer.transform(texts) for texts in (train_text, test_text)
)

# Persist the fitted vectorizer, then drop the in-memory reference.
joblib.dump(word_vectorizer, "./additional_matiriels/word_vectorizer_(1,2)_max12000")
del word_vectorizer

CPU times: user 1min 27s, sys: 1.25 s, total: 1min 28s
Wall time: 1min 29s


In [3]:
%%time
# Character-level TF-IDF over 2- to 6-character n-grams, capped at 60000
# features.
# NOTE: `stop_words` is intentionally not passed. scikit-learn applies stop
# words only when analyzer='word', so with analyzer='char' the argument was
# silently ignored (and emits a warning on newer scikit-learn releases).
# Dropping it does not change the extracted features.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=60000)

# The vocabulary is learned from train and test text together; each split is
# then transformed separately.
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

# Persist the fitted vectorizer, then drop the in-memory reference.
joblib.dump(char_vectorizer,"./additional_matiriels/char_vectorizer_(2,6)_max60000")
del char_vectorizer


CPU times: user 12min 23s, sys: 10.8 s, total: 12min 34s
Wall time: 12min 34s


In [4]:
# Concatenate the char and word TF-IDF blocks column-wise (char columns first).
# format='csr' is requested explicitly: hstack otherwise returns a COO matrix,
# which does not support row indexing, so every later cross-validation split
# or fit would first have to convert it.
train_features = hstack([train_char_features, train_word_features], format='csr')
test_features = hstack([test_char_features, test_word_features], format='csr')

# Cache the combined matrices on disk so the modelling cells can reload them
# without recomputing the TF-IDF features.
save_npz("./additional_matiriels/train_features_max_iter12_60.npz",train_features)
save_npz("./additional_matiriels/test_features_max_iter12_60.npz",test_features)

In [2]:
%%time
# Reload the cached sparse feature matrices (train first, then test).
train_features, test_features = map(load_npz, (
    './additional_matiriels/train_features_max_iter12_60.npz',
    './additional_matiriels/test_features_max_iter12_60.npz',
))

CPU times: user 19.9 s, sys: 1.13 s, total: 21 s
Wall time: 21.2 s


In [3]:
%%time
# Output frames keyed by row id: `submission` collects test-set probabilities,
# `log_for_stack` collects out-of-fold train-set probabilities.
submission = pd.DataFrame({'id': test['id']})
log_for_stack = pd.DataFrame({'id': train['id']})

# One unshuffled, deterministic 5-fold splitter, built once outside the loop.
# `random_state` is intentionally omitted: it has no effect unless
# shuffle=True, and newer scikit-learn releases raise a ValueError when it is
# set without shuffling. The resulting folds are the same as before.
cv = StratifiedKFold(n_splits=5)

for class_name in tqdm(class_names):
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag', random_state=0)

    # Out-of-fold class probabilities (not scores, despite the variable name);
    # column 1 is kept -- presumably P(label == 1), assuming 0/1 labels.
    cv_score = cross_val_predict(
        classifier, train_features, train_target,
        cv=cv, n_jobs=1, method="predict_proba", verbose=1)
    log_for_stack[class_name] = cv_score[:, 1]

    # Refit on the full training set to score the test set.
    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

# NOTE(review): these filenames say "15_75" while the feature files loaded
# above are named "12_60" -- confirm which feature set these outputs
# correspond to before stacking.
submission.to_csv('./submits/log_reg_max_iter15_75_sag_submit.csv', index=False)
log_for_stack.to_csv('./submits/log_reg_max_iter15_75_sag_stack5.csv', index=False)

  0%|          | 0/6 [00:00<?, ?it/s][Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.6min finished
 17%|█▋        | 1/6 [03:54<19:30, 234.01s/it][Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.7min finished
 33%|███▎      | 2/6 [07:59<15:59, 239.92s/it][Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.5min finished
 50%|█████     | 3/6 [11:49<11:49, 236.62s/it][Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.5min finished
 67%|██████▋   | 4/6 [15:35<07:47, 233.77s/it][Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.8min finished
 83%|████████▎ | 5/6 [19:42<03:56, 236.44s/it][Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.6min finished
100%|██████████| 6/6 [23:38<00:00, 236.34s/it]


CPU times: user 22min 57s, sys: 42.2 s, total: 23min 40s
Wall time: 23min 40s


In [5]:
# Preview only the first rows rather than rendering the whole frame.
log_for_stack.head(10)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,0.009896,0.000854,0.003384,0.000496,0.003840,0.000792
1,000103f0d9cfb60f,0.022433,0.003389,0.004981,0.000909,0.009449,0.001602
2,000113f07ec002fd,0.029071,0.001118,0.010443,0.000297,0.008090,0.000341
3,0001b41b1c6bb37e,0.000716,0.000430,0.001124,0.000262,0.000816,0.000658
4,0001d958c54c6e35,0.036057,0.003053,0.026331,0.000801,0.039092,0.004587
5,00025465d4725e87,0.002664,0.000628,0.002496,0.000399,0.003392,0.000914
6,0002bcb3da6cb337,0.981142,0.107842,0.898268,0.003753,0.777409,0.003244
7,00031b1e95af7921,0.029756,0.001498,0.009272,0.001986,0.006299,0.001320
8,00037261f536c51d,0.052280,0.000453,0.010782,0.000351,0.013990,0.001076
9,00040093b2687caa,0.015117,0.001700,0.006490,0.000649,0.006354,0.001852
