In [1]:
import pandas as pd
from util import labels

Using TensorFlow backend.


In [2]:
# Read ./data/train_cleaned.csv into the `train` DataFrame. Later cells read
# its 'comment_text' column and one target column per entry in `labels`.
train = pd.read_csv('./data/train_cleaned.csv')

In [3]:
# Read ./data/test_cleaned.csv into the `test` DataFrame. Later cells read
# its 'comment_text' column and use its 'id' column as the submission index.
test = pd.read_csv('./data/test_cleaned.csv')

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
# Vocabulary-size settings tried for each vectorizer family; each value is
# passed straight through as `max_features=` (None is the vectorizer default).
max_features = [50000, None]

In [6]:
# Candidate TF-IDF vectorizers, one per entry in `max_features`:
#   * tf_idf_vects_1_1 -- default n-gram range
#   * tf_idf_vects_1_2 -- ngram_range=(1, 2)
# Both families use stop_words='english'.
# A CountVectorizer (1, 2)-gram family is not part of the candidate list.
tf_idf_vects_1_1 = [
    TfidfVectorizer(stop_words='english', max_features=limit)
    for limit in max_features
]
tf_idf_vects_1_2 = [
    TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=limit)
    for limit in max_features
]

# Search order: all default-range vectorizers first, then the (1, 2) ones.
vects = []
vects.extend(tf_idf_vects_1_1)
vects.extend(tf_idf_vects_1_2)

In [7]:
# Values tried for LogisticRegression's `C` argument in the search loop.
C = [1,1.2,5]

In [8]:
# Per-label record of the best configuration found so far. The search loop
# fills each entry with the keys 'score', 'model', 'vect' and 'C'.
best = {}

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [10]:
# Exhaustive search over every (vectorizer, label, C) combination. For each
# label, `best[label]` keeps the configuration with the highest mean 5-fold
# ROC AUC seen so far, across all vectorizers.
for vect in vects:
    print('Fitting vectorizer {}'.format(vect))
    # The vectorizer is fitted on train and test text together (no labels are
    # involved). pd.concat is used because Series.append was removed in
    # pandas 2.0; the resulting Series is the same.
    vect.fit(pd.concat([train['comment_text'], test['comment_text']]))

    train_vect = vect.transform(train['comment_text'])
    print('Fitted vectorizer')

    for label in labels:
        print('Now finding best model for {}'.format(label))

        # Baseline entry so the comparison and the summary print below work
        # the first time a label is seen, even if `C` is empty.
        best.setdefault(label, {'score': 0})

        for c in C:
            model = LogisticRegression(C=c, class_weight='balanced')
            score = cross_val_score(
                model, X=train_vect, y=train[label],
                scoring='roc_auc', cv=5, n_jobs=-1
            ).mean()

            if best[label]['score'] < score:
                # `model` is stored as constructed here; a later cell calls
                # .fit on it with the full training matrix.
                best[label] = {
                    'score': score,
                    'model': model,
                    'vect': vect,
                    'C': c
                }

        print('Found best model with roc={}'.format(best[label]['score']))
        print('#'*80)
        

Fitting vectorizer TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
Fitted vectorizer
Now finding best model for toxic
Found best model with roc=0.9699397257007154
################################################################################
Now finding best model for severe_toxic
Found best model with roc=0.9836755439466964
################################################################################
Now finding best model for obscene
Found best model with roc=0.9852337220912831
################################################################################
Now finding best model for

In [11]:
# Print the winning configuration for every label, each followed by an
# 80-character '#' rule.
for label in labels:
    summary = 'Best model for {} is {}'.format(label, best[label])
    print(summary, '#'*80, sep='\n')

Best model for toxic is {'C': 5, 'score': 0.97067844776612711, 'model': LogisticRegression(C=5, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False), 'vect': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)}
################################################################################
Best model for severe_toxic is {'C': 1, 'score': 0.98567020805857575, 'model': LogisticRegression(C=1, class_weight='b

In [12]:
# For each label: vectorize test and train text with that label's best
# vectorizer, fit the stored model on the full training matrix, and keep
# column 1 of predict_proba on the test matrix.
preds = dict()
for label in labels:
    chosen = best[label]
    test_vect = chosen['vect'].transform(test['comment_text'])
    train_vect = chosen['vect'].transform(train['comment_text'])
    model = chosen['model'].fit(train_vect, train[label])
    test_proba = model.predict_proba(test_vect)
    preds[label] = test_proba[:, 1]

In [13]:
# Display the dict of per-label prediction arrays built in the previous cell.
preds

{'identity_hate': array([ 0.76865995,  0.0317534 ,  0.04127174, ...,  0.02257207,
         0.11449891,  0.09996913]),
 'insult': array([ 0.985198  ,  0.0153697 ,  0.04427762, ...,  0.0160138 ,
         0.02726532,  0.85729073]),
 'obscene': array([ 0.99932671,  0.00902373,  0.03290908, ...,  0.01556612,
         0.02608201,  0.9639512 ]),
 'severe_toxic': array([ 0.70404019,  0.02460571,  0.03418322, ...,  0.02202048,
         0.02050449,  0.02859875]),
 'threat': array([ 0.10164643,  0.00431405,  0.00414325, ...,  0.00320748,
         0.00657337,  0.01017341]),
 'toxic': array([ 0.99935448,  0.01540551,  0.08006981, ...,  0.02047427,
         0.03399454,  0.99549984])}

In [14]:
# Assemble the per-label prediction arrays into one DataFrame (one column per
# key in `preds`), with rows indexed by the test set's 'id' column.
subm = pd.DataFrame(preds, index=test['id'])

In [15]:
# Preview the first rows of the submission frame.
subm.head()

Unnamed: 0_level_0,identity_hate,insult,obscene,severe_toxic,threat,toxic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.76866,0.985198,0.999327,0.70404,0.101646,0.999354
0000247867823ef7,0.031753,0.01537,0.009024,0.024606,0.004314,0.015406
00013b17ad220c46,0.041272,0.044278,0.032909,0.034183,0.004143,0.08007
00017563c3f7919a,0.006561,0.004518,0.004212,0.013951,0.002284,0.006268
00017695ad8997eb,0.021874,0.03646,0.023301,0.024765,0.005588,0.064843


In [16]:
# Write the submission frame to CSV. to_csv writes the index by default, so
# the 'id' index set when `subm` was built becomes the first column.
# NOTE(review): assumes ./submission-tmp/ already exists -- confirm.
subm.to_csv('./submission-tmp/logistic_bl.csv')