In [2]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack, coo_matrix
from scipy.sparse import save_npz, load_npz
from sklearn.preprocessing import MaxAbsScaler

In [9]:
# Label columns of the training file; one binary classifier is trained per entry.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read both splits; every missing value in either frame is replaced by a single space.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('inputs/train.csv', 'inputs/test.csv')
)

train_text, test_text = train['comment_text'], test['comment_text']
# Combined corpus: train comments followed by test comments.
all_text = pd.concat([train_text, test_text])

In [3]:
# Word-level TF-IDF: unigrams only, English stop words removed, vocabulary
# capped at the 10,000 most frequent terms.
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
# The vocabulary is learned from train + test text combined (all_text), then
# each split is transformed separately.
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# Fixed: this was `Print(...)`, which raises NameError and aborted the cell
# before the character-level features were built.
print('unigrams Done')

# Character-level TF-IDF: 2- to 6-character n-grams, capped at 20,000 features.
# NOTE(review): stop_words is documented as applying only to analyzer='word',
# so it is presumably a no-op here -- confirm before removing.
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=20000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)
print('ngrams done')




In [14]:
### Features Engineered based on EDA

def add_eda_features(frame):
    """Add hand-crafted text statistics to ``frame`` in place.

    Reads ``frame['comment_text']`` and writes these columns:

    * ``total_length``     -- number of characters in the comment
    * ``capitals``         -- number of upper-case characters
    * ``exc_density``      -- count of '!' divided by the comment length
    * ``caps_vs_length``   -- ``capitals`` divided by ``total_length``
    * ``num_unique_words`` -- number of distinct whitespace-separated tokens

    Returns None so that calling it as the last statement of a cell does not
    render the whole frame. An empty comment raises ZeroDivisionError when
    ``exc_density`` is computed, as in the original code.
    """
    text = frame['comment_text']

    frame['total_length'] = text.apply(len)
    frame['capitals'] = text.apply(
        lambda comment: sum(1 for c in comment if c.isupper()))

    frame['exc_density'] = text.map(lambda comment: comment.count('!') / len(comment))
    # Column-wise true division gives the same float ratio as the previous
    # row-by-row apply(axis=1) without iterating over every row in Python.
    frame['caps_vs_length'] = frame['capitals'] / frame['total_length']

    frame['num_unique_words'] = text.apply(
        lambda comment: len(set(comment.split())))


# Both splits go through the same helper. Previously test['capitals'] was
# computed from train['comment_text'], so test rows received the training
# set's counts (aligned by index) and test['caps_vs_length'] was wrong too.
add_eda_features(train)
add_eda_features(test)



Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,exc_density,capitals,total_length,caps_vs_length,num_unique_words
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0.0,17,264,0.064394,41
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0.008929,8,112,0.071429,17
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0.0,4,233,0.017167,39
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0.0,11,622,0.017685,82
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0.0,2,67,0.029851,13


In [None]:
# Stack the char and word TF-IDF blocks with the engineered columns
# (each reshaped to a single column) into one sparse training matrix.
train_extra_columns = [
    np.array(train[column_name])[:, None]
    for column_name in ('exc_density', 'caps_vs_length', 'num_unique_words')
]
train_features = hstack(
    [train_char_features, train_word_features] + train_extra_columns)


In [None]:
# Stack the char and word TF-IDF blocks with the engineered columns
# (each reshaped to a single column) into one sparse test matrix.
test_extra_columns = [
    np.array(test[column_name])[:, None]
    for column_name in ('exc_density', 'caps_vs_length', 'num_unique_words')
]
test_features = hstack(
    [test_char_features, test_word_features] + test_extra_columns)

In [31]:
# Persist the stacked training matrix so the feature cells need not be re-run.
save_npz(file='inputs/train_sparse.npz', matrix=train_features)

CPU times: user 1min 30s, sys: 3.17 s, total: 1min 33s
Wall time: 1min 38s


In [31]:
# Persist the stacked test matrix so the feature cells need not be re-run.
save_npz(file='inputs/test_sparse.npz', matrix=test_features)

CPU times: user 1min 17s, sys: 2.8 s, total: 1min 20s
Wall time: 1min 24s


In [3]:
# Reload the cached feature matrices from disk (test first, then train).
sparse_paths = ('inputs/test_sparse.npz', 'inputs/train_sparse.npz')
test_features, train_features = map(load_npz, sparse_paths)

<153164x30003 sparse matrix of type '<class 'numpy.float64'>'
	with 115404533 stored elements in Compressed Sparse Row format>

In [5]:
# Fit the max-abs scaling factors once, on the training matrix, and apply the
# same factors to both splits. Previously each split was scaled by its own
# independently fitted scaler, so the same column could be divided by a
# different maximum in train and test, and the classifier's coefficients were
# then applied to inconsistently scaled inputs.
feature_scaler = MaxAbsScaler().fit(train_features)

train_features = feature_scaler.transform(train_features)

test_features = feature_scaler.transform(test_features)

In [6]:
# Bare expression: the notebook renders the test matrix's repr as the cell output.
test_features

<153164x30003 sparse matrix of type '<class 'numpy.float64'>'
	with 115404533 stored elements in Compressed Sparse Row format>

In [7]:
# Bare expression: the notebook renders the training matrix's repr as the cell output.
train_features

<159571x30003 sparse matrix of type '<class 'numpy.float64'>'
	with 136541204 stored elements in Compressed Sparse Row format>

In [10]:
# Seed for the stochastic 'sag' solver. Without it, CV scores and the
# submitted probabilities change from run to run.
RANDOM_STATE = 42

scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})

# One independent binary logistic regression per label column.
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(solver='sag', random_state=RANDOM_STATE)

    # Mean ROC AUC over 3 cross-validation folds on the training matrix.
    cv_score = np.mean(cross_val_score(classifier, train_features, train_target,
                                       cv=3, scoring='roc_auc'))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))

    # Refit on the full training matrix, then score the test set.
    classifier.fit(train_features, train_target)
    # Column 1 of predict_proba is stored as the prediction for this label
    # (presumably the probability of label value 1 -- confirm the targets are 0/1).
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

print('Total CV score is {}'.format(np.mean(scores)))
submission.to_csv('submissions/submission_LR.csv', index=False)







CV score for class toxic is 0.9711354731575356








CV score for class severe_toxic is 0.9770154862153948








CV score for class obscene is 0.9822983290235393




In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and fall back to the vendored copy only
# on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Print the file name each per-class model would be saved under.
for class_name in class_names:
    filename = class_name + "LR.sav"
    print(filename)
    # NOTE(review): persistence is disabled. `model` is not defined in the
    # visible notebook, and the training loop keeps only the last fitted
    # `classifier`, so enabling this needs the per-class estimators stored first.
    #joblib.dump(model, filename)