In [1]:
%matplotlib inline

import numpy as np
import xgboost as xgb
import gc

import seaborn as sns
from sklearn.metrics import accuracy_score
from pprint import pprint



In [2]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

#=================Keras==============
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Conv2D, Embedding, Dropout, Activation
from keras.layers import Bidirectional, MaxPooling1D, MaxPooling2D, Reshape, Flatten, concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers, backend
#=================nltk===============
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

Using TensorFlow backend.
  return f(*args, **kwds)


In [3]:
path = './'
comp = ''
EMBEDDING_FILE=f'{path}glove6b/glove.6B.50d.txt'
TRAIN_DATA_FILE=f'{path}{comp}train.csv'
TEST_DATA_FILE=f'{path}{comp}test.csv'

embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use
number_filters = 20 # the number of CNN filters

train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

list_sentences_train = train["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("_na_").values

special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    #Remove Special Characters
    text=special_character_removal.sub('',text)
    
    #Replace Numbers
    text=replace_numbers.sub('n',text)
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))
    
test_comments=[]
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))

In [4]:
comments[:2]

['nonsens kiss geek said true ill account termin',
 'pleas vandal page edit w merwin continu block edit']

In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [6]:
vect_word = TfidfVectorizer(max_features=50000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)

vect_char = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='char',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)

In [7]:
#vect_word.fit(list(train['comment_text']) + list(test['comment_text']))
tr_vect = vect_word.fit_transform(comments)
ts_vect = vect_word.transform(test_comments)

#vect_char.fit(list(train['comment_text']) + list(test['comment_text']))
tr_vect_char = vect_char.fit_transform(comments)
ts_vect_char = vect_char.transform(test_comments)
gc.collect()

0

In [8]:
from scipy import sparse
X = sparse.hstack([tr_vect, tr_vect_char])
x_test = sparse.hstack([ts_vect, ts_vect_char])

In [15]:
prd = np.zeros((ts_vect.shape[0], y.shape[1]))

In [17]:
y = train[list_classes]

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
cv_score =[]
for i,col in enumerate(list_classes):
    lr = LogisticRegression(C=4,random_state = i)
    print('Building {} model for column:{''}'.format(i,col)) 
    lr.fit(tr_vect, y[col])
    #cv_score.append(lr.score)
    prd[:,i] = lr.predict_proba(ts_vect)[:,1]

Building 0 model for column:toxic
Building 1 model for column:severe_toxic
Building 2 model for column:obscene
Building 3 model for column:threat
Building 4 model for column:insult
Building 5 model for column:identity_hate


In [24]:
prd_1 = pd.DataFrame(prd,columns=y.columns)
submit = pd.concat([test['id'],prd_1],axis=1)
#submit.to_csv('toxic_lr.csv.gz',compression='gzip',index=False)
submit.to_csv('toxic_lr.csv',index=False)
submit.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.027175,0.004974,0.012542,0.001574,0.010953,0.002813
1,6102620,0.009827,0.000571,0.001861,0.000684,0.003928,0.001042
2,14563293,0.001281,0.000406,0.000919,0.000343,0.001921,0.000268
3,21086297,0.139923,0.004637,0.01223,0.001366,0.009125,0.002718
4,22982444,0.020404,0.004681,0.010055,0.00165,0.011431,0.003273


In [9]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=400):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.12
    param['max_depth'] = 5
    param['silent'] = 1
    param['eval_metric'] = 'logloss'
    param['min_child_weight'] = 1
    param['subsample'] = 0.5
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return model
    

col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = runXGB(X, train[j], x_test)
    preds[:,i] = model.predict(xgb.DMatrix(x_test))
    gc.collect()

fit toxic


TypeError: can not initialize DMatrix from coo_matrix

In [None]:
subm = pd.read_csv('sample_submission.csv')    
submid = pd.DataFrame({'id': subm["id"]})
submission = pd.concat([submid, pd.DataFrame(preds, columns = col)], axis=1)
submission.to_csv('xgb.csv', index=False)