In [1]:
#General purpose
import pandas as pd
import numpy as np

#Preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

#Cross-validation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.metrics import make_scorer

#Metrics
from sklearn.metrics import log_loss

#Models
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.multioutput import ClassifierChain
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
import xgboost as xgb




### Read datasets

In [2]:
corpus_train = pd.read_csv("data/train.csv")
corpus_test = pd.read_csv("data/test.csv")

In [3]:
corpus_train.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0


In [4]:
corpus_test.head(2)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...


### Define challenge loss function

In [5]:
def calc_loss(y_true, y_pred):
    return np.mean([log_loss(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])])

## Preprocessing

#### Get insults from external dictionnary

In [6]:
insults = pd.read_csv('data/insults.txt', names=["words"])
insults_dict = {v:k for k,v in insults["words"].to_dict().items()}

#### Columns extraction for easy usage

In [7]:
outputs = corpus_train.columns[2:]
train_text = corpus_train["comment_text"]
test_text = corpus_test["comment_text"]

#### Split train(70%)/validation(30%)

In [8]:
xtrain, xval, ytrain, yval = train_test_split(corpus_train["comment_text"],
                                                    corpus_train[outputs],
                                                    test_size=0.3,
                                                    random_state=42) #for reproduciblity

print("Train size: {}".format(xtrain.shape[0]))
print("Test size: {}".format(xval.shape[0]))

Train size: 111699
Test size: 47872


### Extracting DF representation

In [9]:
count_vec = CountVectorizer(max_features=5000,
                            analyzer='word',
                            min_df=10,
                            strip_accents='unicode',
                            token_pattern=r'\w{1,}',
                            #token_pattern=r'\b[^\d\W]+\b',
                            ngram_range=(1,1)
                           ).fit(train_text.append(test_text))

In [10]:
xtrain_df = count_vec.transform(xtrain)
xval_df = count_vec.transform(xval)

### Extracting TF-IDF representation

In [11]:
tfidf = TfidfVectorizer(max_features=5000,
                        min_df=5, 
                        strip_accents='unicode',
                        analyzer='word',
                        #analyzer='char', #works also with chars!
                        #token_pattern=r'\w{1,}',
                        token_pattern=r'\b[^\d\W]+\b',
                        #ngram_range=(1, 2), #ng_grams don't really improve results here
                        use_idf=1,
                        smooth_idf=1,
                        sublinear_tf=1,
                        stop_words = 'english',
                        lowercase=False
                       ).fit(train_text.append(test_text))

In [12]:
xtrain_tfidf = tfidf.transform(xtrain)
xval_tfidf = tfidf.transform(xval)

### Baseline evaluation

#### Naive bayes - DF

In [13]:
naive = OneVsRestClassifier(MultinomialNB(alpha=0.1)).fit(xtrain_df, ytrain)
y_pred = naive.predict_proba(xval_df)
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

Loss: 0.2400


#### Naive bayes - TFIDF

In [14]:
naive = OneVsRestClassifier(MultinomialNB(alpha=0.1)).fit(xtrain_tfidf, ytrain)
y_pred = naive.predict_proba(xval_tfidf)
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

Loss: 0.0796


#### Logistic regression

##### DF

In [15]:
#No cross-validation for now, see later
lr_ovr = OneVsRestClassifier(LogisticRegression(C=10)).fit(xtrain_df, ytrain)

In [16]:
y_pred = lr_ovr.predict_proba(xval_df)
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

Loss: 0.0834


##### TFIDF

In [17]:
lr_ovr = OneVsRestClassifier(LogisticRegression(C=5)).fit(xtrain_tfidf, ytrain)

In [18]:
y_pred = lr_ovr.predict_proba(xval_tfidf)
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

Loss: 0.0553


##### Classifier chains

In [19]:
lr_chains = ClassifierChain(LogisticRegression(C=5), order='random').fit(xtrain_tfidf, ytrain)

In [20]:
y_pred = lr_chains.predict_proba(xval_tfidf)
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

Loss: 0.0588


#### Ensemble classifier chains

In [21]:
#ECC (Ensemble chain classifier)
def ECC_preds_multiple_mods(x, mods):
    return np.array([voter.predict(x) for voter in mods])

def ECC_predict(predictions):
    nb_voters = predictions.shape[0]
    return (sum([x for x in predictions])/nb_voters >= 0.5).astype(int)

def ECC_fit(xtrain, ytrain, model, nb_models=5):
    np.random.seed()
    models = []
    for i in range(nb_models):
        models.append(ClassifierChain(clone(model), order='random').fit(xtrain, ytrain))
    return models

In [22]:
lr_ecc = ECC_fit(xtrain_tfidf, ytrain, LogisticRegression(C=5))

In [23]:
y_preds_ecc = ECC_predict(ECC_preds_multiple_mods(xval_tfidf, lr_ecc))

In [24]:
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_preds_ecc))

Loss: 0.6686


### XGBoost

#### Fitting a simple xgboost on TFIDF

In [25]:
clf = OneVsRestClassifier(xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=4, learning_rate=0.1
                       )).fit(xtrain_tfidf.tocsc(), ytrain)
y_pred = clf.predict_proba(xval_tfidf.tocsc())
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

Loss: 0.0629


#### Fitting a simple xgboost on DF

In [26]:
clf = OneVsRestClassifier(xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=4, learning_rate=0.1
                       )).fit(xtrain_df.tocsc(), ytrain)

y_pred = clf.predict_proba(xval_df.tocsc())
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

Loss: 0.0563


#### Fitting a simple xgboost on TF-IDF

In [28]:
# Fitting a simple xgboost on tf-idf svd features
clf = OneVsRestClassifier(xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=4, learning_rate=0.1
                       )).fit(xtrain_tfidf, ytrain)

y_pred = clf.predict_proba(xval_tfidf)
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

Loss: 0.0629


## Some cross validation

In [29]:
cv_scorer = make_scorer(calc_loss, greater_is_better=False, needs_proba=True)

In [30]:
#Example with SVD, scaling features + learning logistic model
# Initialize SVD
#svd = TruncatedSVD()
    
# Initialize the standard scaler 
#scl = StandardScaler()

lr_ovr = OneVsRestClassifier(LogisticRegression())

# Create the pipeline, we dont need a full pipeline here, just cross-validate
# the regularization parameter and the norm to have an idea of possible improvements
#clf = Pipeline([('svd', svd), ('scl', scl), ('lr_ovr', lr_ovr)])
clf = Pipeline([('lr_ovr', lr_ovr)])

In [31]:
#List all possible paramaters for the classifiers at hand
sorted(clf.get_params().keys())

['lr_ovr',
 'lr_ovr__estimator',
 'lr_ovr__estimator__C',
 'lr_ovr__estimator__class_weight',
 'lr_ovr__estimator__dual',
 'lr_ovr__estimator__fit_intercept',
 'lr_ovr__estimator__intercept_scaling',
 'lr_ovr__estimator__max_iter',
 'lr_ovr__estimator__multi_class',
 'lr_ovr__estimator__n_jobs',
 'lr_ovr__estimator__penalty',
 'lr_ovr__estimator__random_state',
 'lr_ovr__estimator__solver',
 'lr_ovr__estimator__tol',
 'lr_ovr__estimator__verbose',
 'lr_ovr__estimator__warm_start',
 'lr_ovr__n_jobs',
 'memory',
 'steps']

In [32]:
#Define values to cross validate
param_grid = {'lr_ovr__estimator__C': [0.1, 1, 10], 
              'lr_ovr__estimator__penalty': ['l1', 'l2']}

In [35]:
# Initialize Grid Search Model
#Use refit=True to get a model with the best paramaters on the full training
model = GridSearchCV(estimator=clf,
                     param_grid=param_grid,
                     scoring=cv_scorer,
                     verbose=1,
                     n_jobs=1,
                     iid=False,
                     refit=True,
                     cv=3).fit(xtrain_tfidf, ytrain.as_matrix())

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  4.1min finished


In [36]:
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: -0.057
Best parameters set:
	lr_ovr__estimator__C: 1
	lr_ovr__estimator__penalty: 'l1'


##### Naive bayes cross validation

In [37]:
cv_scorer = make_scorer(calc_loss, greater_is_better=False, needs_proba=True)

In [38]:
nb_model = OneVsRestClassifier(MultinomialNB())

# Create the pipeline 
clf = Pipeline([('nb', nb_model)])

# Parameter grid
param_grid = {'nb__estimator__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

In [39]:
#List all possible paramaters for the classifiers at hand
sorted(clf.get_params().keys())

['memory',
 'nb',
 'nb__estimator',
 'nb__estimator__alpha',
 'nb__estimator__class_prior',
 'nb__estimator__fit_prior',
 'nb__n_jobs',
 'steps']

In [40]:
# Fit Grid Search Model
# Initialize Grid Search Model
model = GridSearchCV(estimator=clf,
                     param_grid=param_grid,
                     scoring=cv_scorer,
                     verbose=1,
                     n_jobs=-1,
                     iid=True,
                     refit=True,
                     cv=2).fit(xtrain_tfidf, ytrain.as_matrix())
# we could use the full data here but im only using xtrain.

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    2.0s finished


In [41]:
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: -0.073
Best parameters set:
	nb__estimator__alpha: 1


# USING WORD EMBEDDINGS

In [43]:
# load the GloVe vectors in a dictionary
from tqdm import tqdm
embeddings_index = {}
#Choose a pre-trained embedding here
f = open('data/glove.840B.300d.txt')
#f = open('data/glove.twitter.27B/glove.twitter.27B.25d.txt') 
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

2196017it [01:32, 23618.04it/s]

Found 2196016 word vectors.





In [44]:
import nltk
nltk.download('punkt')
nltk.download("stopwords")
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
rep_size = embeddings_index["and"].shape[0]

# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower().decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]#filter words that are not in stop_word list
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(rep_size)
    
    return v / np.sqrt((v ** 2).sum())

[nltk_data] Downloading package punkt to /home/sloan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sloan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [45]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_glove = [sent2vec(x) for x in tqdm(xval)]

100%|██████████| 111699/111699 [01:23<00:00, 1336.36it/s]
100%|██████████| 47872/47872 [00:35<00:00, 1363.23it/s]


In [46]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [None]:
# Fitting a simple lr on glove features
clf = OneVsRestClassifier(LogisticRegression(C=5))
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.4f " % calc_loss(np.array(yval), predictions))

logloss: 0.0570 


In [None]:
# Fitting a simple xgboost on glove features
clf = OneVsRestClassifier(xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False))
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.4f " % calc_loss(np.array(yval), predictions))

In [None]:
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

## DEEP LEARNING

### Fitting simple 2 layers model

In [None]:
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils

In [None]:
#useful variables
EMB_SIZE = xtrain_glove.shape[1]
OUTPUT_SIZE = ytrain.shape[1]

In [None]:
# scale the data before any neural net:
scl = StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [None]:
# create a simple 3 layer sequential neural net
model = Sequential()

model.add(Dense(300, input_dim=EMB_SIZE, activation='relu'))
model.add(Dropout(0.7))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())

model.add(Dense(OUTPUT_SIZE))
model.add(Activation('sigmoid'))

# compile the model
model.compile(loss='binary_crossentropy', optimizer='adam')
#model.summary()

In [None]:
earlyStopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')

In [None]:
model.fit(xtrain_glove_scl, y=ytrain,
          batch_size=128,
          callbacks=[earlyStopping],
          validation_split=0.1,
          epochs=100,
          verbose=1, 
          validation_data=(xvalid_glove_scl, yval))

In [None]:
model.predict_proba(xvalid_glove_scl)

### LSTM models

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

In [None]:
OUTPUT_SIZE = ytrain.shape[1]

In [None]:
# using keras tokenizer here: need to tokenize text to apply LSTM
token = Tokenizer(num_words=None)

In [None]:
token.fit_on_texts(list(xtrain) + list(xval))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xval)

In [None]:
# zero pad the sequences -> can improve here
max_len = 140
xtrain_pad = pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = pad_sequences(xvalid_seq, maxlen=max_len)

In [None]:
#prepare test
xtest_seq = token.texts_to_sequences(corpus_test["comment_text"])
xtest_pad = pad_sequences(xtest_seq, maxlen=max_len)

In [None]:
word_index = token.word_index

In [None]:
# create an embedding matrix for the words we have in the dataset
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# A simple LSTM with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1,
                     output_dim=300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))

model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(OUTPUT_SIZE))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')

In [None]:
model.summary()

In [None]:
# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

In [None]:
model.fit(x=xtrain_pad, y=ytrain, batch_size=512, epochs=100, verbose=1, validation_data=(xvalid_pad, yval), callbacks=[earlystop])

## Additionnal features

### Handmade features

In [None]:
#subjective --> not used yet
punctuation = list("?.,!:-;|") + [".."] + ["..."] + ['"']
biased_punct = ["!"] + ["..."]
#smileys = [":-)", ":)", ":(", ":|", ":-("]

In [None]:
def feature_extraction(df):
    #df_out = pd.DataFrame(df["comment_text"].apply(lambda x: sum([s in x for s in smileys])), columns=["n_smileys"])
    df_out = pd.DataFrame(df["comment_text"].apply(lambda x: len(x)), columns=["n_carac"])
    df_out["n_upper"] = df["comment_text"].apply(lambda x: sum(a.isupper() for a in list(x)))
    df_out["n_ratio"] = df["comment_text"].apply(lambda x: float(sum(a.isupper() for a in list(x))) / len(x))
    df_out["n_words"] = df["comment_text"].apply(lambda x: len(x.split(' '))
    df_out["n_smileys"] = df["comment_text"].apply(lambda x: sum([s in x for s in smileys]))
    df_out["n_carac"] = df["comment_text"].apply(lambda x: len(x))
    
    for p in punctuation:
        new_entry = "n_" + str(p)
        df_out[new_entry] = df["comment_text"].str.count("\\"+ p)

    return df_out

In [None]:
#get insults from external dictionnary
insults_google = pd.read_csv('data/insults_google.txt', names=["words"])
insults_others = pd.read_csv('data/insults.txt', names=['words'])
insults_set = set({v:k for k,v in insults_google["words"].to_dict().items()})
#insults_set = set({v:k for k,v in insults_others["words"].to_dict().items()})
insults_set.update(set({v:k for k,v in insults_others["words"].to_dict().items()}))

In [None]:
xtrain_df = count_vec.transform(xtrain)
xval_df = count_vec.transform(xval)

In [None]:
xtrain_inv_count = count_vec.inverse_transform(xtrain_df)
xval_inv_count = count_vec.inverse_transform(xval_df)

In [None]:
insults_train = pd.Series([int(len(set(xtrain_inv_count[i]).intersection(insults_set))>=1) for i in range(len(xtrain))], index=ytrain.index)

In [None]:
insults_val = pd.Series([int(len(set(xval_inv_count[i]).intersection(insults_set))>=1) for i in range(len(xval))], index=yval.index)

In [None]:
from sklearn.metrics import accuracy_score, precision_score

In [None]:
for k in ytrain.keys():
    print k, ":", accuracy_score(insults_train, ytrain[k]), ',', precision_score(insults_train, ytrain[k])

In [None]:
#No cross-validation with logistic regression
lr_ovr = OneVsRestClassifier(LogisticRegression(C=10)).fit(xtrain_tfidf, ytrain)
y_pred = lr_ovr.predict_proba(xval_tfidf)
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

In [None]:
df_pred = pd.DataFrame(y_pred, columns=outputs, index=yval.index)

In [None]:
import scipy as sp

In [None]:
sp.sparse.coo_matrix(insults_train).T

In [None]:
imp_xtrain = sp.sparse.hstack([xtrain_tfidf, sp.sparse.coo_matrix(insults_train).T])
imp_xval = sp.sparse.hstack([xval_tfidf, sp.sparse.coo_matrix(insults_val).T])

In [None]:
#No cross-validation with logistic regression
lr_ovr = OneVsRestClassifier(LogisticRegression(C=10)).fit(imp_xtrain, ytrain)
y_pred = lr_ovr.predict_proba(imp_xval)
print("Loss: %0.4f" %calc_loss(yval.as_matrix(), y_pred))

In [None]:
#preproc for challenge
import scipy as sp
xtrain_tfidf = tfidf.transform(train_text)
xtest_tfidf = tfidf.transform(test_text)
xtrain_df = count_vec.transform(train_text)
xtest_df = count_vec.transform(test_text)

In [None]:
xtrain_inv_count = count_vec.inverse_transform(xtrain_df)
xtest_inv_count = count_vec.inverse_transform(xtest_df)

In [None]:
insults_train = pd.Series([int(len(set(xtrain_inv_count[i]).intersection(insults_set))>=1) 
                           for i in range(len(train_text))], index=corpus_train['id'])
insults_test = pd.Series([int(len(set(xtest_inv_count[i]).intersection(insults_set))>=1) 
                         for i in range(len(test_text))], index=corpus_test['id'])

In [None]:
imp_xtrain = sp.sparse.hstack([xtrain_tfidf, sp.sparse.coo_matrix(insults_train).T])
imp_xtest = sp.sparse.hstack([xtest_tfidf, sp.sparse.coo_matrix(insults_test).T])

In [None]:
lr_ovr = OneVsRestClassifier(LogisticRegression(C=10)).fit(imp_xtrain, corpus_train[outputs])
y_pred = lr_ovr.predict_proba(imp_xtest)

### Create output for competition

#### TFIDF on full train

In [None]:
tfidf_train = tfidf.transform(train_text)

In [None]:
tfidf_test = tfidf.transform(test_text)

In [None]:
lr_ovr = OneVsRestClassifier(LogisticRegression(C=10)).fit(tfidf_train, corpus_train[outputs])

In [None]:
y_pred = lr_ovr.predict_proba(tfidf_test)

#### Using GloVe representation

In [None]:
df_train = count_vec.transform(train_text)
df_test = count_vec.transform(test_text)

In [None]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(corpus_train["comment_text"])]
xtest_glove = [sent2vec(x) for x in tqdm(corpus_test["comment_text"])]

In [None]:
xtrain_glove = np.array(xtrain_glove)
xtest_glove = np.array(xtest_glove)

In [None]:
scl = StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xtest_glove_scl = scl.transform(xtest_glove)

In [None]:
model.fit(xtrain_glove_scl, y=corpus_train[outputs], batch_size=64, callbacks=[earlyStopping], validation_split=0.1,
          epochs=100, verbose=1)

In [None]:
y_pred = model.predict_proba(xtest_pad)

In [None]:
corpus_train.loc[:, outputs].head()

In [None]:
#fit model on full training
mod_full = OneVsRestClassifier(LogisticRegression(C=5)).fit(xtrain_glove,  corpus_train[outputs])

In [None]:
mod_full = OneVsRestClassifier(xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=4, learning_rate=0.1
                       )).fit(df_train,  np.array(corpus_train[outputs]))

In [None]:
y_pred = mod_full.predict_proba(xtest_glove)

In [None]:
def output_compet(corp, predictions, filename):
    df_output = pd.concat([corp["id"], pd.DataFrame(predictions, columns=[outputs])], axis=1)
    df_output.to_csv(filename, index=False)

In [None]:
output_compet(corpus_test, y_pred, "lr_insultsEng.csv")