In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the training data: an id, the comment text, and the label columns used as targets.
# The path is relative to the notebook's working directory.
train = pd.read_csv("../data/train.csv")

In [6]:
# Load the test data to be scored; its `id` column later becomes the submission index.
test = pd.read_csv("../data/test.csv")

In [7]:
# Names of the labels the model predicts. The order here defines the column
# order of the submission frame, and the count sizes the output layer.
target_labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
num_classes = len(target_labels)

In [8]:
# All comment text (train followed by test) used to fit the TF-IDF vocabulary.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the resulting Series is identical (original indexes kept).
corpus = pd.concat([train["comment_text"], test["comment_text"]])

In [9]:
# Learn the TF-IDF vocabulary and IDF weights from the combined train + test text.
vectorizer = TfidfVectorizer()
# Kept as the cell's last expression so the fitted estimator's settings are displayed.
vectorizer.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [10]:
# TF-IDF feature matrix for the training comments, using the vocabulary fitted on `corpus`.
train_tf = vectorizer.transform(train["comment_text"])

In [34]:
from keras.models import Model, Input
from keras.layers import Dense, Dropout

In [13]:
# Number of distinct terms in the fitted vocabulary; used as the network's input width.
dict_size = len(vectorizer.vocabulary_)
# Display the value as the cell output.
dict_size

353967

In [51]:
# Feed-forward multi-label classifier over the TF-IDF features, built with the
# functional API. The input is declared sparse so the vectorizer's sparse
# output can be fed without densifying all `dict_size` columns.
inputs = Input(shape=(dict_size,), sparse=True)
hidden = Dense(64, activation="relu")(inputs)
hidden = Dropout(0.3)(hidden)
hidden = Dense(32, activation="relu")(hidden)
hidden = Dropout(0.4)(hidden)
hidden = Dense(16, activation="relu")(hidden)
# One sigmoid unit per label, so each label gets its own independent score.
outputs = Dense(num_classes, activation="sigmoid")(hidden)

# `model` is bound exactly once, to the assembled Model, instead of being
# reused as the name of every intermediate tensor.
model = Model(inputs=inputs, outputs=outputs)

In [52]:
# Binary cross-entropy matches the sigmoid output layer: every label is scored
# as its own yes/no problem.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
)

In [53]:
# Print per-layer output shapes and parameter counts as a sanity check on the architecture.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 353967)            0         
_________________________________________________________________
dense_56 (Dense)             (None, 64)                22653952  
_________________________________________________________________
dropout_33 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_57 (Dense)             (None, 32)                2080      
_________________________________________________________________
dropout_34 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_58 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_59 (Dense)             (None, 6)                 102       
Total para

In [54]:
from keras.callbacks import EarlyStopping

# Stop training once the monitored quantity has not improved for 2 consecutive epochs.
# NOTE(review): relies on EarlyStopping's default monitor (presumably val_loss) — confirm.
# NOTE(review): the fit call below runs only 3 epochs, so patience=2 likely cannot
# shorten the run in practice — consider more epochs or a smaller patience.
early_stopping = EarlyStopping(patience=2)

In [56]:
# Train on the TF-IDF features, holding out 20% of the rows for validation.
# Targets are selected explicitly through `target_labels` (rather than by
# dropping the non-label columns) so the order of the output units is
# guaranteed to match the column order used when the submission is built,
# regardless of how the CSV happens to order its columns.
model.fit(
    train_tf,
    train[target_labels].values,
    validation_split=0.2,
    epochs=3,
    callbacks=[early_stopping],
    batch_size=512,
)

Train on 127656 samples, validate on 31915 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f1b41f084a8>

In [59]:
# Persist the trained model to an HDF5 file in the current working directory.
model.save("tf_dense.h5")

In [58]:
from keras.models import load_model

In [60]:
# Reload the saved file and rebind `model`, so the predictions below come from
# the on-disk artifact rather than the in-memory training object.
model = load_model("tf_dense.h5")

In [61]:
# Vectorize the test comments with the same fitted vocabulary as the training features.
test_tf = vectorizer.transform(test["comment_text"])

In [62]:
# Score every test comment: one row per comment, one sigmoid score per output unit.
prediction = model.predict(test_tf)

In [65]:
# Spot-check the scores produced for the first test comment.
prediction[0]

array([ 0.99528801,  0.24654736,  0.92363667,  0.03770474,  0.76164937,
        0.1471846 ], dtype=float32)

In [66]:
# Assemble the submission table: one row per test id, one score column per label.
submission = pd.DataFrame(
    prediction,
    index=test["id"],
    columns=target_labels,
)
# Preview the first five rows.
submission.head()

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.995288,0.2465474,0.923637,0.037705,0.761649,0.147185
0000247867823ef7,0.002132,2.182103e-06,0.000153,1.8e-05,0.000158,3.2e-05
00013b17ad220c46,0.017056,0.0002089096,0.003333,0.000883,0.003252,0.00122
00017563c3f7919a,0.001192,4.987517e-07,6e-05,5e-06,6.2e-05,1e-05
00017695ad8997eb,0.00291,6.060063e-06,0.000283,4.7e-05,0.000283,7.3e-05


In [67]:
# Write the submission file. The frame's index (the test ids) is written as the
# first column because to_csv includes the index by default.
submission.to_csv("submission_tf_dense.csv")