https://github.com/tensorflow/workshops/blob/master/extras/keras-bag-of-words/keras-bow-model.ipynb

In [170]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import numpy as np
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

import os
import pandas as pd

In [171]:
def process_x(x):
    return x.split(' +++$+++ ')[1]

def process_y(x):
    return x.split(' +++$+++ ')[0]


file_name = 'training_label.txt'

train = os.path.join(os.getcwd(),'dataset', file_name)
df_train = pd.read_csv(train,header=None, delimiter='\n')
df_train.columns = ['raw']
df_train['x'] = df_train['raw'].apply(process_x)
df_train['y'] = df_train['raw'].apply(process_y)

df_train

Unnamed: 0,raw,x,y
0,1 +++$+++ are wtf ... awww thanks !,are wtf ... awww thanks !,1
1,1 +++$+++ leavingg to wait for kaysie to arriv...,leavingg to wait for kaysie to arrive myspacin...,1
2,0 +++$+++ i wish i could go and see duffy when...,i wish i could go and see duffy when she comes...,0
3,1 +++$+++ i know eep ! i can ' t wait for one ...,i know eep ! i can ' t wait for one more day ....,1
4,0 +++$+++ so scared and feeling sick . fuck ! ...,so scared and feeling sick . fuck ! hope someo...,0
5,0 +++$+++ my b day was thurs . i wanted 2 do 5...,my b day was thurs . i wanted 2 do 5 this week...,0
6,1 +++$+++ e3 is in the trending topics only ju...,e3 is in the trending topics only just noticed...,1
7,1 +++$+++ where did you get him from i know so...,where did you get him from i know someone who ...,1
8,0 +++$+++ dam just got buzzed by another huge ...,dam just got buzzed by another huge fly ! this...,0
9,1 +++$+++ tomorrowwwwwwwww !!! you ' ll love t...,tomorrowwwwwwwww !!! you ' ll love tomorrow ' ...,1


In [172]:
x_train = df_train['x'].values
y_train = df_train['y'].values

print(x_train.shape)
print(y_train.shape)

(200000,)
(200000,)


In [173]:
num_words = 1000
tokenize = Tokenizer(num_words=num_words, char_level=False)

In [174]:
# https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/text.py

# from x_train, find top 2000 most frequent words
tokenize.fit_on_texts(x_train) 

x_train = tokenize.texts_to_matrix(x_train)

x_train.shape


(200000, 1000)

In [175]:
from keras import utils

y_train = y_train.astype(int)

# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)


In [176]:
y_train[:5]

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [177]:
model = Sequential()
model.add(Dense(512, input_shape=(num_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(2))
model.add(Activation('softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_22 (Dense)             (None, 512)               512512    
_________________________________________________________________
activation_21 (Activation)   (None, 512)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 2)                 1026      
_________________________________________________________________
activation_22 (Activation)   (None, 2)                 0         
Total params: 513,538
Trainable params: 513,538
Non-trainable params: 0
_________________________________________________________________


In [178]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [181]:
# https://github.com/keras-team/keras/blob/master/keras/engine/training.py

model.fit(x_train, y_train,
          batch_size=32,
          epochs=2,
          verbose=1,
          validation_split=0.25)

Train on 150000 samples, validate on 50000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0xbc38282e8>

In [182]:
def process_x_test(x):
    s = x.find(',') + 1
    return x[s:]
    
path_test = os.path.join(os.getcwd(),'dataset', 'testing_data.txt')
df_test = pd.read_csv(path_test,header=0, delimiter='\n')
df_test.columns = ['raw']
df_test['x'] = df_test['raw'].apply(process_x_test)
df_test

Unnamed: 0,raw,x
0,"0,my dog ate our dinner . no , seriously ... h...","my dog ate our dinner . no , seriously ... he ..."
1,"1,omg last day sooon n of primary noooooo x im...",omg last day sooon n of primary noooooo x im g...
2,"2,stupid boys .. they ' re so .. stupid !",stupid boys .. they ' re so .. stupid !
3,"3,hi ! do u know if the nurburgring is open fo...",hi ! do u know if the nurburgring is open for ...
4,"4,having lunch in the office , and thinking of...","having lunch in the office , and thinking of h..."
5,"5,shopping was fun",shopping was fun
6,"6,wondering where all the nice weather has gone .",wondering where all the nice weather has gone .
7,"7,morning ! yeeessssssss new mimi in aug",morning ! yeeessssssss new mimi in aug
8,"8,umm ... maybe that ' s how the british spell...",umm ... maybe that ' s how the british spell it ?
9,"9,yes it ' s 3 : 50 am . yes i ' m still awake...",yes it ' s 3 : 50 am . yes i ' m still awake ....


In [183]:
docs = df_test['x']
x_test = docs.values

x_test = tokenize.texts_to_matrix(x_test)
x_test.shape


(200000, 1000)

In [185]:
predict = model.predict(x_test[:50], batch_size=32)
predict

prediction = model.predict(x_test[:50])
prediction

array([[9.01173532e-01, 9.88264456e-02],
       [9.74505901e-01, 2.54941322e-02],
       [8.14075828e-01, 1.85924158e-01],
       [2.40420505e-01, 7.59579539e-01],
       [4.46480721e-01, 5.53519309e-01],
       [4.11436446e-02, 9.58856404e-01],
       [8.97427559e-01, 1.02572456e-01],
       [1.78915516e-01, 8.21084499e-01],
       [3.37654382e-01, 6.62345648e-01],
       [1.96569517e-01, 8.03430498e-01],
       [3.22653796e-03, 9.96773422e-01],
       [7.85535574e-02, 9.21446443e-01],
       [5.46460273e-03, 9.94535446e-01],
       [8.92391920e-01, 1.07608058e-01],
       [1.18512725e-02, 9.88148749e-01],
       [8.88507515e-02, 9.11149263e-01],
       [4.48865026e-01, 5.51135004e-01],
       [8.59149873e-01, 1.40850142e-01],
       [1.69796333e-01, 8.30203712e-01],
       [5.75517952e-01, 4.24482048e-01],
       [8.88975561e-01, 1.11024491e-01],
       [7.09210992e-01, 2.90788978e-01],
       [7.22133934e-01, 2.77866125e-01],
       [9.27593768e-01, 7.24062622e-02],
       [3.111798

In [191]:
predicted_label = np.argmax(prediction, axis = 1)
predicted_label

array([0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1])

In [194]:
for i in range(50):
    print('-------')
#     print(i)
    print('predicted_label',predicted_label[i])
    print(docs[i])
    

-------
predicted_label 0
my dog ate our dinner . no , seriously ... he ate it .
-------
predicted_label 0
omg last day sooon n of primary noooooo x im gona be swimming out of school wif the amount of tears am gona cry
-------
predicted_label 0
stupid boys .. they ' re so .. stupid !
-------
predicted_label 1
hi ! do u know if the nurburgring is open for tourists today ? we want to go , but there is an event today
-------
predicted_label 1
having lunch in the office , and thinking of how to resolve this discount form issue
-------
predicted_label 1
shopping was fun
-------
predicted_label 0
wondering where all the nice weather has gone .
-------
predicted_label 1
morning ! yeeessssssss new mimi in aug
-------
predicted_label 1
umm ... maybe that ' s how the british spell it ?
-------
predicted_label 1
yes it ' s 3 : 50 am . yes i ' m still awake . yes i can ' t sleep . yes i ' ll regret it tomorrow . haha i love you mr saturday
-------
predicted_label 1
cute heart shaped portal cube . 

In [218]:
a = np.array(['not very good','not good', 'hate', 'shit', 'bitch'])
a = tokenize.texts_to_matrix(a)
a.shape
model.predict(a)


array([[0.8353665 , 0.16463357],
       [0.817721  , 0.18227904],
       [0.9027109 , 0.09728914],
       [0.73662263, 0.26337734],
       [0.37564966, 0.6243503 ]], dtype=float32)