In [1]:
from keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, \
Dropout, Activation, Input, Flatten, Concatenate
from keras.models import Model
from keras.regularizers import l2

def cnn_model(FILTER_SIZES,
              MAX_NB_WORDS,
              MAX_DOC_LEN,
              EMBEDDING_DIM=200,
              NUM_FILTERS=64,
              DROP_OUT=0.5,
              NUM_OUTPUT_UNITS=1,
              NUM_DENSE_UNITS=100,
              PRETRAINED_WORD_VECTOR=None,
              LAM=0.0):
    """Build and compile a multi-filter-size 1D CNN over word embeddings.

    Architecture: Embedding -> one (Conv1D -> MaxPooling1D -> Flatten) branch
    per filter size -> Concatenate -> Dropout -> Dense(relu) -> Dense(sigmoid).
    The model is compiled with binary cross-entropy, Adam and accuracy.

    Parameters
    ----------
    FILTER_SIZES : sequence of int
        Kernel sizes; one convolution branch is created per entry.
    MAX_NB_WORDS : int
        Vocabulary cap. The embedding has ``MAX_NB_WORDS + 1`` rows unless a
        pretrained matrix is supplied (see below).
    MAX_DOC_LEN : int
        Length of every (padded) input sequence.
    EMBEDDING_DIM : int
        Word-vector dimension; ignored when a pretrained matrix is supplied.
    NUM_FILTERS : int
        Number of filters in each convolution branch.
    DROP_OUT : float
        Dropout rate applied to the concatenated pooled features.
    NUM_OUTPUT_UNITS : int
        Number of sigmoid output units.
    NUM_DENSE_UNITS : int
        Width of the hidden dense layer.
    PRETRAINED_WORD_VECTOR : 2-D array-like or None
        Optional embedding matrix. When given, the embedding layer takes its
        shape from this matrix and is frozen (``trainable=False``).
    LAM : float
        L2 coefficient applied to the hidden dense layer's kernel only.

    Returns
    -------
    keras.models.Model
        The compiled model.

    Raises
    ------
    ValueError
        If ``FILTER_SIZES`` is empty or the pretrained matrix is not 2-D.
    """
    filter_sizes = list(FILTER_SIZES)
    if not filter_sizes:
        raise ValueError("FILTER_SIZES must contain at least one filter size")

    main_input = Input(shape=(MAX_DOC_LEN,), dtype='int32', name='main_input')

    embedding_kwargs = dict(input_dim=MAX_NB_WORDS + 1,
                            output_dim=EMBEDDING_DIM,
                            input_length=MAX_DOC_LEN,
                            name='embedding')
    if PRETRAINED_WORD_VECTOR is not None:
        # Local import: the import list of this cell does not include numpy.
        import numpy as np

        pretrained = np.asarray(PRETRAINED_WORD_VECTOR)
        if pretrained.ndim != 2:
            raise ValueError(
                "PRETRAINED_WORD_VECTOR must be a 2-D (vocabulary, dimension) "
                "matrix, got shape %s" % (pretrained.shape,))
        # Size the layer from the matrix itself so that the weights always
        # fit, even when the matrix was built for a vocabulary smaller than
        # MAX_NB_WORDS or with a dimension other than EMBEDDING_DIM.
        embedding_kwargs.update(input_dim=pretrained.shape[0],
                                output_dim=pretrained.shape[1],
                                weights=[pretrained],
                                trainable=False)
    embed_1 = Embedding(**embedding_kwargs)(main_input)

    # One convolution-pooling-flatten branch per filter size.
    conv_blocks = []
    for f in filter_sizes:
        conv = Conv1D(filters=NUM_FILTERS, kernel_size=f,
                      activation='relu', name='conv_'+str(f))(embed_1)
        # Pool size MAX_DOC_LEN-f+1 is the convolution's output length under
        # Conv1D's default 'valid' padding, so each filter is reduced to a
        # single value.
        conv = MaxPooling1D(MAX_DOC_LEN-f+1, name='max_'+str(f))(conv)
        conv = Flatten(name='flat_'+str(f))(conv)
        conv_blocks.append(conv)

    # Concatenate needs at least two inputs; a single branch is used as is.
    if len(conv_blocks) > 1:
        z = Concatenate(name='concate')(conv_blocks)
    else:
        z = conv_blocks[0]

    drop = Dropout(rate=DROP_OUT, name='dropout')(z)

    dense = Dense(NUM_DENSE_UNITS, activation='relu',
                  kernel_regularizer=l2(LAM), name='dense')(drop)
    preds = Dense(NUM_OUTPUT_UNITS, activation='sigmoid', name='output')(dense)
    model = Model(inputs=main_input, outputs=preds)

    model.compile(loss="binary_crossentropy",
                  optimizer="adam", metrics=["accuracy"])

    return model


Using TensorFlow backend.


In [2]:
import pandas as pd
import nltk,string


# Both CSVs are read with header=None, so the file's first line arrives as
# data row 0. That row is excluded below (presumably it is the header line --
# TODO confirm against the files).
# Column 0 is used as the review text and column 1 as the sentiment label.
data = pd.read_csv('train.csv', encoding='Latin1', header=None)
# Select with .iloc instead of `del series[0]`: deleting in place mutates the
# Series object handed out by the DataFrame and leaves it inconsistent with
# the frame it came from.
trainDataReview = data[0].iloc[1:]
trainDataSentiment = data[1].iloc[1:]

data1 = pd.read_csv('test.csv', encoding='Latin1', header=None)
testdata1_Review = data1[0].iloc[1:]
testdata1_Sentiment = data1[1].iloc[1:]


In [4]:

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np

# set the maximum number of words to be used
MAX_NB_WORDS=10000

# set sentence/document length
MAX_DOC_LEN=1000

# get a Keras tokenizer
# https://keras.io/preprocessing/text/
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(trainDataReview)

# convert each document to a list of word indices
sequences = tokenizer.texts_to_sequences(trainDataReview)

# bring all sequences to the same length:
# a document shorter than maxlen is zero-padded at the end,
# a document longer than maxlen is truncated at the end
padded_seq = pad_sequences(sequences,
                           maxlen=MAX_DOC_LEN,
                           padding='post',
                           truncating='post')

# the test documents are indexed with the tokenizer fitted on the training set
test_seq = tokenizer.texts_to_sequences(testdata1_Review)
padded_test_seq = pad_sequences(test_seq,
                                maxlen=MAX_DOC_LEN,
                                padding='post',
                                truncating='post')

EMBEDDING_DIM=300
FILTER_SIZES=[2,3,4]

# a single sigmoid unit for the binary sentiment label
output_units_num=1
num_filters=64

# set the dense units: one per pooled feature
dense_units_num= num_filters*len(FILTER_SIZES)

BTACH_SIZE = 32
NUM_EPOCHES = 100

BEST_MODEL_FILEPATH='best_model'

# create the model; the embedding is learned from scratch here
model=cnn_model(FILTER_SIZES, MAX_NB_WORDS,
                MAX_DOC_LEN,
                EMBEDDING_DIM=EMBEDDING_DIM,
                NUM_OUTPUT_UNITS=output_units_num,
                NUM_FILTERS=num_filters,
                NUM_DENSE_UNITS=dense_units_num)

earlyStopping=EarlyStopping(monitor='val_loss', patience=1, verbose=2, mode='min')
checkpoint = ModelCheckpoint(BEST_MODEL_FILEPATH, monitor='val_acc',
                             verbose=2, save_best_only=True, mode='max')

# Hand Keras numeric label arrays rather than the raw CSV column; the
# evaluation cell also has to run pd.to_numeric on these same columns.
train_labels = pd.to_numeric(trainDataSentiment).values
test_labels = pd.to_numeric(testdata1_Sentiment).values

# NOTE(review): the test set doubles as the validation set, so early stopping
# and the checkpoint are selected on the very data the model is scored on later.
training=model.fit(padded_seq, train_labels,
          batch_size=BTACH_SIZE, epochs=NUM_EPOCHES,
          callbacks=[earlyStopping, checkpoint],
          validation_data=(padded_test_seq, test_labels), verbose=2)



Train on 1000 samples, validate on 200 samples
Epoch 1/100
Epoch 00000: val_acc improved from -inf to 0.72000, saving model to best_model
38s - loss: 0.6912 - acc: 0.5120 - val_loss: 0.6757 - val_acc: 0.7200
Epoch 2/100
Epoch 00001: val_acc did not improve
38s - loss: 0.6269 - acc: 0.6850 - val_loss: 0.6196 - val_acc: 0.6850
Epoch 3/100
Epoch 00002: val_acc improved from 0.72000 to 0.76500, saving model to best_model
37s - loss: 0.4680 - acc: 0.8190 - val_loss: 0.5035 - val_acc: 0.7650
Epoch 4/100
Epoch 00003: val_acc improved from 0.76500 to 0.77000, saving model to best_model
37s - loss: 0.2650 - acc: 0.9120 - val_loss: 0.4629 - val_acc: 0.7700
Epoch 5/100
Epoch 00004: val_acc improved from 0.77000 to 0.81500, saving model to best_model
38s - loss: 0.1271 - acc: 0.9630 - val_loss: 0.4251 - val_acc: 0.8150
Epoch 6/100
Epoch 00005: val_acc did not improve
38s - loss: 0.0650 - acc: 0.9850 - val_loss: 0.3905 - val_acc: 0.8050
Epoch 7/100
Epoch 00006: val_acc did not improve
38s - loss: 0

In [5]:
from sklearn.metrics import classification_report

# per-epoch metrics recorded by the preceding fit() call
print(training.history)

# restore the checkpoint written by the ModelCheckpoint callback; use the
# shared constant so the path cannot drift from the one used during training
model.load_weights(BEST_MODEL_FILEPATH)

# predict: threshold the sigmoid output at 0.5 to obtain 0/1 labels
pred=model.predict(padded_test_seq)
pred=np.where(pred>0.5,1,0)
print(pred[0:5])

# numeric views of the label columns
setimentTestNum=pd.to_numeric(testdata1_Sentiment)
setimentTrainNum=pd.to_numeric(trainDataSentiment)

# evaluate the model against the numeric labels
scores = model.evaluate(padded_test_seq, setimentTestNum.values, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

# pred has one column per output unit; flatten it to the 1-D label vector
# that classification_report expects
print(classification_report(setimentTestNum, pred.ravel()))

{'val_loss': [0.67569261550903326, 0.61960777759551999, 0.50349781990051268, 0.4629493045806885, 0.4251461398601532, 0.39050834774971011, 0.47210723161697388, 0.41147908449172976], 'val_acc': [0.71999999999999997, 0.68500000000000005, 0.76500000000000001, 0.77000000000000002, 0.81499999999999995, 0.80500000000000005, 0.78500000000000003, 0.81999999999999995], 'loss': [0.6911957511901855, 0.62689066886901856, 0.4679721283912659, 0.26501093983650209, 0.12712891827523709, 0.065046991229057313, 0.046971832394599913, 0.028428204238414766], 'acc': [0.51200000000000001, 0.68500000000000005, 0.81899999999999995, 0.91200000000000003, 0.96299999999999997, 0.98499999999999999, 0.98699999999999999, 0.99299999999999999]}
[[0]
 [1]
 [0]
 [1]
 [1]]
acc: 82.00%
             precision    recall  f1-score   support

          0       0.81      0.84      0.82        99
          1       0.84      0.80      0.82       101

avg / total       0.82      0.82      0.82       200



In [6]:
import gensim

# Load the pretrained word vectors from the binary word2vec-format file.
wv_model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)




In [7]:
MAX_NB_WORDS=12000
EMBEDDING_DIM=300

# tokenizer.word_index provides the mapping between a word and its index
# for all words seen by fit_on_texts (it is not limited by num_words)
NUM_WORDS = min(MAX_NB_WORDS, len(tokenizer.word_index))

# The matrix has MAX_NB_WORDS+1 rows so that it always matches the
# Embedding(input_dim=MAX_NB_WORDS+1) layer built by cnn_model, even when the
# vocabulary is smaller than the cap. Row 0 stays all-zero for the padding
# symbol; words without a pretrained vector also keep an all-zero row.
embedding_matrix = np.zeros((MAX_NB_WORDS+1, EMBEDDING_DIM))

ignored_words=[]
for word, i in tokenizer.word_index.items():
    # Tokenizer(num_words=N) only emits indices below N, so rows at or above
    # the cap would never be looked up
    if i >= MAX_NB_WORDS:
        continue
    # query the KeyedVectors object directly: its `.wv` attribute is a
    # deprecated alias of the object itself and is absent in gensim 4
    if word in wv_model:
        embedding_matrix[i]=wv_model[word]
    else:
        ignored_words.append(word)
        

In [8]:
# vocabulary cap and fixed document length for the second experiment
MAX_NB_WORDS=12000
MAX_DOC_LEN=1000

# fit a fresh Keras tokenizer on the training reviews
# https://keras.io/preprocessing/text/
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(trainDataReview)

# map every document (train and test) to its list of word indices
sequences = tokenizer.texts_to_sequences(trainDataReview)
test_sequences = tokenizer.texts_to_sequences(testdata1_Review)

# bring both sets to length MAX_DOC_LEN:
# shorter documents are zero-padded at the end,
# longer documents are truncated at the end
padded_sequences, padded_test_sequences = (
    pad_sequences(seqs, maxlen=MAX_DOC_LEN, padding='post', truncating='post')
    for seqs in (sequences, test_sequences)
)




In [9]:

EMBEDDING_DIM=300
FILTER_SIZES=[2,3,4]

# a single sigmoid unit for the binary sentiment label
output_units_num=1
num_filters=64

# set the dense units: one per pooled feature
dense_units_num= num_filters*len(FILTER_SIZES)

BTACH_SIZE = 32
NUM_EPOCHES = 100

# create the model with the pretrained embedding matrix
# (cnn_model freezes the embedding layer when a matrix is supplied)
model=cnn_model(FILTER_SIZES, MAX_NB_WORDS,
                MAX_DOC_LEN,
                EMBEDDING_DIM=EMBEDDING_DIM,
                NUM_OUTPUT_UNITS=output_units_num,
                NUM_FILTERS=num_filters,
                NUM_DENSE_UNITS=dense_units_num,
                PRETRAINED_WORD_VECTOR=embedding_matrix)

earlyStopping=EarlyStopping(monitor='val_acc', patience=3, verbose=2, mode='max')
# BEST_MODEL_FILEPATH comes from an earlier cell; reusing it overwrites the
# checkpoint saved for the previous model
checkpoint = ModelCheckpoint(BEST_MODEL_FILEPATH, monitor='val_acc',
                             verbose=2, save_best_only=True, mode='max')

# Hand Keras numeric label arrays rather than the raw CSV column; the
# evaluation cells also have to run pd.to_numeric on these same columns.
train_labels = pd.to_numeric(trainDataSentiment).values
test_labels = pd.to_numeric(testdata1_Sentiment).values

# NOTE(review): the test set doubles as the validation set, so early stopping
# and the checkpoint are selected on the very data the model is scored on later.
training=model.fit(padded_sequences, train_labels,
          batch_size=BTACH_SIZE, epochs=NUM_EPOCHES,
          callbacks=[earlyStopping, checkpoint],
          validation_data=(padded_test_sequences, test_labels), verbose=2)

Train on 1000 samples, validate on 200 samples
Epoch 1/100
Epoch 00000: val_acc improved from -inf to 0.53000, saving model to best_model
27s - loss: 0.7347 - acc: 0.4990 - val_loss: 0.6805 - val_acc: 0.5300
Epoch 2/100
Epoch 00001: val_acc improved from 0.53000 to 0.70500, saving model to best_model
27s - loss: 0.6453 - acc: 0.6320 - val_loss: 0.6311 - val_acc: 0.7050
Epoch 3/100
Epoch 00002: val_acc improved from 0.70500 to 0.77000, saving model to best_model
26s - loss: 0.5901 - acc: 0.6870 - val_loss: 0.5566 - val_acc: 0.7700
Epoch 4/100
Epoch 00003: val_acc improved from 0.77000 to 0.79000, saving model to best_model
26s - loss: 0.4721 - acc: 0.7950 - val_loss: 0.4700 - val_acc: 0.7900
Epoch 5/100
Epoch 00004: val_acc did not improve
26s - loss: 0.3927 - acc: 0.8230 - val_loss: 0.4425 - val_acc: 0.7900
Epoch 6/100
Epoch 00005: val_acc did not improve
27s - loss: 0.3016 - acc: 0.8680 - val_loss: 0.4473 - val_acc: 0.7900
Epoch 7/100
Epoch 00006: val_acc improved from 0.79000 to 0.80

In [10]:
# per-epoch metrics recorded by the preceding fit() call
print(training.history)

# restore the checkpoint written by the ModelCheckpoint callback; use the
# shared constant so the path cannot drift from the one used during training
model.load_weights(BEST_MODEL_FILEPATH)

# This model was trained on padded_sequences, i.e. on indices produced by the
# 12000-word tokenizer, so it must be scored on padded_test_sequences.
# padded_test_seq was built by the earlier 10000-word tokenizer, which drops
# a different set of words.
pred=model.predict(padded_test_sequences)
pred=np.where(pred>0.5,1,0)
print(pred[0:5])

# numeric views of the label columns
setimentTestNum=pd.to_numeric(testdata1_Sentiment)
setimentTrainNum=pd.to_numeric(trainDataSentiment)

# evaluate the model against the numeric labels
scores = model.evaluate(padded_test_sequences, setimentTestNum.values, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

# pred has one column per output unit; flatten it to the 1-D label vector
# that classification_report expects
print(classification_report(setimentTestNum, pred.ravel()))

{'val_loss': [0.68049312829971309, 0.63106023550033574, 0.55663531303405767, 0.47004890680313111, 0.44251871466636655, 0.44729318380355837, 0.39193913459777829, 0.36304752171039584, 0.38067011952400209, 0.40003479480743409, 0.40885510921478274, 0.38361028075218201, 0.42110904097557067, 0.39088418245315554, 0.45965344905853273, 0.43311402678489686], 'val_acc': [0.53000000000000003, 0.70499999999999996, 0.77000000000000002, 0.79000000000000004, 0.79000000000000004, 0.79000000000000004, 0.80000000000000004, 0.81999999999999995, 0.82499999999999996, 0.82499999999999996, 0.80500000000000005, 0.82999999999999996, 0.81499999999999995, 0.82999999999999996, 0.81000000000000005, 0.81999999999999995], 'loss': [0.73471386241912839, 0.64533096146583557, 0.59013731098175048, 0.47205068016052248, 0.3926629219055176, 0.30155634641647339, 0.23514338469505311, 0.15578502854704857, 0.11201140630245209, 0.079159229844808582, 0.063684763222932816, 0.041681142210960385, 0.035580520361661913, 0.0389182134419