In [1]:
import numpy as np
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pickle
from paths import *
from keras.layers.convolutional import Convolution1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import LSTM
from keras.layers import  GRU
from create_embedding import *
from helper import *

Using TensorFlow backend.


In [None]:
# Shared hyper-parameters: both are passed to prepare_DL_input and to every Embedding layer below.
MAX_LENGTH = 30 # maximum number of words in each tweet; used as the Embedding input_length
dimension = 200 # dimension of word embeddings; used as the Embedding output size

In [None]:
# Load the full cleaned train and test data (repetitions removed).
# Context managers make sure both file handles are closed once the data is read.
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load pickles that were produced by this project's own cleaning step.
with open(CLEANED_DATA_PATH + 'full_X_Cleaned.pkl', "rb") as train_file:
    X = pickle.load(train_file)
with open(CLEANED_DATA_PATH + 'full_X_test_Cleaned.pkl', "rb") as test_file:
    X_test = pickle.load(test_file)

In [None]:
# Turn the cleaned tweets into model inputs with a single helper call
# (prepare_DL_input comes from one of the project's star imports).
# The helper returns five values, unpacked below. Presumably: encoded train
# sequences, encoded test sequences, the pretrained embedding matrix, the
# training labels and the vocabulary size -- TODO confirm against the helper.
(
    sequence_X,
    sequence_test,
    embedding_matrix,
    y,
    vocab_size,
) = prepare_DL_input(X, X_test, dimension, MAX_LENGTH)

In [None]:
# Hold out 20% of the labelled training sequences so the candidate models
# below can be compared before submission.
# random_state pins the shuffle, so every re-run of the notebook scores the
# models on the same validation rows and the accuracies stay comparable.
# NOTE: this rebinds X_test. From here on X_test is the held-out slice of
# sequence_X, not the raw test tweets loaded from the pickle earlier
# (presumably those are already encoded in sequence_test -- confirm).
X_train, X_test, y_train, y_test = train_test_split(
    sequence_X, y, test_size=0.2, random_state=42
)

In [9]:
# Model 1: dense classifier on top of an embedding learned from scratch
# (no pretrained weights are handed to the Embedding layer).
model = Sequential([
    Embedding(vocab_size, dimension, input_length=MAX_LENGTH),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the training part of the split.
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)

# Score on the held-out part of the split; the value is printed as a percentage.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy * 100))  # author's recorded result: 0.79

# Persist the trained model under DL_MODELS_PATH.
name = 'model1'
model.save(DL_MODELS_PATH + name + '.h5')

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 16)            286608    
_________________________________________________________________
flatten_1 (Flatten)          (None, 480)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 481       
Total params: 287,089
Trainable params: 287,089
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Model 2: same dense classifier as model 1, but the Embedding layer is
# initialised from the pretrained embedding_matrix.
model = Sequential([
    Embedding(vocab_size, dimension, weights=[embedding_matrix], input_length=MAX_LENGTH),
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the training part of the split.
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)

# Score on the held-out part of the split; the value is printed as a percentage.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy * 100))  # author's recorded result: 0.79

# Persist the trained model under DL_MODELS_PATH.
name = 'model2'
model.save(DL_MODELS_PATH + name + '.h5')

In [None]:
# Model 3: Conv1D -> max-pooling -> LSTM on top of the pretrained embedding.
model = Sequential()
model.add(Embedding(vocab_size, dimension, input_length=MAX_LENGTH, weights=[embedding_matrix]))
# Keras 2 argument names (filters / kernel_size / padding) replace the
# deprecated Keras 1 spellings nb_filter / filter_length / border_mode.
model.add(Convolution1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model on the training part of the split
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)
# check accuracy on the held-out part of the split (printed as a percentage)
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
# author's recorded result: 0.49
# NOTE(review): far below the other models' recorded scores -- worth re-checking.
print('Accuracy: %f' % (accuracy*100))

# persist the trained model under DL_MODELS_PATH
name = 'model3'
model.save(DL_MODELS_PATH + name +'.h5')

In [None]:
# Model 4: Conv1D -> max-pooling -> dense head on top of the pretrained embedding.
model = Sequential()
model.add(Embedding(vocab_size, dimension, input_length=MAX_LENGTH, weights=[embedding_matrix]))
# Keras 2 argument names (filters / kernel_size / padding) replace the
# deprecated Keras 1 spellings nb_filter / filter_length / border_mode.
model.add(Convolution1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model on the training part of the split
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)
# check accuracy on the held-out part of the split (printed as a percentage)
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))  # author's recorded result: 0.65

# persist the trained model under DL_MODELS_PATH
name = 'model4'
model.save(DL_MODELS_PATH + name +'.h5')

In [None]:
# Model 5: three stacked GRU layers (16 -> 8 -> 4 units) on top of the
# pretrained embedding. The first two GRUs return full sequences so the
# next recurrent layer receives one vector per time step.
model = Sequential([
    Embedding(vocab_size, dimension, input_length=MAX_LENGTH, weights=[embedding_matrix]),
    GRU(units=16, name="gru_1", return_sequences=True),
    GRU(units=8, name="gru_2", return_sequences=True),
    GRU(units=4, name="gru_3"),
    Dense(1, activation='sigmoid', name="dense_1"),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the training part of the split.
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)

# Score on the held-out part of the split; the value is printed as a percentage.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy * 100))  # author's recorded result: 0.837 on AIcrowd

# Persist the trained model under DL_MODELS_PATH.
name = 'model5'
model.save(DL_MODELS_PATH + name + '.h5')

In [None]:
# Model 6: a single 100-unit GRU layer on top of the pretrained embedding.
model = Sequential([
    Embedding(vocab_size, dimension, input_length=MAX_LENGTH, weights=[embedding_matrix]),
    GRU(100),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train on the training part of the split.
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)

# Score on the held-out part of the split; the value is printed as a percentage.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy * 100))  # author's recorded result: 0.835 on AIcrowd

# Persist the trained model under DL_MODELS_PATH.
name = 'model6'
model.save(DL_MODELS_PATH + name + '.h5')

In [None]:
# Model 7: three stacked GRU layers + pretrained embedding + input dropout.
model = Sequential()
model.add(Embedding(vocab_size, dimension, input_length=MAX_LENGTH, weights=[embedding_matrix]))
# `dropout` is the Keras 2 name for the deprecated Keras 1 `dropout_W`
# (dropout applied to the layer inputs).
model.add(GRU(units=16, name="gru_1", return_sequences=True, dropout=0.2))
model.add(GRU(units=8, name="gru_2", return_sequences=True, dropout=0.2))
model.add(GRU(units=4, name="gru_3", dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model on the training part of the split
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)
# check accuracy on the held-out part of the split (printed as a percentage)
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))  # author's recorded result: 0.847 on AIcrowd

# persist the trained model under DL_MODELS_PATH
name = 'model7'
model.save(DL_MODELS_PATH + name +'.h5')

In [None]:
# Model 8: three stacked GRU layers + embedding learned from scratch
# (no pretrained weights) + input dropout.
model = Sequential()
model.add(Embedding(vocab_size, dimension, input_length=MAX_LENGTH))
# `dropout` is the Keras 2 name for the deprecated Keras 1 `dropout_W`
# (dropout applied to the layer inputs).
model.add(GRU(units=16, name="gru_1", return_sequences=True, dropout=0.2))
model.add(GRU(units=8, name="gru_2", return_sequences=True, dropout=0.2))
model.add(GRU(units=4, name="gru_3", dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model on the training part of the split
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)
# check accuracy on the held-out part of the split (printed as a percentage)
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))  # author's recorded result: 0.843 on AIcrowd

# persist the trained model under DL_MODELS_PATH
name = 'model8'
model.save(DL_MODELS_PATH + name +'.h5')

In [None]:
# Model 9: single 100-unit LSTM + pretrained embedding + dropout.
model = Sequential()
model.add(Embedding(vocab_size, dimension, input_length=MAX_LENGTH, weights=[embedding_matrix]))
# Keras 2 names: `dropout` (inputs) and `recurrent_dropout` (recurrent state)
# replace the deprecated Keras 1 `dropout_W` and `dropout_U`.
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model on the training part of the split
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)
# check accuracy on the held-out part of the split (printed as a percentage)
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))  # author's recorded result: 0.837 on AIcrowd

# persist the trained model under DL_MODELS_PATH
name = 'model9'
model.save(DL_MODELS_PATH + name +'.h5')

In [None]:
# Model 10: Conv1D -> max-pooling -> GRU on top of the pretrained embedding.
model = Sequential()
model.add(Embedding(vocab_size, dimension, input_length=MAX_LENGTH, weights=[embedding_matrix]))
# Use Keras 2 argument names throughout: kernel_size / padding / pool_size
# replace the deprecated filter_length / border_mode / pool_length, matching
# the pool_size spelling already used by the other CNN cells.
model.add(Convolution1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(GRU(100))
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model on the training part of the split
model.fit(X_train, y_train, epochs=2, batch_size=128, verbose=1, shuffle=True)
# check accuracy on the held-out part of the split (printed as a percentage)
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))  # author's recorded result: 0.836

# persist the trained model under DL_MODELS_PATH
name = 'model10'
model.save(DL_MODELS_PATH + name +'.h5')