In [1]:
import numpy as np
from keras.models import Model, Sequential
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from gensim.models import KeyedVectors
def load_w2v(filepath,binary):
    return KeyedVectors.load_word2vec_format(filepath, binary=binary)

In [4]:
!cp drive/MyDrive/wiki.ar.vec /content

In [5]:
# word2vec (via Aziz Alto), : https://drive.google.com/open?id=0ByiDbCx0i9pEQV9ZUEFIb0hwMmM
# 185 MB; vocab=159175; takes ~5-10 secs to load
#w2v = load_w2v("/home/zmccoy/sandbox/sentiment-analysis-arabic/arabic-news.bin", binary=True)
# --
# fastText (via Facebook Research), Wikipedia: https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.ar.vec
# 1.5 GB; vocab=610977; takes ~3 mins to load
w2v = load_w2v("/content/wiki.ar.vec", binary=False) #/content/wiki.ar.vec
print(len(w2v.vocab))

610977


In [6]:
MAX_SEQUENCE_LENGTH = 85 #176 # based on our inputs; TODO: remove outliers? dynamically calculate!
MAX_NB_WORDS = len(w2v.vocab)
EMBEDDING_DIM = 300 # w2v, fastText; GloVe=50

In [7]:

sample = w2v["حسن"]
print(sample.shape)
#print(sample)
print(w2v.most_similar("حسن"))

(300,)
[('،حسن', 0.6795670390129089), ('أبوحسن', 0.6792263388633728), ('#حسن', 0.6771581172943115), ('وحسن', 0.6517007946968079), ('لحسن', 0.6505534648895264), ('\u200fحسن', 0.6446623206138611), ('كحسن', 0.6202388405799866), ('عبدالحسن', 0.6161733865737915), ('بوحسن', 0.6157495975494385), ('د/حسن', 0.6062546968460083)]


In [8]:
import os
import glob
import codecs
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def get_stop_words():
    path = "/content/list.txt" #"/home/zmccoy/sandbox/sentiment-analysis-arabic/stop-words-list.txt" #/content/list.txt
    stop_words = []
    with codecs.open(path, "r", encoding="utf-8", errors="ignore") as myfile:
        stop_words = myfile.readlines()
    stop_words = [word.strip() for word in stop_words]
    return stop_words

def get_label(path):
    filename = path[path.rfind('/'):]
    if filename.startswith("/pos"):
        return np.array([1, 0])
    return np.array([0, 1])

def get_text(path):
    text = ""
    text_words = []
    # use codecs.open to ingnore non utf-8 encoded chars; instead of "with open(path, 'r') as myfile:"
    with codecs.open(path, "r", encoding="utf-8", errors="ignore") as myfile:
        text = myfile.read()
    words = text.split(" ")
    stop_words = get_stop_words()
    for word in words:
        if word not in stop_words:
            text_words.append(word)
    return ' '.join(text_words)

def get_text_sequences(texts):
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    #print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return data, word_index

def get_data_encoded(path):
    vectors = []
    padded_vectors = []
    unknown_words = []
    labels = []
    all_paths = glob.glob(os.path.join(path,'*.txt'))
    np.random.shuffle(all_paths)
    for path in all_paths:
        # use codecs.open to ingnore non utf-8 encoded chars; instead of "with open(path, 'r') as myfile:"
        with codecs.open(path, "r", encoding='utf-8', errors='ignore') as myfile:
            text = myfile.read()
            word_sequences = text_to_word_sequence(text,
                                        filters='ا\n\ufeff\xa0!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t',
                                        #filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                        lower=True,
                                        split=" ")
            vector = []
            for word in word_sequences:
                stop_words = get_stop_words()
                if word not in stop_words:
                    try:
                        vector.append(w2v[word])
                    except KeyError:
                        # TODO: incrementally update w2v with OOVs (out-of-vocabularies)
                        #print("' " + str(word) + "' not in vocabulary")
                        unknown_words.append(word)
            vectors.append(vector)
            label = get_label(path)
            labels.append(label)
    padded_vectors = sequence.pad_sequences(vectors, maxlen=MAX_SEQUENCE_LENGTH)
    return np.array(padded_vectors), np.array(labels), unknown_words
    
def get_data(path):
    texts = []
    labels = []
    all_paths = glob.glob(os.path.join(path,'*.txt'))
    np.random.shuffle(all_paths)
    for path in all_paths: 
        texts.append(get_text(path))
        labels.append(get_label(path))
    sequences, word_index = get_text_sequences(texts)
    return sequences, np.array(labels), word_index

In [10]:

print(get_stop_words())

['،', 'ء', 'ءَ', 'آ', 'آب', 'آذار', 'آض', 'آل', 'آمينَ', 'آناء', 'آنفا', 'آه', 'آهاً', 'آهٍ', 'آهِ', 'أ', 'أبدا', 'أبريل', 'أبو', 'أبٌ', 'أجل', 'أجمع', 'أحد', 'أخبر', 'أخذ', 'أخو', 'أخٌ', 'أربع', 'أربعاء', 'أربعة', 'أربعمئة', 'أربعمائة', 'أرى', 'أسكن', 'أصبح', 'أصلا', 'أضحى', 'أطعم', 'أعطى', 'أعلم', 'أغسطس', 'أفريل', 'أفعل به', 'أفٍّ', 'أقبل', 'أكتوبر', 'أل', 'ألا', 'ألف', 'ألفى', 'أم', 'أما', 'أمام', 'أمامك', 'أمامكَ', 'أمد', 'أمس', 'أمسى', 'أمّا', 'أن', 'أنا', 'أنبأ', 'أنت', 'أنتم', 'أنتما', 'أنتن', 'أنتِ', 'أنشأ', 'أنه', 'أنًّ', 'أنّى', 'أهلا', 'أو', 'أوت', 'أوشك', 'أول', 'أولئك', 'أولاء', 'أولالك', 'أوّهْ', 'أى', 'أي', 'أيا', 'أيار', 'أيضا', 'أيلول', 'أين', 'أيّ', 'أيّان', 'أُفٍّ', 'ؤ', 'إحدى', 'إذ', 'إذا', 'إذاً', 'إذما', 'إذن', 'إزاء', 'إلى', 'إلي', 'إليكم', 'إليكما', 'إليكنّ', 'إليكَ', 'إلَيْكَ', 'إلّا', 'إمّا', 'إن', 'إنَّ', 'إى', 'إياك', 'إياكم', 'إياكما', 'إياكن', 'إيانا', 'إياه', 'إياها', 'إياهم', 'إياهما', 'إياهن', 'إياي', 'إيهٍ', 'ئ', 'ا', 'ا?', 'ا?ى', 'االا', 'االتى', 'اب

In [12]:
!unzip Twitter.zip

Archive:  Twitter.zip
  inflating: Twitter/Data set Description.txt  
   creating: Twitter/Negative/
  inflating: Twitter/Negative/negative1.txt  
  inflating: Twitter/Negative/negative10.txt  
 extracting: Twitter/Negative/negative100.txt  
  inflating: Twitter/Negative/negative1000.txt  
  inflating: Twitter/Negative/negative101.txt  
  inflating: Twitter/Negative/negative102.txt  
 extracting: Twitter/Negative/negative103.txt  
  inflating: Twitter/Negative/negative104.txt  
  inflating: Twitter/Negative/negative105.txt  
  inflating: Twitter/Negative/negative106.txt  
  inflating: Twitter/Negative/negative107.txt  
  inflating: Twitter/Negative/negative108.txt  
  inflating: Twitter/Negative/negative109.txt  
  inflating: Twitter/Negative/negative11.txt  
  inflating: Twitter/Negative/negative110.txt  
  inflating: Twitter/Negative/negative111.txt  
  inflating: Twitter/Negative/negative112.txt  
  inflating: Twitter/Negative/negative113.txt  
  inflating: Twitter/Negative/negative

In [13]:
word_index_train = {}
word_index_valid = {}
word_index_test = {}
X_train, y_train, word_index_train = get_data("/content/Twitter") #/content/Twitter
#X_train, y_train, word_index_train = get_data("/home/zmccoy/sandbox/sentiment-analysis-arabic/Erik/train")
print(X_train.shape)
print(y_train.shape)
print(len(word_index_train))
X_valid, y_valid, word_index_valid = get_data("/content/Twitter")
#X_valid, y_valid, word_index_valid = get_data("/home/zmccoy/sandbox/sentiment-analysis-arabic/Erik/valid")
print(X_valid.shape)
print(y_valid.shape)
print(len(word_index_valid))
#X_test, y_test, word_index_test = get_data("/home/zmccoy/sandbox/sentiment-analysis-arabic/Erik/test")
#print(X_test.shape)
#print(y_test.shape)
#print(len(word_index_test))
word_index = {}
for wi in (word_index_train, word_index_valid, word_index_test): word_index.update(wi)
print(len(word_index))

(1, 85)
(1, 2)
181
(1, 85)
(1, 2)
181
181


In [14]:
X_train_enc, y_train, unk_train = get_data_encoded("/content/Twitter")
#X_train_enc, y_train, unk_train = get_data_encoded("/home/zmccoy/sandbox/sentiment-analysis-arabic/Erik/train")
print(X_train_enc.shape)
print(y_train.shape)
print(len(unk_train))
X_valid_enc, y_valid, unk_valid = get_data_encoded("/content/Twitter")
#X_valid_enc, y_valid, unk_valid = get_data_encoded("/home/zmccoy/sandbox/sentiment-analysis-arabic/Erik/valid")
print(X_valid_enc.shape)
print(y_valid.shape)
print(len(unk_valid))
#X_test_enc, y_test, unk_test = get_data_encoded("/home/zmccoy/sandbox/sentiment-analysis-arabic/Twitter/Combined/test")
#X_test_enc, y_test, unk_test = get_data_encoded("/home/zmccoy/sandbox/sentiment-analysis-arabic/Erik/test")
#print(X_test_enc.shape)
#print(y_test.shape)
#print(len(unk_test))

(1, 85, 300)
(1, 2)
92
(1, 85, 300)
(1, 2)
92


In [15]:
def create_embedding_matrix(word_index):
    nb_words = min(MAX_NB_WORDS, len(word_index))+1
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, ii in word_index.items():
        if word in w2v.vocab:
            embedding_matrix[ii] = w2v.word_vec(word)
    return embedding_matrix

embedding_matrix = create_embedding_matrix(word_index)
print(embedding_matrix.shape)

(182, 300)


In [16]:
embedding_layer = w2v.get_keras_embedding(train_embeddings=False)
print(embedding_layer.input_dim)
print(embedding_layer.output_dim)
#embedding_layer.set_weights(embedding_matrix)
#embedding_layer.get_weights()

610977
300


In [17]:
from keras.callbacks import ModelCheckpoint, EarlyStopping
def get_callbacks(weights_filepath):
    checkpoint = ModelCheckpoint(filepath=weights_filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    early_stopping = EarlyStopping(monitor='val_loss', patience=4)
    callbacks = [checkpoint, early_stopping]
    return callbacks

In [18]:
def get_model_pretrained_embedding(trainable=False):
    weights_filename = "weights/keras-lstm-pretrained-embedding.h5"
    if trainable is True:
        weights_filename = "weights/keras-lstm-pretrained-embedding-trainable.h5"
    #embedding_layer = Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=trainable)
    embedding_layer = w2v.get_keras_embedding(train_embeddings=trainable)
    return get_model_embedding(embedding_layer), weights_filename
    
def get_model_from_scratch_embedding():
    embedding_layer = Embedding(len(word_index)+1, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)
    return get_model_embedding(embedding_layer), "weights/keras-lstm-from-scratch-embedding.h5"
    
def get_model_embedding(embedding_layer):
    sentence_indices = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32')
    embeddings = embedding_layer(sentence_indices)   
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Returned output should be a batch of sequences.
    X = LSTM(128, return_sequences=True)(embeddings)
    X = Dropout(0.5)(X)
    # Returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(128)(X)
    X = Dropout(0.5)(X)
    X = Dense(2)(X)
    X = Activation('softmax')(X)
    model = Model(inputs=sentence_indices,outputs=X)
    return model  

def get_model_no_embedding():
    model = Sequential()
    model.add(LSTM(128, return_sequences=True, input_shape=(MAX_SEQUENCE_LENGTH,EMBEDDING_DIM)))
    model.add(Dropout(0.5))
    model.add(LSTM(128))
    model.add(Dropout(0.5))
    model.add(Dense(2))
    model.add(Activation('softmax'))
    return model, "weights/keras-lstm-no-embedding.h5"

def get_model_pretrained_embedding_bidirectional():
    sentence_indices = Input(shape=(MAX_SEQUENCE_LENGTH,),dtype='int32')
    embedding_layer = Embedding(len(word_index)+1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=True)
    embeddings = embedding_layer(sentence_indices)   
    X = Bidirectional(LSTM(60, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embeddings)
    X = GlobalMaxPool1D()(X)
    X = Dropout(0.1)(X)
    X = Dense(50)(X)
    X = Activation("relu")(X)
    X = Dropout(0.1)(X)
    X = Dense(2)(X)
    X = Activation('softmax')(X)
    model = Model(inputs=sentence_indices,outputs=X)
    return model, "weights/keras-lstm-pretrained-embedding-bidirectional-trainable.h5"

In [19]:
#1
model_from_scratch, weights_from_scratch = get_model_from_scratch_embedding()
#2A
model_pt_frozen, weights_pt_frozen = get_model_pretrained_embedding()
#2B
model_pt_trainable, weights_pt_trainable = get_model_pretrained_embedding(trainable=True)
#3
model_no_embedding, weights_no_embedding = get_model_no_embedding()
# Bidirectional
model_bidirectional, weights_bidirectional = get_model_pretrained_embedding_bidirectional()
#model.load_weights(weights_filepath)
#model.summary()



In [20]:
model_from_scratch.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model_pt_frozen.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model_pt_trainable.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model_no_embedding.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model_bidirectional.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

In [None]:
batch_size = 32
epochs = 10
model_from_scratch.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_valid), callbacks=get_callbacks(weights_from_scratch))
model_pt_frozen.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_valid), callbacks=get_callbacks(weights_pt_frozen))
model_pt_trainable.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_valid), callbacks=get_callbacks(weights_pt_trainable)) 
history = model_no_embedding.fit(X_train_enc, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_valid_enc, y_valid), callbacks=get_callbacks(weights_no_embedding))
model_bidirectional.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_valid), callbacks=get_callbacks(weights_bidirectional))

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.66912, saving model to weights/keras-lstm-from-scratch-embedding.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.66912 to 0.63110, saving model to weights/keras-lstm-from-scratch-embedding.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.63110 to 0.58202, saving model to weights/keras-lstm-from-scratch-embedding.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.58202 to 0.51662, saving model to weights/keras-lstm-from-scratch-embedding.h5
Epoch 5/10

Epoch 00005: val_loss improved from 0.51662 to 0.42608, saving model to weights/keras-lstm-from-scratch-embedding.h5
Epoch 6/10

Epoch 00006: val_loss improved from 0.42608 to 0.29965, saving model to weights/keras-lstm-from-scratch-embedding.h5
Epoch 7/10

Epoch 00007: val_loss improved from 0.29965 to 0.14831, saving model to weights/keras-lstm-from-scratch-embedding.h5
Epoch 8/10

Epoch 00008: val_loss improved from 0.14831 to 0.03702, saving model to weights/keras-lst