In [1]:
import numpy
import pandas as pd
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Dropout, SimpleRNN
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
# fix random seed for reproducibility
numpy.random.seed(7)

Using TensorFlow backend.


## Tutorial Baseline and Modifications For IMDB data set

In [226]:
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
data = imdb.load_data(num_words=top_words)

In [228]:
X = data[0][0]
y = data[0][1]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.2, 
                                                    random_state=41)

In [229]:
# truncate and pad input sequences
max_review_length = 500
x_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)

In [208]:
# create the model
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(2*top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(x_train, y_train, epochs=10, batch_size=128)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 500, 32)           3104      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 250, 32)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total para

<keras.callbacks.History at 0x122a01518>

In [209]:
# Final evaluation of the model
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 87.34%


## NLTK Movie Reviews as Test Set and IMDB test
Here I decided to test the LSTM on another data set of movie reviews, to see how well the model generalises.

In [2]:
import nltk
from nltk.corpus import movie_reviews

In [3]:
documents = [(list(movie_reviews.words(fileid)), category)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]

import random
random.shuffle(documents)

In [4]:
#getting rid of stopwords
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted.update(list('!"#$%&\'()*+,-./:;<=>? @[\\]^_`{|}~£'))
unwanted.update(['also', 'even', '--'])
words = movie_reviews.words()
words = filter(lambda x: x not in unwanted, words)

#calculating frequency dictionary for all words
all_words = nltk.FreqDist(w.lower() for w in words)
most_frequent = all_words.most_common(5000)

In [6]:
#from tuples to dictionary
def tuple_to_dict(some_list):
    new_list = {}
    for pair in some_list:
        new_list[pair[0]] = pair[1]
    return new_list

vocabulary = list(tuple_to_dict(most_frequent).keys())

In [22]:
X_nltk = []
Y_nltk = []
for d in documents:
    Y_nltk.append(d[1])
    words_to_ints = []
    for word in d[0]:
        if word in vocabulary:
            words_to_ints.append(vocabulary.index(word))
        else:
            words_to_ints.append(0)
    X_nltk.append(words_to_ints)

In [9]:
lb = LabelEncoder()
Y_nltk = lb.fit_transform(Y_nltk)

In [11]:
X_nltk = sequence.pad_sequences(X_nltk, maxlen=500)

In [248]:
scores = model.evaluate(X_nltk, Y_nltk, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 50.85%


We can see that the model doesn't really generalise very well. What I understood after I've done all of this is that the frequencies of the words are different - even if the words are the same since they are calculated at different times - so unless I calculate frequencies of all the words collectively, it won't work as intended. So this is where I'm calculating the frequencies of IMDB and NLTK movie reviews collectively.

In [8]:
import glob
neg_files_train=glob.glob("./aclImdb/train/neg/*.txt")
neg_files_test=glob.glob("./aclImdb/test/neg/*.txt")
pos_files_train=glob.glob("./aclImdb/train/pos/*.txt")
pos_files_test=glob.glob("./aclImdb/test/pos/*.txt")

In [9]:
def open_and_read(some_list):
    docs = []
    for file in some_list:
        f = open(file, "r")
        text = f.readlines()
        docs.append(nltk.word_tokenize(text[0]))
        f.close()
    return docs

In [10]:
neg_docs_tr = open_and_read(neg_files_train)
neg_docs_test = open_and_read(neg_files_test)
pos_docs_tr = open_and_read(pos_files_train)
pos_docs_test = open_and_read(pos_files_test)

In [11]:
#calculating frequencies for the whole text
def iterate_and_append(lists):
    all_words = []
    for l in lists:
        for doc in l:
            for word in doc:
                all_words.append(word.lower())
    return all_words

In [16]:
all_text = iterate_and_append([neg_docs_tr, neg_docs_test, pos_docs_tr, pos_docs_test])

In [17]:
all_text += movie_reviews.words()

In [17]:
unwanted = set(nltk.corpus.stopwords.words("english"))
unwanted.update(list('!"#$%&\'()*+,-./:;<=>? @[\\]^_`{|}~£1234567890abcdefghijklmnopqrstuvwxyz'))
unwanted.update(['also', 'even', '--', 'film', 'director', 'character',"'ve", "'m", "'d","n't",'``',"''","'s","'re","'ll",'...','br','movie','one'])

all_text = filter(lambda x: x not in unwanted, all_text)

In [18]:
#calculating frequency dictionary for all words
words_from_everywhere = nltk.FreqDist(all_text)
most_frequent_words = words_from_everywhere.most_common(6000)

In [20]:
mfw = tuple_to_dict(most_frequent_words)
vocabulary = list(mfw.keys())

In [34]:
from tqdm import tqdm
def transform_words_into_ints(list_of_docs):
    x = []
    for doc in tqdm(list_of_docs):
        words_to_ints=[]
        for word in doc:
            if word in vocabulary:
                words_to_ints.append(vocabulary.index(word))
            else:
                words_to_ints.append(0)
        x.append(words_to_ints)
    return x

In [35]:
X_all_train = transform_words_into_ints(neg_docs_tr) + transform_words_into_ints(pos_docs_tr)
Y_all_train = numpy.concatenate((numpy.zeros((12500,), dtype=numpy.int), numpy.ones((12500,), dtype=numpy.int)))

100%|██████████| 12500/12500 [07:41<00:00, 27.08it/s]
100%|██████████| 12500/12500 [07:52<00:00, 26.45it/s]


In [36]:
X_all_test = transform_words_into_ints(neg_docs_test) + transform_words_into_ints(pos_docs_test)
Y_all_test = numpy.concatenate((numpy.zeros((12500,), dtype=numpy.int), numpy.ones((12500,), dtype=numpy.int)))

100%|██████████| 12500/12500 [07:06<00:00, 22.21it/s]
100%|██████████| 12500/12500 [06:52<00:00, 30.34it/s]


In [37]:
nltk_test = []
for doc in tqdm(documents):
    word_to_ints=[]
    for word in doc:
        if word in vocabulary:
            word_to_ints.append(vocabulary.index(word))
        else:
            word_to_ints.append(0)
    nltk_test.append(word_to_ints)

100%|██████████| 2000/2000 [00:01<00:00, 1678.72it/s]


In [38]:
X_all_train += nltk_test[:1000]
Y_train_all = numpy.append(Y_all_train, Y_nltk[:1000])
Y_test_all = numpy.append(Y_all_test, Y_nltk[1000:])
X_all_test += nltk_test[1000:]

In [39]:
X_all_train = sequence.pad_sequences(X_all_train, maxlen=300)
X_all_test = sequence.pad_sequences(X_all_test, maxlen=300)

I'm going to use the same network architecture from above for this task - except that this model will be called __model_nltk_plus__.

In [160]:
# create the model
embedding_vector_length = 64
model_nltk_plus = Sequential()
model_nltk_plus.add(Embedding(6000, embedding_vector_length, input_length=300))
model_nltk_plus.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_nltk_plus.add(MaxPooling1D(pool_size=6))
model_nltk_plus.add(Dropout(0.2))
model_nltk_plus.add(LSTM(100))
model_nltk_plus.add(Dropout(0.2))
model_nltk_plus.add(Dense(1, activation='sigmoid'))
model_nltk_plus.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_nltk_plus.summary())
model_nltk_plus.fit(X_all_train, Y_train_all, epochs=6, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 300, 64)           384000    
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 300, 32)           6176      
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 50, 32)            0         
_________________________________________________________________
dropout_21 (Dropout)         (None, 50, 32)            0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dropout_22 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 101       
Total para

<keras.callbacks.History at 0x1c49d10b38>

In [161]:
# Final evaluation of the model
scores = model_nltk_plus.evaluate(X_all_test, Y_test_all, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 84.22%


## LSTM Classification For Tweets in Russian
The tweets are taken from here: http://study.mokoron.com/.
The tweets there were classified with the help of an automatic system into positive and negative ones. I'm going to use them to see how LSTM will classify them.

In [23]:
X_pos = pd.read_csv("./twitter_russian/positive.csv",sep=';')
X_neg = pd.read_csv("./twitter_russian/negative.csv",sep=';')

In [24]:
X_neg.head()

Unnamed: 0,Id,Tdate,Tname,Ttext,Ttype,Trep,Tfav,Tstcount,Tfol,Tfrlen,ListCount,na
0,408906762813579264,1386325944,dugarchikbellko,на работе был полный пиддес :| и так каждое за...,-1,0,0,0,8064,111,94,2
1,408906818262687744,1386325957,nugemycejela,"Коллеги сидят рубятся в Urban terror, а я из-з...",-1,0,0,0,26,42,39,0
2,408906858515398656,1386325966,4post21,@elina_4post как говорят обещаного три года жд...,-1,0,0,0,718,49,249,0
3,408906914437685248,1386325980,Poliwake,"Желаю хорошего полёта и удачной посадки,я буду...",-1,0,0,0,10628,207,200,0
4,408906914723295232,1386325980,capyvixowe,"Обновил за каким-то лешим surf, теперь не рабо...",-1,0,0,0,35,17,34,0


In [25]:
data = pd.concat([X_pos,X_neg])
X_rus = data['Ttext']
y_rus = data['Ttype']
y_rus[y_rus==-1] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [26]:
unwanted_rus = set(nltk.corpus.stopwords.words("russian"))
unwanted_rus.update(list('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~£1234567890tяуто'))
unwanted_rus.update(["ff", "ru", '--', "to", '``',"''","...","co", "rt","http",'lt',u'я',u'и',u'то',u'нибудь','…'])

all_words = []
X_rus_tokenised = []
for tweet in X_rus:
    words = nltk.word_tokenize(tweet)
    cur_words = [w.lower() for w in words]
    cur_words = list(filter(lambda x: x not in unwanted_rus, cur_words))
    X_rus_tokenised.append(cur_words)
    all_words += cur_words

In [37]:
freq_words = nltk.FreqDist(all_words)
most_com = list(tuple_to_dict(freq_words.most_common(10000)).keys())

In [29]:
encoded_rus = []
for tweet in X_rus_tokenised:
    words_to_ints = []
    for word in tweet:
        if word in most_com:
            words_to_ints.append(most_com.index(word))
        else:
            words_to_ints.append(0)
    encoded_rus.append(words_to_ints)

In [30]:
X_rus_train, X_rus_test, y_rus_train, y_rus_test = train_test_split(encoded_rus, y_rus, test_size = 0.2)

In [31]:
X_rus_train = sequence.pad_sequences(X_rus_train, maxlen=50)
X_rus_test = sequence.pad_sequences(X_rus_test, maxlen=50)

In [63]:
# create the model
embedding_vector_length = 32
model_rus = Sequential()
model_rus.add(Embedding(6000, embedding_vector_length, input_length=50))
model_rus.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model_rus.add(MaxPooling1D(pool_size=5))
#model_rus.add(Dropout(0.2))
model_rus.add(LSTM(100, unit_forget_bias=True))
model_rus.add(Dropout(0.2))
model_rus.add(Dense(1, activation='sigmoid'))
model_rus.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_rus.summary())
model_rus.fit(X_rus_train, y_rus_train, validation_data=(X_rus_test[:20000], y_rus_test[:20000]), epochs=3, batch_size=128)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 32)            192000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 50, 32)            3104      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 10, 32)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 248,405
Trainable params: 248,405
Non-trainable params: 0
_________________________________________________________________
None

<keras.callbacks.History at 0x1c4d12ee48>

In [64]:
scores = model_rus.evaluate(X_rus_test, y_rus_test)
print("\nAccuracy: %.2f%%" % (scores[1]*100))

Accuracy: 74.74%


### Encoding Characters Instead of Words

In [38]:
# create mapping of unique chars to integers
chars = sorted(list(set(" ".join(most_com))))
char_to_int = dict((c, i) for i, c in enumerate(chars))

In [None]:
char_encoded = []
for tweet in X_rus_tokenised:
    encoded_tweet = []
    for word in tweet:
        if word in most_com:
            encoded_tweet += [char_to_int[l] for l in word]
        else:
            encoded_tweet += [0 for l in word]
    char_encoded.append(encoded_tweet)

In [53]:
char_encoded = sequence.pad_sequences(char_encoded, maxlen=140)

In [54]:
x_char_train, x_char_test, y_char_train, y_char_test = train_test_split(char_encoded, y_rus, test_size=0.2)

In [None]:
#create the model
embedding_vector_length = 16
model_rus = Sequential()
model_rus.add(Embedding(100, embedding_vector_length, input_length=140))
model_rus.add(Conv1D(filters=16, kernel_size=3, padding='same', activation='relu'))
model_rus.add(MaxPooling1D(pool_size=5))
#model_rus.add(Dropout(0.2))
model_rus.add(LSTM(80, unit_forget_bias=True))
model_rus.add(Dropout(0.2))
model_rus.add(Dense(1, activation='sigmoid'))
model_rus.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_rus.summary())
model_rus.fit(x_char_train, y_char_train, validation_data=(x_char_test[:10000], y_char_test[:10000]), epochs=3, batch_size=128)

In [62]:
scores = model_rus.evaluate(x_char_test, y_char_test)
print("\nAccuracy: %.2f%%" % (scores[1]*100))


Accuracy: 69.68%
