In [22]:
import re
import numpy as np
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [23]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ub/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [28]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ub/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
## loading data
def load_data(fname):
    data_set = {
        'train': 'train.tsv',
        'test': 'test.tsv'}
    if fname not in data_set:
        raise Exception('wrong fname')
    data_df = pd.read_csv(data_set[fname], sep='\t')
    return data_df

In [5]:
train_df = load_data('train')

In [9]:
test_df = load_data('test')

In [11]:
train_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [12]:
train_reviews = train_df.Phrase.values

In [17]:
print(train_reviews.shape)

(156060,)


In [18]:
print(train_reviews[:5])

['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'
 'A series of escapades demonstrating the adage that what is good for the goose'
 'A series' 'A' 'series']


In [24]:
stopwords_set = set(stopwords.words('english'))

In [35]:
def process_reviews(reviews):
    processed_reviews = []
    for sent in reviews:
        words = word_tokenize(sent)
        words = [w for w in words if w not in stopwords_set]
        processed_reviews.append(words)
    return processed_reviews

In [36]:
train_processed_reviews = process_reviews(train_reviews)

In [38]:
print(train_processed_reviews[:5])

[['A', 'series', 'escapades', 'demonstrating', 'adage', 'good', 'goose', 'also', 'good', 'gander', ',', 'occasionally', 'amuses', 'none', 'amounts', 'much', 'story', '.'], ['A', 'series', 'escapades', 'demonstrating', 'adage', 'good', 'goose'], ['A', 'series'], ['A'], ['series']]


In [39]:
test_df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [41]:
test_reviews = test_df.Phrase.values

In [42]:
test_processed_reviews = process_reviews(test_reviews)

In [43]:
print(test_processed_reviews[:5])

[['An', 'intermittently', 'pleasing', 'mostly', 'routine', 'effort', '.'], ['An', 'intermittently', 'pleasing', 'mostly', 'routine', 'effort'], ['An'], ['intermittently', 'pleasing', 'mostly', 'routine', 'effort'], ['intermittently', 'pleasing', 'mostly', 'routine']]


In [47]:
print('training data\'s len: ', len(train_processed_reviews))

training data's len:  156060


In [48]:
print('testing data\'s len: ', len(test_processed_reviews))

testing data's len:  66292


In [50]:
print('all data\'s len: ',len(train_processed_reviews + test_processed_reviews))

all data's len:  222352


In [54]:
processed_reviews = train_processed_reviews + test_processed_reviews

In [62]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [64]:
max_sent_len = 0
for sent in processed_reviews:
    if len(sent) > max_sent_len:
        max_sent_len = len(sent)

In [65]:
print(max_sent_len)

39


In [79]:
from gensim.corpora import Dictionary

In [80]:
dic = Dictionary(processed_reviews)

In [83]:
dic_size = len(dic.keys())

In [84]:
print(dic_size)

21516


In [85]:
train_word_ids = []

In [86]:
for sent in train_processed_reviews:
    word_ids = [dic.token2id[word] for word in sent]
    train_word_ids.append(word_ids)

In [89]:
print(train_word_ids[:5])

[[2, 15, 8, 7, 3, 10, 11, 4, 10, 9, 0, 14, 6, 13, 5, 12, 16, 1], [2, 15, 8, 7, 3, 10, 11], [2, 15], [2], [15]]


In [87]:
test_word_ids = []

In [88]:
for sent in test_processed_reviews:
    word_ids = [dic.token2id[word] for word in sent]
    test_word_ids.append(word_ids)

In [90]:
print(test_word_ids[:5])

[[618, 8150, 7286, 1010, 1539, 2080, 1], [618, 8150, 7286, 1010, 1539, 2080], [618], [8150, 7286, 1010, 1539, 2080], [8150, 7286, 1010, 1539]]


In [91]:
train_paded = pad_sequences(train_word_ids)

In [92]:
print(train_paded[:5])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2 15  8
   7  3 10 11  4 10  9  0 14  6 13  5 12 16  1]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  2 15  8  7  3 10 11]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  2 15]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0 15]]


In [93]:
test_paded = pad_sequences(test_word_ids)

In [94]:
print(test_paded)

[[   0    0    0 ... 1539 2080    1]
 [   0    0    0 ... 1010 1539 2080]
 [   0    0    0 ...    0    0  618]
 ...
 [   0    0    0 ...    2 9180    0]
 [   0    0    0 ...    0    2 9180]
 [   0    0    0 ...    0 1548 4874]]


In [95]:
from keras.utils import np_utils

In [96]:
train_y = np_utils.to_categorical(train_df.Sentiment, num_classes=5)

In [97]:
print(train_y[:5])

[[0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [98]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, GRU

In [99]:
lstm_model = Sequential()
lstm_model.add(Embedding(dic_size, 128))
lstm_model.add(LSTM(128, dropout=0.2))
lstm_model.add(Dense(5))
lstm_model.add(Dropout(0.5))
lstm_model.add(Activation('softmax'))

In [103]:
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', 
                   metrics=['accuracy'])
lstm_model.fit(train_paded, train_y, batch_size=256, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0e9a2b9b70>

In [106]:
test_pred = lstm_model.predict_classes(test_paded)



In [114]:
test_df['Sentiment'] = test_pred.reshape(-1, 1)

In [115]:
print(test_df.head())

   PhraseId  SentenceId                                             Phrase  \
0    156061        8545  An intermittently pleasing but mostly routine ...   
1    156062        8545  An intermittently pleasing but mostly routine ...   
2    156063        8545                                                 An   
3    156064        8545  intermittently pleasing but mostly routine effort   
4    156065        8545         intermittently pleasing but mostly routine   

   Sentiment  
0          3  
1          3  
2          2  
3          3  
4          3  


In [116]:
# public score: 0.62374
test_df.to_csv('lstm-clf.csv', columns=['PhraseId', 'Sentiment'], 
               index=False, header=True)

In [82]:
from gensim.models import KeyedVectors

In [125]:
mv = KeyedVectors.load_word2vec_format('word2vec.6B.100d.txt', binary=False)

In [127]:
mv.most_similar('man')

[('woman', 0.8323494791984558),
 ('boy', 0.7914870977401733),
 ('one', 0.7788748741149902),
 ('person', 0.7526815533638),
 ('another', 0.7522236108779907),
 ('old', 0.7409116625785828),
 ('life', 0.7371696829795837),
 ('father', 0.7370322346687317),
 ('turned', 0.7347695231437683),
 ('who', 0.7345511317253113)]