In [229]:
import numpy as np
from keras.datasets import imdb
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from nltk.tokenize import RegexpTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import json

In [230]:
def preprocess(X):
    tokenizer = RegexpTokenizer(r'\w+')
    X = [tokenizer.tokenize(x) for x in X]
    stop_words = set(stopwords.words('english'))
    X_stop=[]
    for words in X:
        words = [w for w in words if not w in stop_words]
        X_stop.append(words)
    porter = PorterStemmer()
    X_stem=[]
    for words in X_stop:
        stemmed = [porter.stem(word) for word in words]
        X_stem.append(stemmed)
    return X_stem

In [231]:
import csv
X_train = []
X_test = []
Y_train = []
with open('train.csv') as f:
    read = csv.reader(f)
    next(read, None)
    for (item,label,sentence) in read:
        X_train.append(sentence)
        Y_train.append(label)
with open('test.csv') as f:
    read = csv.reader(f)
    next(read, None)
    for (item,sentence) in read:
        X_test.append(sentence)

In [232]:
X_train = preprocess(X_train)
X_test = preprocess(X_test)
Y_train = list(map(int, Y_train))

In [174]:
result = [len(x) for x in X_train_pad]
print(sum(result)/len(result))
print(max(result))

100.0
100


In [239]:
from gensim.models import Word2Vec
model = Word2Vec(X_train+X_test, min_count=1)

In [109]:
weights = model.wv.syn0
np.save(open('embeddings', 'wb'), weights)
vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
with open('vocab', 'w') as f:
    f.write(json.dumps(vocab))


  """Entry point for launching an IPython kernel.


In [166]:
with open('vocab', 'r') as f:
    data = json.loads(f.read())
word2idx = data
idx2word = dict([(v, k) for k, v in data.items()])

In [167]:
weights = np.load(open('embeddings', 'rb'))

In [206]:
weights.shape[1]

100

In [259]:
max_words = 10

In [256]:
X_train_idx = [[word2idx[text] if word2idx[text] < 5000 else 0 for text in x] for x in X_train]
X_test_idx = [[word2idx[text] if word2idx[text] < 5000 else 0 for text in x] for x in X_test]

In [257]:
result = [len(x) for x in X_train_idx]
print(sum(result)/len(result))
print(max(result))

8.737501125123764
110


In [260]:
X_train_pad = sequence.pad_sequences(X_train_idx, maxlen=max_words)
X_test_pad = sequence.pad_sequences(X_test_idx, maxlen=max_words)

In [267]:
layer = Embedding(input_dim=5000,
                      output_dim=weights.shape[1],
                      weights=[weights[0:5000]], input_length=10)
model = Sequential()
model.add(layer)
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 10, 100)           500000    
_________________________________________________________________
flatten_6 (Flatten)          (None, 1000)              0         
_________________________________________________________________
dense_19 (Dense)             (None, 250)               250250    
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 251       
Total params: 750,501
Trainable params: 750,501
Non-trainable params: 0
_________________________________________________________________
None


In [268]:
X_final_train = X_train_pad[0:-20000]
Y_final_train = Y_train[0:-20000]
X_val = X_train_pad[-20000:]
Y_val = Y_train[-20000:]

In [269]:
model.fit(X_final_train, Y_final_train, validation_data=(X_val, Y_val), epochs=50, batch_size=128, verbose=2)
# Final evaluation of the model
scores = model.evaluate(X_val, Y_val, verbose=2)
print("Accuracy: %.2f%%" % (scores[1]*100))

Train on 79989 samples, validate on 20000 samples
Epoch 1/50
 - 10s - loss: 0.5742 - acc: 0.6965 - val_loss: 0.5407 - val_acc: 0.7245
Epoch 2/50
 - 10s - loss: 0.5145 - acc: 0.7397 - val_loss: 0.5253 - val_acc: 0.7363
Epoch 3/50
 - 10s - loss: 0.4821 - acc: 0.7632 - val_loss: 0.5326 - val_acc: 0.7297
Epoch 4/50
 - 10s - loss: 0.4489 - acc: 0.7849 - val_loss: 0.5402 - val_acc: 0.7342
Epoch 5/50
 - 10s - loss: 0.4114 - acc: 0.8069 - val_loss: 0.5941 - val_acc: 0.7029
Epoch 6/50
 - 10s - loss: 0.3672 - acc: 0.8319 - val_loss: 0.6047 - val_acc: 0.7280
Epoch 7/50
 - 10s - loss: 0.3148 - acc: 0.8603 - val_loss: 0.6742 - val_acc: 0.7241
Epoch 8/50
 - 10s - loss: 0.2674 - acc: 0.8838 - val_loss: 0.7767 - val_acc: 0.6960
Epoch 9/50
 - 10s - loss: 0.2268 - acc: 0.9025 - val_loss: 0.8230 - val_acc: 0.7119
Epoch 10/50
 - 10s - loss: 0.1978 - acc: 0.9159 - val_loss: 0.9086 - val_acc: 0.7046
Epoch 11/50
 - 10s - loss: 0.1720 - acc: 0.9279 - val_loss: 1.0066 - val_acc: 0.7075
Epoch 12/50
 - 10s - los

{'sad': 67,
 'apl': 32842,
 'friend': 92,
 'I': 0,
 'miss': 21,
 'new': 52,
 'moon': 1088,
 'trailer': 1545,
 'omg': 163,
 'alreadi': 142,
 '7': 492,
 '30': 481,
 'O': 466,
 'omgaga': 57910,
 'Im': 262,
 'sooo': 462,
 'im': 38,
 'gunna': 1803,
 'cri': 279,
 'dentist': 2272,
 'sinc': 215,
 '11': 857,
 'supos': 12463,
 '2': 43,
 'get': 2,
 'crown': 4851,
 'put': 186,
 '30min': 8953,
 'think': 17,
 'mi': 2173,
 'bf': 1784,
 'cheat': 2174,
 't_t': 6458,
 'worri': 293,
 'much': 39,
 'juuuuuuuuuuuuuuuuussssst': 57911,
 'chillin': 1940,
 'sunni': 717,
 'again': 1458,
 'work': 14,
 'tomorrow': 100,
 'TV': 862,
 'tonight': 130,
 'hand': 450,
 'uniform': 5634,
 'today': 44,
 'hmmmm': 2713,
 'wonder': 225,
 'number': 574,
 'must': 199,
 'posit': 940,
 'thank': 4,
 'hater': 2082,
 'face': 364,
 'day': 11,
 '112': 18296,
 '102': 7280,
 'weekend': 151,
 'suck': 110,
 'far': 261,
 'jb': 3325,
 'isnt': 986,
 'show': 73,
 'australia': 945,
 'ok': 122,
 'that': 29,
 'win': 283,
 'lt': 87,
 'thi': 320,
 

In [240]:
model

<gensim.models.word2vec.Word2Vec at 0x3c9211d0>

In [241]:
X = X_train+X_test

In [247]:
fdist = nltk.FreqDist(np.hstack(X))

In [255]:
word2idx['apl']

32842