In [1]:
from tqdm import tqdm
import os

In [2]:
# Read every training review from disk into two parallel lists:
# texts[i] holds the raw review text and labels[i] its class
# (0 for the 'neg' folder, 1 for the 'pos' folder).
data = "../input/keras-imdb/aclImdb_v1/aclImdb"
train = os.path.join(data,'train')
labels = []
texts = []
for label_type in ['neg','pos']:
    dir_n = os.path.join(train,label_type)
    print('Loading ',label_type)
    for fname in tqdm(os.listdir(dir_n)):
        # Only the .txt files are reviews; skip anything else in the folder.
        if fname.endswith(".txt"):
            # The context manager closes the file even if read() raises, and the
            # explicit encoding keeps decoding independent of the platform default
            # (the review files are expected to be UTF-8 text).
            with open(os.path.join(dir_n,fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)

Loading  neg


100%|██████████| 12500/12500 [00:44<00:00, 278.57it/s]


Loading  pos


100%|██████████| 12500/12500 [00:42<00:00, 291.26it/s]


In [3]:
# Sanity check: there must be exactly one label per review.
(len(labels), len(texts))

(25000, 25000)

In [4]:
import numpy as np

# Fraction of reviews labelled 1 (positive); 0.5 means the classes are balanced.
np.asarray(labels).mean()

0.5

In [5]:
# Spot-check one loaded review: print its label, then its raw text.
print('Label: ',labels[22003])
print(texts[22003])

Label:  1
The Clouded Yellow is a compact psychological thriller with interesting characterizations. Barry Jones and Kenneth More are both terrific in supporting roles in characters that both have more to them than what meets the eye. Jean Simmons is quite good, and Trevor Howard makes a fascinatingly offbeat suspense hero.


In [6]:
# Spot-check a second review (index 5): print its label, then its raw text.
print('Label: ',labels[5])
print(texts[5])

Label:  0
Ritchie's first two films were snappy, stylish entertainment. Here, he raids two recent classics  'The Usual Suspects' and 'Fight Club'  and still comes out empty-handed.<br /><br />Despite parading itself as a con-mystery (with the sub-'Usual Suspects' twaddle "the greatest con he ever pulled was convincing you that he was you" or whatever it was...) and attempting a 'Fight-Club' twist about which characters are real and which are internal manifestations, the film struggles to maintain interest in its second half. By the last third, you know you're being lead down a blind alley, and tediously slowly at that.<br /><br />Cons, chess and game theory are all great subjects, but Ritchie delves into them too superficially and too repetitively to make much use of the material.<br /><br />The only thing that keeps the movie (almost) watchable is Ritchie's bold way with with a scene and Maurice-Jones's dynamic camera. If Ritchie stuck to a more satisfying plot, and succumbed to tig

In [7]:
from keras.preprocessing.text import Tokenizer

# Vocabulary cap handed to the tokenizer as num_words.
words = 10000

# Build the word index from all reviews, then turn every review into a
# list of integer word indices.
tokenizer = Tokenizer(num_words=words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [8]:
# word -> integer index mapping learned by the tokenizer.
word_idx = tokenizer.word_index

# Show the index assigned to a few example words.
for caption, key in (
    ('Token for "The"', 'the'),
    ('Token for "Movie"', 'movie'),
    ('Token for "Generator"', 'generator'),
):
    print(caption, word_idx[key])

Token for "The" 1
Token for "Movie" 17
Token for "Generator" 19947


In [9]:
# First ten token indices of the review at position 22003.
sequences[22003][:10]

[1, 4179, 6, 3, 1983, 707, 16, 218, 7461, 3508]

In [10]:
from keras.preprocessing.sequence import pad_sequences

# Length every review is brought to before it is fed to the network.
max_length = 100

# Convert the list of token sequences into one 2-D integer array with
# max_length columns (one row per review).
data = pad_sequences(sequences, maxlen=max_length)
print(data.shape)

(25000, 100)


In [11]:
# Shuffle reviews and labels with one shared permutation, then cut the
# result into a training part and a validation part.
labels = np.asarray(labels)

# A dedicated, seeded generator makes the split identical on every
# Restart & Run All, so validation numbers are comparable between runs.
split_rng = np.random.RandomState(42)
indices = split_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

training_samples = 20000
validation_samples = 5000

# First `training_samples` rows train the model; the next
# `validation_samples` rows are held out for validation.
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

In [12]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Size of each learned word vector in this baseline model.
embedding_dim = 50

# Baseline classifier: embeddings learned from scratch, flattened and fed
# through one hidden layer into a single sigmoid output.
model = Sequential([
    Embedding(words, embedding_dim, input_length=max_length),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           500000    
_________________________________________________________________
flatten (Flatten)            (None, 5000)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                160032    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 660,065
Trainable params: 660,065
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Binary classification setup: cross-entropy loss, accuracy as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [14]:
# Train the baseline model and keep the per-epoch history for inspection.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Parse the GloVe text file into a dict mapping word -> float32 vector.
glove = '../input/glove-global-vectors-for-word-representation'
print('Loading word vectors')
embeddings_idx = {}
# The context manager closes the file even if parsing fails part-way, and
# the explicit encoding avoids depending on the platform default (the file
# contains non-ASCII tokens).
with open(os.path.join(glove, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in tqdm(f):
        # Each line is "<word> <v1> <v2> ...": first field is the word,
        # the remaining fields are the vector components.
        values = line.split()
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_idx[word] = embedding
print('Found %s word vectors.' % len(embeddings_idx))

1689it [00:00, 16887.12it/s]

Loading word vectors


400000it [00:18, 21634.55it/s]

Found 400000 word vectors.





In [16]:
# Global mean and standard deviation over every GloVe component; these
# parameterise the random vectors drawn for the embedding matrix below.
# np.stack needs a real sequence: a dict view triggers a deprecation
# warning on older NumPy and an error on newer releases, hence list().
all_embeddings = np.stack(list(embeddings_idx.values()))
emb_mean = all_embeddings.mean()
emb_std = all_embeddings.std()
emb_mean,emb_std

  if (await self.run_code(code, result,  async_=asy)):


(0.004451992, 0.4081574)

In [17]:
# Build the weight matrix for the pretrained Embedding layer: row i holds
# the GloVe vector of the word whose tokenizer index is i.
embedding_dim = 100
# The Embedding layer that consumes this matrix is created with `words`
# rows, so the matrix must have exactly that many rows. Sizing it with
# min(words, len(word_idx)) would produce too few rows (and allow an
# out-of-range index below) whenever the vocabulary is smaller than `words`.
nb_words = words
# Start from random vectors with the same overall mean/std as GloVe; rows
# for words that GloVe does not contain keep this random initialisation.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_dim))
for word, i in word_idx.items():
    # Skip words whose index falls outside the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_idx.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [18]:
# Same architecture as the baseline, but the embedding layer is initialised
# from embedding_matrix and frozen so training only updates the dense layers.
model = Sequential([
    Embedding(
        words,
        embedding_dim,
        input_length=max_length,
        weights=[embedding_matrix],
        trainable=False,
    ),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 320,065
Non-trainable params: 1,000,000
_________________________________________________________________


In [19]:
# Compile the pretrained-embedding model with the same loss and metric
# as the baseline.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [20]:
# Train the pretrained-embedding model; `history` now refers to this run.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_val, y_val),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# Score a hand-written, positive-sounding text with the trained model.
text = 'It is a really nice day. The weather is calm and cool. Something really great happened today! India won the cricket match!'

# Tokenise with the fitted tokenizer (a one-element batch).
seq_gen = tokenizer.texts_to_sequences([text])
print('raw seq:', seq_gen)

# Bring the sequence to the input length the model was built with.
seq_gen = pad_sequences(seq_gen, maxlen=max_length)
print('padded seq:', seq_gen)

# Sigmoid output of the model for this text.
prediction = model.predict(seq_gen)
print('positivity:', prediction)

raw seq: [[9, 6, 3, 63, 324, 248, 1, 5836, 6, 4876, 2, 643, 139, 63, 84, 571, 635, 2858, 1196, 1, 1011]]
padded seq: [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    9    6    3   63  324
   248    1 5836    6 4876    2  643  139   63   84  571  635 2858 1196
     1 1011]]
positivity: [[0.92374164]]


In [22]:
# Score a hand-written, negative-sounding text with the trained model.
text = 'It is a really unpleasant day. It is freezing cold out there. A lot of anti-social and criminal activities occured in the country today!!'

# Tokenise with the fitted tokenizer (a one-element batch).
seq_gen = tokenizer.texts_to_sequences([text])
print('raw seq:', seq_gen)

# Bring the sequence to the input length the model was built with.
seq_gen = pad_sequences(seq_gen, maxlen=max_length)
print('padded seq:', seq_gen)

# Sigmoid output of the model for this text.
prediction = model.predict(seq_gen)
print('positivity:', prediction)

raw seq: [[9, 6, 3, 63, 3986, 248, 9, 6, 1040, 43, 47, 3, 173, 4, 1207, 1030, 2, 1672, 4945, 8, 1, 701, 635]]
padded seq: [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    9    6    3   63 3986  248    9
     6 1040   43   47    3  173    4 1207 1030    2 1672 4945    8    1
   701  635]]
positivity: [[0.44843894]]
