In [1]:
import os
import re
import numpy as np

In [7]:
from nltk.corpus import stopwords
import string

def clean_text(doc):
    doc = doc.lower()
    tokens = doc.split()
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

In [8]:
dataset = './aclImdb'
train_dir = dataset + '/train'
test_dir = dataset + '/test'

#train data in arrays
labels, texts = [], []

for label_type in ['neg','pos']:
        files = os.listdir(train_dir + '/' + label_type)
        for file in files:
                file = open(train_dir + '/' + label_type + '/' + file, 'r', encoding="utf8")
                data = file.read()
                data = clean_text(data)
                texts.append(data)
                labels.append(label_type)
                file.close()

labels = [item.replace('pos', '1') for item in labels]
labels = [item.replace('neg', '0') for item in labels]
traindata, trainlabels = texts, labels
##########################

In [9]:
print (traindata[0])

['story', 'man', 'unnatural', 'feelings', 'pig', 'starts', 'opening', 'scene', 'terrific', 'example', 'absurd', 'comedy', 'formal', 'orchestra', 'audience', 'turned', 'insane', 'violent', 'mob', 'crazy', 'chantings', 'singers', 'unfortunately', 'stays', 'absurd', 'whole', 'time', 'general', 'narrative', 'eventually', 'making', 'putting', 'even', 'era', 'turned', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'third', 'grader', 'technical', 'level', 'better', 'might', 'think', 'good', 'cinematography', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'stars', 'sally', 'kirkland', 'frederic', 'forrest', 'seen', 'briefly']


In [10]:
#test data in arrays
labels, texts = [], []

for label_type in ['neg','pos']:
        files = os.listdir(test_dir + '/' + label_type)
        for file in files:
                file = open(test_dir + '/' + label_type + '/' + file, 'r', encoding="utf8")
                data = file.read()
                data = clean_text(data)
                texts.append(data)
                labels.append(label_type)
                file.close()

labels = [item.replace('pos', '1') for item in labels]
labels = [item.replace('neg', '0') for item in labels]   
testdata, testlabels = texts, labels
#########################

In [11]:
vocab = {}
counter = {}

for line in traindata:
    for word in line:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
            counter[word] = 1
        else:
            counter[word] = counter[word]+1

In [12]:
print (len(vocab))
print (len(counter))

117233
117233


In [13]:
min_occ = 2
tokens = [k for k,c in counter.items() if c >= min_occ]
print (len(tokens))

56185


In [14]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(traindata)
sequence = tokenizer.texts_to_sequences(traindata)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [46]:
max_len = max([len(line) for line in traindata])
x_train = pad_sequences(sequence, maxlen = max_len)
y_train = trainlabels

In [47]:
x_train = np.asarray(x_train)
x_train = x_train.astype(int)
y_train = np.asarray(y_train)
y_train = y_train.astype(int)
print(x_train)
print (y_train)

[[    0     0     0 ...  8089    33  3130]
 [    0     0     0 ...  1280 28213  4280]
 [    0     0     0 ...  2194   691  1712]
 ...
 [    0     0     0 ...   965   107  9442]
 [    0     0     0 ...  8727  3562    14]
 [    0     0     0 ...    10  2168    27]]
[0 0 0 ... 1 1 1]


In [48]:
vocab_size = len(tokenizer.word_index) + 1

from keras import models
from keras.layers import Conv1D, Embedding, MaxPooling1D, Flatten, Dense
#defining model
model = models.Sequential()
model.add(Embedding(vocab_size, 100, input_length = max_len))
model.add(Conv1D(filters = 32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print (model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1423, 100)         11723400  
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1416, 32)          25632     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 708, 32)           0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 22656)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 10)                226570    
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 11        
Total params: 11,975,613
Trainable params: 11,975,613
Non-trainable params: 0
________________________________________________________________

In [50]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=2, batch_size=100)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x2c88a4785f8>

In [51]:
sequence = tokenizer.texts_to_sequences(testdata)
x_test = pad_sequences(sequence, maxlen = max_len)
y_test = trainlabels
x_test = np.asarray(x_test)
x_test = x_test.astype(int)
y_test = np.asarray(y_test)
y_test = y_test.astype(int)
print(x_train)
print (y_train)

[[    0     0     0 ...  8089    33  3130]
 [    0     0     0 ...  1280 28213  4280]
 [    0     0     0 ...  2194   691  1712]
 ...
 [    0     0     0 ...   965   107  9442]
 [    0     0     0 ...  8727  3562    14]
 [    0     0     0 ...    10  2168    27]]
[0 0 0 ... 1 1 1]


In [52]:
print (model.evaluate(x_test, y_test, batch_size=100))

[0.3823783507943153, 0.8645199992656708]


In [98]:
stopwords = ["a", "about", "above", "after", "again", "against", "all", 
             "am", "an", "and", "any", "are", "as", "at", "be", "because", 
             "been", "before", "being", "below", "between", "both", "but", 
             "by", "could", "did", "do", "does", "doing", "down", "during", 
             "each", "few", "for", "from", "further", "had", "has", "have", 
             "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", 
             "hers", "herself", "him", "himself", "his", "how", "how's", "i", 
             "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", 
             "it's", "its", "itself", "let's", "me", "more", "most", "my", 
             "myself", "nor", "of", "on", "once", "only", "or", "other", 
             "ought", "our", "ours", "ourselves", "out", "over", "own", 
             "same", "she", "she'd", "she'll", "she's", "should", "so", 
             "some", "such", "than", "that", "that's", "the", "their", 
             "theirs", "them", "themselves", "then", "there", "there's", 
             "these", "they", "they'd", "they'll", "they're", "they've", 
             "this", "those", "through", "to", "too", "under", "until", 
             "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", 
             "were", "what", "what's", "when", "when's", "where", "where's", 
             "which", "while", "who", "who's", "whom", "why", "why's", "with", 
             "would", "you", "you'd", "you'll", "you're", "you've", "your", 
             "yours", "yourself", "yourselves", "a"]

# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}
def clean_text(text):
    dirty = text.copy()
    for word in dirty:
        if word in contractions:
            text(word) = contractions[word]
    dirty = text.copy()
    for word in dirty:
        if word in stopwords:
            text.remove(word)
    return text

SyntaxError: can't assign to function call (<ipython-input-98-18d5ec804e61>, line 103)

In [97]:
for line in traindata:
    line = clean_text(line)

TypeError: list indices must be integers or slices, not str

In [95]:
print (traindata[0])

['story', 'man', 'has', 'unnatural', 'feelings', 'pig', 'starts', 'opening', 'scene', 'a', 'terrific', 'example', 'absurd', 'comedy', 'a', 'formal', 'orchestra', 'audience', 'is', 'turned', 'an', 'insane', 'violent', 'mob', 'crazy', 'chantings', 'singers', 'unfortunately', 'stays', 'absurd', 'whole', 'time', 'no', 'general', 'narrative', 'eventually', 'making', 'just', 'off', 'putting', 'even', 'from', 'era', 'turned', 'off', 'the', 'cryptic', 'dialogue', 'make', 'shakespeare', 'seem', 'easy', 'a', 'third', 'grader', 'a', 'technical', 'level', "it's", 'better', 'you', 'might', 'think', 'with', 'some', 'good', 'cinematography', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'stars', 'sally', 'kirkland', 'frederic', 'forrest', 'can', 'be', 'seen', 'briefly']


In [80]:
print (len(stopwords))

153


In [33]:
vocab = {}
counter = {}

for line in traindata:
    for word in line:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
            counter[word] = 1
        else:
            counter[word] = counter[word]+1

In [35]:
print (len(vocab))
print (len(counter))

171319
171319


In [36]:
counter = sorted(counter.values())

In [37]:
counter = counter[-10000:]

print (len(counter))

10000


In [38]:
for line in traindata:
    for word in line:
        if word not in counter:
            line.remove(word)

In [40]:
print (traindata[0])

['man', 'has', 'feelings', 'starts', 'a', 'opening', 'that', 'a', 'example', 'a', 'orchestra', 'is', 'into', 'insane', 'mob', 'chantings', 'of', 'unfortunately', 'stays', 'absurd', 'time', 'no', 'narrative', 'making', 'just', 'off', 'even', 'from', 'the', 'era', 'be', 'off', 'the', 'cryptic', 'would', 'shakespeare', 'easy', 'a', 'grader', 'a', 'level', "it's", 'better', 'you', 'think', 'with', 'some', 'cinematography', 'future', 'vilmos', 'future', 'sally', 'and', 'forrest', 'be', 'briefly']


In [44]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_words = 10000
maxlen=100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(traindata)
sequences = tokenizer.texts_to_sequences(traindata)
word_index = tokenizer.word_index
traindata = pad_sequences(sequences, maxlen=100)

In [48]:
print (len(word_index))

114983


In [52]:

#embedding layer
embeddings_index = {}
f = open('../Downloads/glove.6B.100d.txt', encoding="utf8")
for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
f.close()

embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
        if i < max_words:
                embedding_vector = embeddings_index.get(word)
                embedding_matrix[i] = embedding_vector
######################

In [55]:

#make and train neural network
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print (model.summary())
model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])
model.fit(traindata, trainlabels, epochs=10, batch_size=32)
#####################

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________
None


AttributeError: 'str' object has no attribute 'ndim'

In [25]:
counter2 = counter.copy()

for word, value in counter.items():
    if counter[word] < 2:
        counter2.pop(word)
counter = counter2.copy()
print (len(counter2))

73144


In [26]:
for line in traindata:
    for word in line:
        if word not in counter:
            line.remove(word)

In [28]:
print (traindata[0])
print (len(traindata[0]))

['story', 'of', 'a', 'man', 'who', 'has', 'unnatural', 'feelings', 'for', 'a', 'pig', 'starts', 'out', 'with', 'a', 'opening', 'scene', 'that', 'is', 'a', 'terrific', 'example', 'of', 'absurd', 'comedy', 'a', 'formal', 'orchestra', 'audience', 'is', 'turned', 'into', 'an', 'insane', 'violent', 'mob', 'by', 'the', 'crazy', 'of', "it's", 'singers', 'unfortunately', 'it', 'stays', 'absurd', 'the', 'whole', 'time', 'with', 'no', 'general', 'narrative', 'eventually', 'making', 'it', 'just', 'too', 'off', 'putting', 'even', 'those', 'from', 'the', 'era', 'should', 'be', 'turned', 'off', 'the', 'cryptic', 'dialogue', 'would', 'make', 'shakespeare', 'seem', 'easy', 'to', 'a', 'third', 'grader', 'on', 'a', 'technical', 'level', "it's", 'better', 'than', 'you', 'might', 'think', 'with', 'some', 'good', 'cinematography', 'by', 'future', 'great', 'vilmos', 'zsigmond', 'future', 'stars', 'sally', 'kirkland', 'and', 'frederic', 'forrest', 'can', 'be', 'seen', 'briefly']
111


In [29]:
vocab = {}

for line in traindata:
    for word in line:
        if word not in vocab:
            vocab[word] = len(vocab) + 1
          

In [30]:
print (len(vocab))

78116
