In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import os

In [3]:
import os
import random

input_dir = './scrapped pages'
content = []

def read_pages (input_dir):
    dir_name = os.path.join(input_dir)
    
    for fname in os.listdir(dir_name):
        if (fname[-4:]=='.txt'):
            f = open (os.path.join(dir_name, fname), encoding='utf-8', mode='r')
            c = f.read()
            content.append(c)
            f.close()
                
#read positive & negative reviews
read_pages(input_dir)

print ('Number of web pages read is %d'%(len(content)))

Number of web pages read is 152


# Save generated merged content in new file

In [4]:
def save_content(filename, content):
    
    file = open(filename,'w',encoding='utf-8')
    lines = '\n'.join(content)
    file.write(lines)
    file.close()
    
save_content('merged-content.txt', content)

# Read the Merged file

In [5]:
def read_file(file):
    f = open(file, 'r', encoding='utf-8')
    lines = f.readlines()
    data = '\t'.join([line.strip() for line in lines])
    return data

data = read_file('merged-content.txt')

# Clean the Read file

In [84]:
import nltk
nltk.download('stopwords')

import re
import string
from nltk.corpus import stopwords
import pandas as pd

def clean_doc(doc):
    
    #replace -- with a space
    doc = doc.replace('-',' ')
    
    #replace -- with a space
    doc = doc.replace('.',' ')
    
    #split into tokens by white space
    tokens = doc.split()
    
    # prepare for char filtering
    re_punc = re.compile('[%s]'%re.escape(string.punctuation))
    
    #remove punctuation from words
    tokens = [re_punc.sub('',w) for w in tokens]
    
    #remove remaining tokens that are not alphanumerics
    tokens = [word for word in tokens if word.isalpha()]
    
    #make lower case
    tokens = [word.lower() for word in tokens]
    
    #remove stop words & words which appear less than 1 times
    stops = set(stopwords.words("english"))
    tokens = [word for word in tokens if not word in stops and len(word)>=3]

    return tokens

tokens = clean_doc(data)
    
print (tokens[0])
print ('Number of tokens are ', len(tokens))
print ('Number of unique tokens is ', len(set(tokens)))

[nltk_data] Downloading package stopwords to C:\Users\Aditya
[nltk_data]     Jain\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


term
Number of tokens are  70548
Number of unique tokens is  6852


In [85]:
MAX_WORDS=80000
BATCH_SIZE=16
EPOCHS=2
OOV_TOKEN=0
EMBEDDING_DIM = 100
MAX_SEQ_LENGTH=20
VALIDATION_SPLIT_RATIO= 0.3
LSTM_UNITS=64

# Generate Sequences

    - This method takes in bunch of tokens and then converts those tokens into sequences. The idea here is that we create
    sequences of 21 tokens (words) each and then feed each sequence one at a time to the network. each word in the sequence represents a value at a given timesteps. All words in effect become the features

In [86]:
def generate_sequences(size, tokens):
    sequences = list()
    size_seq = size + 1
    for i in range (0, len(tokens)-size):
        seq = tokens[i:size_seq+i]
        line = ' '.join(seq)
        sequences.append(line)
        
    print ('Total number of sequenes is ', len(sequences))
    return sequences

sequences = generate_sequences(MAX_SEQ_LENGTH, tokens)

Total number of sequenes is  70528


# Save generated sequences in new file

In [87]:
def save_sequences(filename, sequences):
    
    file = open(filename,'w',encoding='utf-8')
    lines = '\n'.join(sequences)
    file.write(lines)
    file.close()
    
save_sequences('merged-content-cleaned.txt', sequences)

# Read cleaned sequences from file

In [88]:
def read_sequences(file):
    f = open(file, 'r', encoding='utf-8')
    lines = f.read()
    return lines

lines = read_sequences('merged-content-cleaned.txt')
lines = lines.split('\n')

### Tokenize comments

In [89]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_WORDS, oov_token=OOV_TOKEN, filters='')
tokenizer.fit_on_texts(lines)
word2Idx = tokenizer.word_index
VOCAB_LEN = len (word2Idx)+1

print ('Found %d unique words'%(len(word2Idx)))
print (word2Idx['isa'])

sequences = tokenizer.texts_to_sequences(lines)
padded_input_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post', maxlen=max_seq_len_from_data)

print ('Created %d padded input sequences'%(len(padded_input_sequences)))

Found 6853 unique words
59
Created 70528 padded input sequences


In [90]:
sequences = np.array(sequences)
X = padded_input_sequences[:,:MAX_SEQ_LENGTH]
y = padded_input_sequences[:,-1]
y = tf.keras.utils.to_categorical(y, num_classes=VOCAB_LEN)

#### Load GloVe embeddings

In [None]:
embeddings_index = {}

f = open ('glove.6B.100d.txt','r', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word]=coefs

f.close()
print ('Found %s word embeddings'%(len(embeddings_index)))

In [46]:
num_words = min (MAX_WORDS, VOCAB_LEN)
print ('Min words to be considered are %d'%(num_words))

loaded_embeddings_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word2Idx.items():
    if (i<num_words):
        if word in embeddings_index.keys():
            embedding_vector = embeddings_index[word]
            loaded_embeddings_matrix[i] = embedding_vector

print (loaded_embeddings_matrix.shape)

Min words to be considered are 6854
(6854, 100)


# Configure Callbacks which would be executed during the training phase

### Build Model

In [55]:
embed_layer = tf.keras.layers.Embedding(input_dim = num_words,
                               output_dim = EMBEDDING_DIM,
                               input_length=max_seq_len_from_data, 
                               embeddings_initializer=tf.keras.initializers.Constant(loaded_embeddings_matrix),
                               trainable=True)

In [91]:
input_ = tf.keras.layers.Input(shape=(max_seq_len_from_data))

x = embed_layer(input_)

lstm_layer_0 = tf.keras.layers.LSTM(units=LSTM_UNITS, return_sequences=True)
x = lstm_layer_0(x)

#lstm_layer_1 = tf.keras.layers.LSTM(units=LSTM_UNITS, return_sequences=True, return_state=True)
#x,h,c = lstm_layer_1(x)
dense_layer1 = tf.keras.layers.Dense(num_words, activation='relu')
x = dense_layer1(x)

dense_layer2 = tf.keras.layers.Dense(num_words, activation='softmax')
output = dense_layer2(x)

model = tf.keras.models.Model(input_, output)

model.compile (optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "model_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        [(None, 20)]              0         
_________________________________________________________________
embedding_1 (Embedding)      multiple                  685400    
_________________________________________________________________
lstm_22 (LSTM)               (None, 20, 64)            42240     
_________________________________________________________________
dense_14 (Dense)             (None, 20, 6854)          445510    
_________________________________________________________________
dense_15 (Dense)             (None, 20, 6854)          46984170  
Total params: 48,157,320
Trainable params: 48,157,320
Non-trainable params: 0
_________________________________________________________________


In [60]:
from sklearn.model_selection import train_test_split

train_sentences, test_sentences, train_targets, test_targets = train_test_split (X, y)

In [92]:
print (train_sentences.shape)
print (test_sentences.shape)
print (train_targets.shape)
print (test_targets.shape)
print (train_sentences[0])
print (train_targets[0])

(52896, 20)
(17632, 20)
(52896, 6854)
(17632, 6854)
[460 461 680 679  11 198 151 171  34  13 181  12  25 423  11 681 372  21
  72 289]
[0. 0. 0. ... 0. 0. 0.]


In [93]:
model.fit (train_sentences, train_targets, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_split=VALIDATION_SPLIT_RATIO)

ValueError: A target array with shape (52896, 6854) was passed for an output of shape (None, 20, 6854) while using as loss `categorical_crossentropy`. This loss expects targets to have the same shape as the output.