In [0]:
#if you are not using google collab please ignore this
#to mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#loading essential libraries
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import io
import pandas as pd
import tensorflow as tf
import numpy as np
import os
import time

Using TensorFlow backend.


In [0]:
#splitting the dataset into positive and negative and storing it in local file system
#if you are using google colab, then just upload the imdb dataset file to "google collab folder" in your drive
#else change the path to the location where file is stored (Sorry for the trouble of chnaging path)
dataset = pd.read_csv("/content/drive/My Drive/Colab Notebooks/IMDB Dataset.csv")
positive_review = dataset[dataset['sentiment'] == "positive"]
negative_review = dataset[dataset['sentiment'] == "negative"]
positive_review.to_csv('./positive_review.csv')
negative_review.to_csv('./negative_review.csv')

# Text Generation for Positive reviews

In [0]:
text = open("./positive_review.csv", 'rb').read().decode(encoding='utf-8')
extract = 0.1
text = text[:int(extract*len(text))]

#Mapping chars to integers
chars = sorted(list(set(text)))
# creating 2 dictionaries with character to integer and integer to character
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

#splitting sentences and creating an array with last character
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])

#reshaping the sentences into boolean so that it be passed into model
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

#creating a model
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars)),return_sequences = True ))
model.add(LSTM(128, return_sequences = False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.summary()
model.compile(optimizer='RMSprop', loss='categorical_crossentropy')

# helper function to sample an index from a probability array
#I got this helper function from the lstm_text_generation example from
#keras. https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
  # using a categorical distribution to predict the character returned by the model
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Callback function to print predicted text generated by our LSTM. 
#It prints generated text with 5 different temperatures [0.2, 0.5, 1.0, 1.2]. 
#0.2 will generate text with more ordinary word. 1.2 will generate wilder guesses.

def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
#for predicting next character
        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

#comparing loss after each epoch and saving weights with least loss
from keras.callbacks import ModelCheckpoint
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')
from keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                              patience=1, min_lr=0.001)

callbacks = [print_callback, checkpoint, reduce_lr]
model.fit(x, y, batch_size=2048, epochs=10, callbacks=callbacks)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 40, 128)           136704    
_________________________________________________________________
lstm_10 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 138)               17802     
_________________________________________________________________
activation_5 (Activation)    (None, 138)               0         
Total params: 286,090
Trainable params: 286,090
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "tful. Christy w

<keras.callbacks.callbacks.History at 0x7fccd3f04b70>

In [0]:
def generate_text(length, diversity):
    # Get random starting text
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated = ''
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    for i in range(length):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.
# predicting next character in the model
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char
    return generated

In [0]:
model1 = model
generated_text = generate_text(100, 1)
print(generate_text(100, 1))

el it greatly detracts for one's ability refaction, but new the exciting arridg bothing to threams of 8ghtrescan are I dog to becompos a dia


In [0]:
print(generate_text(100, 0.8))

e) is wearing a simply bucket with eye here and the empared, ""Amery"" and really too and about the stull complete and have more one of the 


In [0]:
print(generate_text(100, 0.6))

en criticized to be over-the-top and lough to the movie are not late of the seep and film and convertable thing that uncention from the comm


In [0]:
print(generate_text(100, 0.4))

ves out of stock farcical situations, rather started to the movie that it something you that the sexuel the plot of the complete of the terr


In [0]:
print(generate_text(100, 0.2))

eo realism mixed whit the best Ken Loach and the performance of the starts of the sent and the movie is a simple of the movie that the best 


#Calculating Perplexity for positive reviews


In [0]:
import nltk
nltk.download('punkt')

test_set = len(generated_text.split())

import random
r = random.sample(range(test_set),15)
test_set_sample = [generated_text.split()[i] for i in r]

import collections, nltk
# we first tokenize the text corpus
tokens = nltk.word_tokenize(text)

#here you construct the language model for creating a dictionary
def unigram(tokens): 
  # creating a dictionary   
    model1 = collections.defaultdict(lambda: 0.01) # For words outside the scope of its knowledge, it assigns a low probability of 0.01
    for f in tokens:
        try:
            model1[f] += 1
        except KeyError:
            model1 [f] = 1
            continue
    N = float(sum(model1.values()))
    for word in model1:
        model1[word] = model1[word]/N
    return model1

#computes perplexity of the bigram model on a testset  
def perplexity(testset, model1):
    testset= nltk.word_tokenize(testset)
    perplexity = 1
    N = 2 #change values of N for calculating perplexity of tri - gram or other models
    for word in testset:
        N += 1
        #calculating inverse probability of occurence of words
        perplexity = perplexity * (1/model1[word])
    perplexity = pow(perplexity, 1/float(N)) 
    return perplexity

model1 = unigram(tokens)

perplexities = []

for i in range(len(test_set_sample)):
    pp= perplexity(test_set_sample[i], model1)
    perplexities.append(pp)
print("perplexity is" , np.mean(pp))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


perplexity is 17.752356585903406


# Text generation for positive reviews using statistical modelliing(n-gram)

In [0]:
pip install --upgrade nltk

Collecting nltk
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 3.4MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.5-cp36-none-any.whl size=1434676 sha256=d668bbee1b0206a25f6b83166a3edfa9b99719671650c7fe81cda6c94d2ecf55
  Stored in directory: /root/.cache/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.5


In [0]:
import nltk
from nltk.util import ngrams
from nltk.lm import MLE
from nltk import word_tokenize

nltk.download('punkt')

docs = pd.read_csv("./positive_review.csv")
extract = 0.2
docs = docs[:int(extract*len(docs))]
del docs['sentiment']

texts = []
for s in docs['review']:
    texts.append(word_tokenize(s))
#using nltk lm model for padding
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(3, texts)
#creating MLE model
model = MLE(3) 
model.fit(train, vocab)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
# processing formed sentences to remove unwanted characters
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    content = []
    for token in model.generate(num_words, random_seed=random_seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [0]:
#sentence before pre - processing
model1 = model
word_list = model.generate(200, random_seed = 12)
generated_text = (' '.join(word for word in word_list))
print(generated_text)

effort put into a FPS rut . Enter Julian 's devoted to Wells 's lifetime scientists certainly fled the rightwing fascistic governments of Hitler 's suicide and Nelson faces a double doing most of the animals killed by William Welch and L.Q . Jones and Alvy Moore. < br / > < br / > < br / > < br / > < br / > On a purely fun level . Hard to imagine Rochester to be athletes . Apparently Silver never wrote another screenplay after this was . But the great actor ! ! </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


In [0]:
#sentences after pre - processing
model1 = model
generated_text = generate_sent(model, 200, random_seed=12)
print(generated_text)

effort put into a FPS rut . Enter Julian's devoted to Wells's lifetime scientists certainly fled the rightwing fascistic governments of Hitler's suicide and Nelson faces a double doing most of the animals killed by William Welch and L.Q . Jones and Alvy Moore. <br /> <br /> <br /> <br /> <br /> On a purely fun level . Hard to imagine Rochester to be athletes . Apparently Silver never wrote another screenplay after this was . But the great actor!!


In [0]:
#sentences after pre - processing
generate_sent(model, 200, random_seed=250)

'because this is a``Valentine Day"event . The movie follows two young boys deemed kissing the girl, from the field and running for president . I will hunt them up at the clinic and then the first time up``a living from the boys find out who is fourteen and a scarf . A cool video to this complex story that begins the film\'s wonderfully restrained here . <br /> This is not surprising . It\'s hilarious death scene (if you try to be . But hey, it was funny as hell. <br /> Writer Armistead Maupin, and we see how Smith had an abusive boyfriend . Her relationship with Iris, clearly a serious drama into a semi-cheesy production, but they\'re hot for each other on a literally filthy rich doctor (I was captivated by this trio of criminals . <br /> Fortunately, this is ONE of the cabins where pornographic material is funny as it always increases my appreciation for Rob Roy", which take up arms for their lack of ability to make the Sea Monsters'

In [0]:
#sentences after pre - processing
generate_sent(model, 200, random_seed=-10)

'idea of re-uniting the members to justice, hope, but when you\'re studying physics, and some of the movie, the woman whose parents were English . They colonized it, before revisions, didn\'t speak English and Spanish on the silver screen . This film is enjoyable enough to realise just how real they kept me in for good . After knocking her up to the island commonwealth, and that detail are good as the first time in a movie well deserves the appelation . The first movie as it is a lesbian woman could fall apart due to a budget movie with his life and my friends on the rise of the things I\'ve seen one of the sets are still widely available . Still these minor problems aside it was boring and didn\'t think I\'ve been dead!"~One Of Andrews Most Memorable Lines In This Film is interesting even though ghosts are not that kind of laughter, adventure, emotion and sadness . When his fixation rewards him, she is hurt and kill people to tell him that we won\'t be surprised that George'

In [0]:
#sentences after pre - processing
generate_sent(model, 200, random_seed=33)

"hypnotic appeal . For some reason the United States got involved with the greatest trash ever made. <br /> Lastly, there were no problem; however the feel in the plot, but copied the format and incorporates it so easy (which proves to be a bit hit and bruised my leg badly just before moving her models into place; makes complete sense in a movie from an underlying sentiment of the most accurately written and profound . He is an official selection for the first time I cut his throat from being just another thickly formulated love story will play the lead.He was always one of my all-time favorite movie in digital format then I would recommend this wonderful movie, which is well worth it!! Great stuff! <br /> The use of the 60s (taking rare food neatly summarises their society: They're born innocent but is also the oldest ever British (I'm going to be both virile and gentle and kind of exorcism. <br /> <br /> <br /> Rating : 8/10 <br /> <"

In [0]:
#sentences after pre - processing
generate_sent(model, 200, random_seed=69)

"near Lake Tahoe, just to see this film so loaded with human-looking Cylons, wound up in a cartoon . Most of the other side, especially remind us why this has aged remarkably well . Early appearances from Cherie Chung and Chow Yun Fat are also in terms of capturing the mood for a crazy-quilt hour of lame comedies that misfire, it cannot help but smile and a prostitute . Chances are that with his violent ways . A classic late 50's horror TV shows and movies, while together Keaton and Gedde Watanabe shine in their role well . My mom was complaining about CG animators). The acting is solid and suitable . This summarizes the other major and minor roles . O'Toole was long-established as a law reporter and then again what just happened. <br /> <br /> Miss Fritton adding to the early 1970's . This movie was the first season of Cosby is released . As a teenager, embarrassed by his boyfriend who brings the magic and mystery, something with great cinematography . The scenery (meaning it's a must

# Calculating perplexity for statistical modelling positive reviews


In [0]:
import nltk
nltk.download('punkt')

test_set = len(generated_text.split())

import random
r = random.sample(range(test_set),15)
test_set_sample = [generated_text.split()[i] for i in r]

import collections, nltk
# we first tokenize the text corpus
tokens = nltk.word_tokenize(text)

#here you construct the language model for creating a dictionary
def unigram(tokens): 
  # creating a dictionary   
    model1 = collections.defaultdict(lambda: 0.01) #For words outside the scope of its knowledge, it assigns a low probability of 0.01
    for f in tokens:
        try:
            model1[f] += 1
        except KeyError:
            model1 [f] = 1
            continue
    N = float(sum(model1.values()))
    for word in model1:
        model1[word] = model1[word]/N
    return model1

#computes perplexity of the trigram model on a testset  
def perplexity(testset, model1):
    testset= nltk.word_tokenize(testset)
    #testset = testset.split()
    perplexity = 1
    N = 3 #change values of N for calculating perplexity of bi - gram or other models
  #calculating inverse probability of occurence of words
    for word in testset:
        N += 1
        perplexity = perplexity * (1/model1[word])
    perplexity = pow(perplexity, 1/float(N)) 
    return perplexity

model1 = unigram(tokens)

perplexities = []

for i in range(len(test_set_sample)):
    pp= perplexity(test_set_sample[i], model1)
    perplexities.append(pp)
print("perplexity is", np.mean(pp))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


perplexity is 2.9172088591914003


# Text generation for negative _reviews

In [0]:
text = open("./negative_review.csv", 'rb').read().decode(encoding='utf-8')
extract = 0.1
text = text[:int(extract*len(text))]

#Mapping chars to integers
chars = sorted(list(set(text)))
# creating 2 dictionaries with character to integer and integer to character
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

#splitting sentences and creating an array with last character
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])

#reshaping the sentences into boolean so that it be passed into model
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

#creating a model
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars)),return_sequences = True ))
model.add(LSTM(128, return_sequences = False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.summary()
model.compile(optimizer='RMSprop', loss='categorical_crossentropy')

# helper function to sample an index from a probability array
#I got this helper function from the lstm_text_generation example from
#keras. https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
  # using a categorical distribution to predict the character returned by the model
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Callback function to print predicted text generated by our LSTM. 
#It prints generated text with 5 different temperatures [0.2, 0.5, 1.0, 1.2]. 
#0.2 will generate text with more ordinary word. 1.2 will generate wilder guesses.

def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
#for predicting next character
        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

#comparing loss after each epoch and saving weights with least loss
from keras.callbacks import ModelCheckpoint
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')
from keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                              patience=1, min_lr=0.001)

callbacks = [print_callback, checkpoint, reduce_lr]
model.fit(x, y, batch_size=2048, epochs=10, callbacks=callbacks)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 40, 128)           138240    
_________________________________________________________________
lstm_12 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 141)               18189     
_________________________________________________________________
activation_6 (Activation)    (None, 141)               0         
Total params: 288,013
Trainable params: 288,013
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "e A Nightmare O

<keras.callbacks.callbacks.History at 0x7fccd4be8eb8>

In [0]:
def generate_text(length, diversity):
    # Get random starting text
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated = ''
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    for i in range(length):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.
# predicting next character in the model
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char
    return generated

In [0]:
model1 = model
generated_text = generate_text(100, 1)
print(generate_text(100, 1))

crown things all up, the ghost who was offessed Plyse at the makes that they probably money on they can be an penfiling incenteor and she's 


In [0]:
print(generate_text(100, 0.8))

ughter also adding in rude, disrespectfully have been engable real to be sumper aproming in the film. I can came to be surplite for a dree a


In [0]:
print(generate_text(100, 0.6))

s laid back ""cool"" style has ruined the bit that something and look of halls can madnical in a girl a deal real of the sige story in the c


In [0]:
print(generate_text(100, 0.4))

hirt off 3-4 times. Yes we all get it that the story waste the film in a movie of the book of a lot of the comerity of the story in the poss


In [0]:
print(generate_text(100, 0.2))

 it's good"" deal. <br /><br />I wish I was the story of the movie is a film of a story of the story and the movie is a serious of the time 


# Calculating perplexity for negative review

In [0]:
import nltk
nltk.download('punkt')

test_set = len(generated_text.split())

import random
r = random.sample(range(test_set),15)
test_set_sample = [generated_text.split()[i] for i in r]

import collections, nltk
# we first tokenize the text corpus
tokens = nltk.word_tokenize(text)

#here you construct the language model for creating a dictionary
def unigram(tokens): 
  # creating a dictionary   
    model1 = collections.defaultdict(lambda: 0.01) # For words outside the scope of its knowledge, it assigns a low probability of 0.01
    for f in tokens:
        try:
            model1[f] += 1
        except KeyError:
            model1 [f] = 1
            continue
    N = float(sum(model1.values()))
    for word in model1:
        model1[word] = model1[word]/N
    return model1

#computes perplexity of the bigram model on a testset  
def perplexity(testset, model1):
    testset= nltk.word_tokenize(testset)
    perplexity = 1
    N = 2 #change values of N for calculating perplexity of tri - gram or other models
    for word in testset:
        N += 1
        #calculating inverse probability of occurence of words
        perplexity = perplexity * (1/model1[word])
    perplexity = pow(perplexity, 1/float(N)) 
    return perplexity

model1 = unigram(tokens)

perplexities = []

for i in range(len(test_set_sample)):
    pp= perplexity(test_set_sample[i], model1)
    perplexities.append(pp)
print("perplexity is" , np.mean(pp))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


perplexity is 12.011017513076032


# Text generation for Negative reviews using statistical modelliing(n-gram)


In [0]:
import nltk
from nltk.util import ngrams
from nltk.lm import MLE
from nltk import word_tokenize

nltk.download('punkt')

docs = pd.read_csv("./negative_review.csv")
extract = 0.2
docs = docs[:int(extract*len(docs))]
del docs['sentiment']

texts = []
for s in docs['review']:
    texts.append(word_tokenize(s))
#using nltk lm model for padding
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(3, texts)
#creating MLE model
model = MLE(3) 
model.fit(train, vocab)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# processing formed sentences to remove unwanted characters
from nltk.tokenize.treebank import TreebankWordDetokenizer

detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed=42):
    content = []
    for token in model.generate(num_words, random_seed=random_seed):
        if token == '<s>':
            continue
        if token == '</s>':
            break
        content.append(token)
    return detokenize(content)

In [0]:
#sentence before pre - processing
model1 = model
word_list = model.generate(200, random_seed = 12)
generated_text = (' '.join(word for word in word_list))
print(generated_text)

episode of `` Blind Date '' ) is pure prime time television . If he really achieved after this site : it tries , and unfortuneatly , nothing worth talking about and does n't manage to realize early on that island in the genre . ( The television ratings system and my love of a crowd suffocating each other and I use the phrases correctly. < br / > Actually , this baby up at the Toronto Film Festival , I would love to both watch and arse-clenchingly boring . ( Even if there even * was a wonderful tour-de-force for Peter Sellers was one of the killer in the dig. < br / > The drugs part is the reduction of human co-existence . Alan , a great dancer and she kills the wan performance of Lucy Russell in the middle of nowhere and helps organize the first six minutes of the running scenes . Indeed , we kinda noticed she 's just say it again : `` This IS NOT ONE decent scare . The film 's inherent and unintentional humour . This is too `` complicated '' to get better or become unintentionally fun

In [0]:
#sentences after pre - processing
model1 = model
generated_text = generate_sent(model, 200, random_seed=12)
print(generated_text)

episode of``Blind Date") is pure prime time television . If he really achieved after this site: it tries, and unfortuneatly, nothing worth talking about and doesn't manage to realize early on that island in the genre . (The television ratings system and my love of a crowd suffocating each other and I use the phrases correctly. <br /> Actually, this baby up at the Toronto Film Festival, I would love to both watch and arse-clenchingly boring . (Even if there even * was a wonderful tour-de-force for Peter Sellers was one of the killer in the dig. <br /> The drugs part is the reduction of human co-existence . Alan, a great dancer and she kills the wan performance of Lucy Russell in the middle of nowhere and helps organize the first six minutes of the running scenes . Indeed, we kinda noticed she's just say it again:``This IS NOT ONE decent scare . The film's inherent and unintentional humour . This is too``complicated"to get better or become unintentionally funny . Not that


In [0]:
#sentences after pre - processing
generate_sent(model, 200, random_seed=250)


"bigger then the anchorwoman changed into a giant crash is heard in movie theaters . Children will want to look at necklaces from some of the film in mind, thoughts are churning:``What? The story makes it thrilling . Very funny: reminiscent of Disney . Only to have bottomed out here <br /> The premise was good! The book was the Holocaust. <br /> <br /> This is one of a film, it is at a friend who wants to be American and not go for these important issues, especially local ones . What was even better. <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> It's a total douche . He was only average (Van Bebble immediately stated that it would have appeared genuinely happy living in the film together, the hokey flesh-pigtailed flunky, that is?"

In [0]:
#sentences after pre - processing
generate_sent(model, 200, random_seed=-10)


'in kryptonite pajamas . That spoof worked very well . From there, I was one of the music for this on HBO2 . I wouldn\'t allow dogs, which have no souls why not kill drug-sniffing dogs? Makes no sense (`` absent") of Firecombe to hold out . The acting does not walk away and everyone else is just a mercenary? Hutton and Schlesinger don\'t know (or can I picture hearing``motherf---er!"on cable.'

In [0]:
#sentences after pre - processing
generate_sent(model, 200, random_seed=33)


'in the same dumb mistakes literally hundreds of theaters across the stage in one word: VILE! I\'m not sure that you will love this``Bangkok haunted"are the real Jim Ellis, who\'s lives . I an not adverse to a movie . About 30 minutes . If not, personally it didn\'t know what happened, but instead presents himself at board meetings as a traffic jam for 20 seconds.'

In [0]:
#sentences after pre - processing
generate_sent(model, 200, random_seed=69)


"not a big joke! What the hell is there anything that everyone else being killed around him, all is said in the opening scenes, but sadly becomes frustrating exactly because it's a few funny jokes, roll their eyes is one at a Southern Sheriff). <br /> <br /> <br /> I'm just saying that I remember them - it feels as though the town . The only real Lone Ranger, Charlie Chan, Fu Manchu offers a dose of wiz bang action and songs were created by the wisdom of its parts. <br /> Oh, the scene . Fourthly, the patient de resistance, little violence, dark humour plays a laughable Mexican bandit who can't really have to be really embarrassed that you have to have captured some aspects of this could ever synthesize with invented characters . Important dialogue was so stupid I found the dilettante Man Ray and Chicago were wonderful movies in and out cop who is wasted as Jane's Aunt no no no . America's greatest actresses of our Fathers and Bridge of"

# Calculating perplexity for negative review statistical modelling

In [0]:
import nltk
nltk.download('punkt')

test_set = len(generated_text.split())

import random
r = random.sample(range(test_set),15)
test_set_sample = [generated_text.split()[i] for i in r]

import collections, nltk
# we first tokenize the text corpus
tokens = nltk.word_tokenize(text)

#here you construct the language model for creating a dictionary
def unigram(tokens): 
  # creating a dictionary   
    model1 = collections.defaultdict(lambda: 0.01) #For words outside the scope of its knowledge, it assigns a low probability of 0.01
    for f in tokens:
        try:
            model1[f] += 1
        except KeyError:
            model1 [f] = 1
            continue
    N = float(sum(model1.values()))
    for word in model1:
        model1[word] = model1[word]/N
    return model1

#computes perplexity of the trigram model on a testset  
def perplexity(testset, model1):
    testset= nltk.word_tokenize(testset)
    #testset = testset.split()
    perplexity = 1
    N = 3 #change values of N for calculating perplexity of bi - gram or other models
  #calculating inverse probability of occurence of words
    for word in testset:
        N += 1
        perplexity = perplexity * (1/model1[word])
    perplexity = pow(perplexity, 1/float(N)) 
    return perplexity

model1 = unigram(tokens)

perplexities = []

for i in range(len(test_set_sample)):
    pp= perplexity(test_set_sample[i], model1)
    perplexities.append(pp)
print("perplexity is", np.mean(pp))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


perplexity is 2.1926037074372404


# Text generation for all reviews

In [0]:
text = open("/content/drive/My Drive/Colab Notebooks/IMDB Dataset.csv", 'rb').read().decode(encoding='utf-8')
extract = 0.05
text = text[:int(extract*len(text))]

#Mapping chars to integers
chars = sorted(list(set(text)))
# creating 2 dictionaries with character to integer and integer to character
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

#splitting sentences and creating an array with last character
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])

#reshaping the sentences into boolean so that it be passed into model
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

#creating a model
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars)),return_sequences = True ))
model.add(LSTM(128, return_sequences = False))
model.add(Dropout(0.2))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
model.summary()
model.compile(optimizer='RMSprop', loss='categorical_crossentropy')

# helper function to sample an index from a probability array
#I got this helper function from the lstm_text_generation example from
#keras. https://github.com/keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
  # using a categorical distribution to predict the character returned by the model
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Callback function to print predicted text generated by our LSTM. 
#It prints generated text with 5 different temperatures [0.2, 0.5, 1.0, 1.2]. 
#0.2 will generate text with more ordinary word. 1.2 will generate wilder guesses.

def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
#for predicting next character
        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

#comparing loss after each epoch and saving weights with least loss
from keras.callbacks import ModelCheckpoint
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss',
                             verbose=1, save_best_only=True,
                             mode='min')
from keras.callbacks import ReduceLROnPlateau
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.2,
                              patience=1, min_lr=0.001)

callbacks = [print_callback, checkpoint, reduce_lr]
model.fit(x, y, batch_size=2048, epochs=10, callbacks=callbacks)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_13 (LSTM)               (None, 40, 128)           137728    
_________________________________________________________________
lstm_14 (LSTM)               (None, 128)               131584    
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 140)               18060     
_________________________________________________________________
activation_7 (Activation)    (None, 140)               0         
Total params: 287,372
Trainable params: 287,372
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: " I have liked h

<keras.callbacks.callbacks.History at 0x7fccd5367fd0>

In [0]:
def generate_text(length, diversity):
    # Get random starting text
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated = ''
    sentence = text[start_index: start_index + maxlen]
    generated += sentence
    for i in range(length):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.
# predicting next character in the model
            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char
    return generated

In [0]:
model1 = model
generated_text = generate_text(100, 1)
print(generate_text(100, 1))

funky way. Apparently people either like An Krant Rademon due making scienita is simple. Some all them actures deeriot and while mess stup o


In [0]:
model.save('/content/drive/My Drive/Deep_Learning_models/NLP_all.h5')

In [0]:
print(generate_text(100, 0.8))

only one. Taylor reportedly hated it and we looks Abirot' But work suppen tear or the camority and a martion movie though really nidely. The


In [0]:
print(generate_text(100, 0.6))

ut of Evil Dead II). There are, however, was a director movie show directing the show in the master in this better way the screen and someth


In [0]:
print(generate_text(100, 0.4))

s, ever. Just beautiful, full of human ender the fact the are about the film and it than the movie is a sure of the have the main and the mo


In [0]:
print(generate_text(100, 0.2))

o be good because I'm interested in the screen and the movie is a movie and the the the the the the the the the the story of the movie and t


# Calculating perplexity for all reviews

In [0]:
import nltk
nltk.download('punkt')

test_set = len(generated_text.split())

import random
r = random.sample(range(test_set),15)
test_set_sample = [generated_text.split()[i] for i in r]

import collections, nltk
# we first tokenize the text corpus
tokens = nltk.word_tokenize(text)

#here you construct the language model for creating a dictionary
def unigram(tokens): 
  # creating a dictionary   
    model1 = collections.defaultdict(lambda: 0.01) #For words outside the scope of its knowledge, it assigns a low probability of 0.01
    for f in tokens:
        try:
            model1[f] += 1
        except KeyError:
            model1 [f] = 1
            continue
    N = float(sum(model1.values()))
    for word in model1:
        model1[word] = model1[word]/N
    return model1

#computes perplexity of the bigram model on a testset  
def perplexity(testset, model1):
    testset= nltk.word_tokenize(testset)
    perplexity = 1
    N = 2 #change values of N for calculating perplexity of tri - gram or other models
    for word in testset:
        N += 1
        #calculating inverse probability of occurence of words
        perplexity = perplexity * (1/model1[word])
    perplexity = pow(perplexity, 1/float(N)) 
    return perplexity

model1 = unigram(tokens)

perplexities = []

for i in range(len(test_set_sample)):
    pp= perplexity(test_set_sample[i], model1)
    perplexities.append(pp)
print("perplexity is" , np.mean(pp))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


perplexity is 6.986476760727675


# Text generation for ALL reviews using statistical modelliing(n-gram)

In [0]:

import nltk
from nltk.util import ngrams
from nltk.lm import MLE
from nltk import word_tokenize
# we need to download a special component that is used by the tokenizer below -- don't worry about it. 
nltk.download('punkt')

docs = pd.read_csv("/content/drive/My Drive/Colab Notebooks/IMDB Dataset.csv")
extract = 0.05
docs = docs[:int(extract*len(docs))]
del docs['sentiment']

texts = []
for s in docs['review']:
    texts.append(word_tokenize(s))
#Using nltk lm model for padding
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(3, texts)
#Creating model
model = MLE(3) 
model.fit(train, vocab)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
#pre - processing is not done for this model in order to show difference between outputs
model1 = model
word_list = model.generate(200, random_seed = 12)
generated_text = (' '.join(word for word in word_list))
print(generated_text)

encouraging his pack 's hands ; however , the music is just a little black girl shouting crude insults to him . This Lifetime-like movie was made , well worth it . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> <

In [0]:
word_list = model.generate(200, random_seed = 250)
print(' '.join(word for word in word_list))

before watching this . < br / > < br / > < br / > So Watch It Only If You Knew Susie ( Like Mimoso and Susana ) sounds like the ravings of a dance sequence where mannequins are used to better movies and compare it to just before tryouts and wore tights to cover his embezzlement . He played that role , making a supernatural psychological thriller worthy of some great actors were age-appropriate to this film is smart , moderately obnoxious skilled writer . But i would recommend this film , as in `` Fantastic Easter Special '' Sigmund & the lack of understanding to his bumbling , unsure-of-himself , low-key star , Janet Heffernan , Spence and Doug Pruzan ( Carrie 's boss , Dr. Janos Rukh ( Karloff ) demonstrates to colleagues Dr. Felix Benet ( Bela , Boris , a scientist . And while Olivier and Oberon are not familiar with one dull spot in this movie without having a torrid affair during the shootout ) , but there is a hardcore rock band from Des Moines , Iowa . Nine band members decide to

In [0]:
word_list = model.generate(200, random_seed = -10)
print(' '.join(word for word in word_list))

in it . Although the society in Elizabethan England . Although this was MUCH better ! < br / > I must admit that « Kitchen » a « Tatiesque » movie ? Why come hero always have this re-make which , given Ferrari 's pieces are and what are told first from Sue 's Asian boyfriend but even i can identify with one big ball of cheese that surrounds you enabling you to < br / > < br / > The original animated Dark Knight returns in his life as portrayed in the kitchen scene , even with it '' . It 's a clever one . If you do n't know what watching this in his travel. < br / > God answers the citizens sing to welcome their new princess '' keep bumping into this film is a neo-Hollywood faux-liberal , so I ca n't be worse than even your average cinema . From the cutting dialogue to the brilliant character acting on the Lifetime network . An amateur film made in Afghanistan but i just had someone new , indefinable world beyond man 's name in his role really projecting Wilde himself , really ,


In [0]:
word_list = model.generate(200, random_seed = 33)
print(' '.join(word for word in word_list))

important role of Lane.They have changed the course of the guys work there , waiting for a fool for buying it ? Have they changed the outfit and is very difficult for young talented girls . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> 

In [0]:
word_list = model.generate(200, random_seed = 69)
print(' '.join(word for word in word_list))

no attempt to portray an interpretive expression of a soldier trying to do some housework for her father 's accountant ( King ) from the set 's cocoon of cheese that surrounds it . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>

# Carculating perplexity for all reviews statistical modelling

In [0]:
import nltk
nltk.download('punkt')

test_set = len(generated_text.split())

import random
r = random.sample(range(test_set),15)
test_set_sample = [generated_text.split()[i] for i in r]

import collections, nltk
# we first tokenize the text corpus
tokens = nltk.word_tokenize(text)

#here you construct the language model for creating a dictionary
def unigram(tokens): 
  # creating a dictionary   
    model1 = collections.defaultdict(lambda: 0.01) #For words outside the scope of its knowledge, it assigns a low probability of 0.01
    for f in tokens:
        try:
            model1[f] += 1
        except KeyError:
            model1 [f] = 1
            continue
    N = float(sum(model1.values()))
    for word in model1:
        model1[word] = model1[word]/N
    return model1

#computes perplexity of the trigram model on a testset  
def perplexity(testset, model1):
    testset= nltk.word_tokenize(testset)
    #testset = testset.split()
    perplexity = 1
    N = 3 #change values of N for calculating perplexity of bi - gram or other models
  #calculating inverse probability of occurence of words
    for word in testset:
        N += 1
        perplexity = perplexity * (1/model1[word])
    perplexity = pow(perplexity, 1/float(N)) 
    return perplexity

model1 = unigram(tokens)

perplexities = []

for i in range(len(test_set_sample)):
    pp= perplexity(test_set_sample[i], model1)
    perplexities.append(pp)
print("perplexity is", np.mean(pp))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


perplexity is 2.606965673054827
