In [11]:
import re
import  string
import nltk
import numpy as np

from os import listdir
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/dai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# Load doc into memory
def load_doc(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return text

# Testing the function
text = load_doc('./data/review_polarity-20231215T050834Z-001/review_polarity/review_polarity/txt_sentoken/neg/cv000_29416.txt')
text

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience membe

In [14]:
# turn a doc into clean tokens
def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens


clean = clean_doc(text)
clean

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'whats',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mindfuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'didnt',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'whats',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',

In [18]:
# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
    # Load the doc
    doc = load_doc(filename)
    # Clean the doc
    tokens = clean_doc(doc)
    # filter by vocab
    tokens = [w for w in tokens if w in vocab]
    return " ".join(tokens)


with open('./data/vocab.txt') as file:
    vocab = file.read().split()

print(doc_to_line('./data/review_polarity-20231215T050834Z-001/review_polarity/review_polarity/txt_sentoken/neg/cv000_29416.txt',  vocab=vocab))

plot two teen couples go church party drink drive get accident one guys dies girlfriend continues see life nightmares whats deal watch movie sorta find critique mindfuck movie teen generation touches cool idea presents bad package makes review even harder one write since generally applaud films attempt break mold mess head lost highway memento good bad ways making types films folks didnt snag one correctly seem taken pretty neat concept executed terribly problems movie well main problem simply jumbled starts normal downshifts fantasy world audience member idea whats going dreams characters coming back dead others look like dead strange apparitions disappearances chase scenes tons weird things happen simply explained personally dont mind trying unravel film every give clue get kind fed films biggest problem obviously got big secret hide seems want hide completely final five minutes make things entertaining thrilling even engaging meantime really sad part arrow dig flicks like actually f

In [19]:
def process_train(directory, vocab):
    documents = list()
    for filename in listdir(directory):
        if not filename.startswith('cv9'):
            path = directory + '/' + filename
            doc = load_doc(path)
            tokens = clean_doc(doc, vocab)
            documents.append(tokens)
    return documents


def process_test(directory, vocab):
    documents = list()
    for filename in listdir(directory):
        if filename.startswith('cv9'):
            path = directory + '/' + filename
            doc = load_doc(path)
            tokens = clean_doc(doc, vocab)
            documents.append(tokens)
    return documents

In [22]:
# Load all docs in directory
def process_docs(directory, vocab, is_train):
    documents = list()
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        doc = load_doc(path)
        tokens = clean_doc(doc)
        documents.append(tokens)
    return documents

In [26]:
lines = process_docs(directory='./data/review_polarity-20231215T050834Z-001/review_polarity/txt_sentoken/pos',
                     vocab=vocab,
                     is_train=True)
len(lines)

900

In [27]:
lines

[['get',
  'slap',
  'together',
  'movie',
  'based',
  'story',
  'legendary',
  'george',
  'lucas',
  'directed',
  'virtuoso',
  'director',
  'steven',
  'spielberg',
  'starring',
  'one',
  'biggest',
  'boxoffice',
  'stars',
  'world',
  'harrison',
  'ford',
  'get',
  'one',
  'hotfudgerockin',
  'good',
  'time',
  'thats',
  'get',
  'plot',
  'professorarcheologist',
  'indiana',
  'jones',
  'sets',
  'find',
  'longlost',
  'mystical',
  'ark',
  'covenant',
  'nazis',
  'get',
  'grubby',
  'fingers',
  'hands',
  'adventures',
  'snakes',
  'romance',
  'mucho',
  'action',
  'ensues',
  'critique',
  'astounding',
  'movie',
  'packed',
  'nonstop',
  'action',
  'stunts',
  'galore',
  'interesting',
  'story',
  'line',
  'great',
  'oneliners',
  'solid',
  'cast',
  'catchy',
  'musical',
  'score',
  'fun',
  'adventure',
  'could',
  'squeeze',
  'twohour',
  'thrill',
  'ride',
  'dont',
  'enjoy',
  'film',
  'dont',
  'like',
  'action',
  'movies',
  'peri