# Movie Reviews Sentiment Classification

In [59]:
import string
import re
from os import listdir
from nltk.corpus import stopwords

## Load Doc into memory

In [60]:
def load_doc(filename):
    """Read a whole text file and return its contents as one string.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The full contents of the file.
    """
    # The context manager closes the handle even if read() raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(filename, 'r') as file:
        return file.read()

## Turn a document into clean tokens

In [62]:
def clean_doc(doc):
    """Split a document into tokens and keep only the useful words.

    A whitespace-separated token survives when, after every character from
    ``string.punctuation`` has been stripped out of it, it is purely
    alphabetic, is not an NLTK English stop word, and is longer than one
    character.

    Args:
        doc: The raw document text.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    # Bound substitution method of a regex matching any punctuation character
    strip_punct = re.compile('[%s]' % re.escape(string.punctuation)).sub
    english_stops = set(stopwords.words('english'))

    cleaned = []
    for raw in doc.split():
        word = strip_punct('', raw)
        # Drop tokens containing digits or other non-letters (also empties)
        if not word.isalpha():
            continue
        # Drop stop words
        if word in english_stops:
            continue
        # Drop single-character leftovers
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned

## Saving list to a file

In [63]:
def save_list(lines, filename):
    """Write a sequence of strings to a file, one entry per line.

    The entries are joined with a newline, so no trailing newline is written
    after the last one.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the output file; an existing file is overwritten.
    """
    data = '\n'.join(lines)
    # The context manager flushes and closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)

## Load a document, clean it and return a line of tokens

In [64]:
def doc_to_line(filename, vocab):
    """Load one document and reduce it to a single line of in-vocabulary tokens.

    Args:
        filename: Path of the document to load.
        vocab: Container of allowed tokens; membership is tested with ``in``.

    Returns:
        The document's cleaned tokens that appear in ``vocab``, joined by
        single spaces.
    """
    # Load and tokenize, then lazily keep only the tokens known to the vocabulary
    kept = (token for token in clean_doc(load_doc(filename)) if token in vocab)
    return ' '.join(kept)

## Load all docs in a directory

In [65]:
def process_docs(directory, vocab):
    """Convert every ``.txt`` file in a directory into a line of tokens.

    Args:
        directory: Path of the folder holding the documents.
        vocab: Container of allowed tokens, passed through to ``doc_to_line``.

    Returns:
        A list with one string per ``.txt`` file, in the order the files are
        returned by ``listdir``. The list is empty when the directory holds
        no ``.txt`` files.
    """
    # Local import keeps this block self-contained; the file only imports
    # ``listdir`` from ``os``.
    from os.path import join

    lines = []
    # Walk through all the files in the folder
    for filename in listdir(directory):
        # Skip anything that is not a text file. The previous bare ``next``
        # was a no-op expression, so such files were processed anyway.
        if not filename.endswith(".txt"):
            continue
        # Load and clean the doc, then collect its line
        lines.append(doc_to_line(join(directory, filename), vocab))
    # Return the whole list; returning only the last ``line`` made callers
    # receive a single string (or fail on an empty directory).
    return lines

In [66]:
# Read the vocabulary file and keep its whitespace-separated tokens as a set
vocab_filename='./vocab.txt'
vocab=set(load_doc(vocab_filename).split())

# Negative reviews: filter every document by the vocabulary, then save the result
negative_lines=process_docs('./txt_sentoken/neg/', vocab)
save_list(negative_lines,'negative.txt')

# Positive reviews: same treatment, written to a separate file
positive_lines=process_docs('./txt_sentoken/pos/', vocab)
save_list(positive_lines,'positive.txt')
