# Create a NLP Pipeline to 'Clean' Reviews Data

1. Load Input File and Read Reviews
2. Tokenize
3. Remove Stopwords
4. Perform Stemming
5. Write cleaned data to output file

In [1]:
# Sample review text (includes punctuation and "<br />" HTML tags) used below
# to demonstrate getStemmedReview.
sample_text =  """I loved this movie since I was 7 and I saw it on the opening day. It was so touching and beautiful. I strongly recommend seeing for all. It's a movie to watch with your family by far.<br /><br />My MPAA rating: PG-13 for thematic elements, prolonged scenes of disastor, nudity/sexuality and some language."""

### NLTK

In [2]:
from nltk.tokenize import RegexpTokenizer #import regular expression tokenizer

In [4]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [6]:
# Init objects shared by the cleaning functions below.
# The r'...' prefix makes a raw string literal (backslashes are not treated as
# escapes); the pattern \w+ matches runs of word characters (letters, digits,
# underscore), so punctuation and whitespace act as token separators.
tokenizer = RegexpTokenizer(r'\w+')
# Stored as a set so the per-token membership test is fast.
en_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

In [11]:
def getStemmedReview(review):
    """Clean one raw review and return it as a string of stemmed tokens.

    The text is lowercased, HTML ``<br />`` tags are replaced with spaces,
    the result is split with the module-level ``tokenizer``, tokens present
    in ``en_stopwords`` are dropped, and the remaining tokens are stemmed
    with ``ps``.

    Args:
        review: Raw review text (str).

    Returns:
        str: The stemmed, stopword-free tokens joined by single spaces
        (an empty string if no tokens remain).
    """
    review = review.lower()
    # Replace every "<br />" tag, not only doubled ones: a lone tag would
    # otherwise be tokenized into the junk token "br". Doubled tags simply
    # become two spaces, which the tokenizer ignores.
    review = review.replace("<br />", " ")

    # Tokenize, drop stopwords and stem in one pass.
    tokens = tokenizer.tokenize(review)
    stemmed_tokens = [ps.stem(token) for token in tokens if token not in en_stopwords]

    return ' '.join(stemmed_tokens)

In [12]:
# Run the cleaner on the sample review; the cell output shows the cleaned text.
getStemmedReview(sample_text)

'love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag'

In [13]:
# So, this kind of cleaned review will be processed faster by our Machine Learning Algorithm.

In [20]:
# Write one function that accepts an input file of movie reviews and writes the cleaned reviews to an output file.

def getStemmedDocument(inputFile,outputFile):
    """Clean every review in ``inputFile`` and write the results to ``outputFile``.

    Each line of the input file is treated as one review, passed through
    ``getStemmedReview``, and written as one line of the output file.
    Both files are read/written as UTF-8.

    Args:
        inputFile: Path of the text file holding one review per line.
        outputFile: Path of the file to create (or overwrite) with the
            cleaned reviews.

    Returns:
        None. The result is the file written at ``outputFile``.
    """
    # The input is opened first so that a missing or unreadable input file
    # does not truncate an existing output file. The `with` block closes both
    # handles even if cleaning a review raises.
    with open(inputFile, encoding="utf8") as f, \
            open(outputFile, 'w', encoding="utf8") as out:
        # Stream line by line instead of loading the whole file into memory.
        for review in f:
            print(getStemmedReview(review), file=out)

In [23]:
# Read command line arguments, for that we need to import the sys module
import sys



In [22]:
# Clean the reviews file; both paths are relative to the current working
# directory, and the output file is overwritten if it already exists.
getStemmedDocument('IMDB/imdb_trainX.txt','IMDB/imdb_trainX_cleaned.txt')

In [26]:
# If we have to call the code from terminal then,

#inputFile = sys.argv[1]
#outputFile = sys.argv[2]
#getStemmedDocument(inputFile,outputFile)

#Run this script in terminal
#python clean_reviews.py imdb_toy_X.txt imdb_toy_clean_X.txt