In [None]:
#How to Prepare Text Data for Machine Learning with scikit-learn
#Movie review dataset: “txt_sentoken” has two sub-directories, “neg” and “pos”, containing the text
#of the negative and positive reviews. Reviews are stored one per file with a naming convention 
#cv000 to cv999 for each of neg and pos.

In [None]:
#Import necessary libraries 
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords
from os import listdir

In [1]:
#Step1: A function called load_doc() that takes a filename of the document to load and returns the text.

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: path of the file to load.

	Returns:
		The full text of the file.

	Raises:
		OSError: if the file cannot be opened or read.
	"""
	# the with-block guarantees the handle is closed even if read() raises
	with open(filename, 'r') as file:
		return file.read()

In [7]:
#Step2: We have two directories, each with 1,000 documents. We can process each directory in turn by first getting
#a list of files in the directory using the listdir() function, then loading each file in turn.
#For example, we can load each document in the negative directory using the load_doc() function to 
#do the actual loading

from os import listdir
# load all docs in a directory
def process_docs(directory):
	"""Load every '.txt' file found directly in `directory`, reporting each one.

	Entries whose names do not end in ".txt" are ignored. Each remaining file
	is read with load_doc() (the text is discarded) and 'Loaded <name>' is
	printed.

	Args:
		directory: path of the folder to scan.
	"""
	# keep only the entries that carry the expected extension
	txt_names = [name for name in listdir(directory) if name.endswith(".txt")]
	for name in txt_names:
		# full path is the directory, a slash and the file name
		load_doc(directory + '/' + name)
		print('Loaded %s' % name)
 
# directory holding the negative reviews
# NOTE(review): absolute, machine-specific path - adjust to where txt_sentoken lives locally
directory = '/Users/hasanuzz/Desktop/review_data/txt_sentoken/neg'
# load every .txt file in that directory; one 'Loaded ...' line is printed per file
process_docs(directory)


Loaded cv676_22202.txt
Loaded cv839_22807.txt
Loaded cv155_7845.txt
Loaded cv465_23401.txt
Loaded cv398_17047.txt
Loaded cv206_15893.txt
Loaded cv037_19798.txt
Loaded cv279_19452.txt
Loaded cv646_16817.txt
Loaded cv756_23676.txt
Loaded cv823_17055.txt
Loaded cv747_18189.txt
Loaded cv258_5627.txt
Loaded cv948_25870.txt
Loaded cv744_10091.txt
Loaded cv754_7709.txt
Loaded cv838_25886.txt
Loaded cv131_11568.txt
Loaded cv401_13758.txt
Loaded cv523_18285.txt
Loaded cv073_23039.txt
Loaded cv688_7884.txt
Loaded cv664_4264.txt
Loaded cv461_21124.txt
Loaded cv909_9973.txt
Loaded cv939_11247.txt
Loaded cv368_11090.txt
Loaded cv185_28372.txt
Loaded cv749_18960.txt
Loaded cv836_14311.txt
Loaded cv322_21820.txt
Loaded cv789_12991.txt
Loaded cv617_9561.txt
Loaded cv288_20212.txt
Loaded cv464_17076.txt
Loaded cv904_25663.txt
Loaded cv866_29447.txt
Loaded cv429_7937.txt
Loaded cv212_10054.txt
Loaded cv007_4992.txt
Loaded cv522_5418.txt
Loaded cv109_22599.txt
Loaded cv753_11812.txt
Loaded cv312_29308.tx

In [8]:
#Step3: Clean Text Data

#We will assume that we will be using a bag-of-words model or perhaps a word embedding that does not require 
#too much preparation.

#Split into Tokens

#First, let’s load one document and look at the raw tokens split by white space. 
#We will use the load_doc() function developed in the previous section. We can use the split() 
#function to split the loaded document into tokens separated by white space.

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: path of the file to load.

	Returns:
		The full text of the file.

	Raises:
		OSError: if the file cannot be opened or read.
	"""
	# the with-block guarantees the handle is closed even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# load one negative review as a single string
# NOTE(review): absolute, machine-specific path - adjust to the local dataset location
filename = '/Users/hasanuzz/Desktop/review_data/txt_sentoken/neg/cv000_29416.txt'
text = load_doc(filename)
# split into tokens by white space; punctuation stays attached to or stands as its own token
tokens = text.split()
# show the raw, uncleaned tokens
print(tokens)

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', "what's", 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind-fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'bad', 'ways', 'of'

In [10]:
#Step3 Continues: We could do various things to clean the data such as 

# Remove punctuation from words (e.g. ‘what’s’).
# Removing tokens that are just punctuation (e.g. ‘-‘).
# Removing tokens that contain numbers (e.g. ’10/10′).
# Remove tokens that have one character (e.g. ‘a’).
# Remove tokens that don’t have much meaning (e.g. ‘and’)

# Some ideas:

# We can filter out punctuation from tokens using the string translate() function.
# We can remove tokens that are just punctuation or contain numbers by using an isalpha() check on each token.
# We can remove English stop words using the list loaded using NLTK.
# We can filter out short tokens by checking their length.

from nltk.corpus import stopwords
import string

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: path of the file to load.

	Returns:
		The full text of the file.

	Raises:
		OSError: if the file cannot be opened or read.
	"""
	# the with-block guarantees the handle is closed even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# load one negative review as a single string
# NOTE(review): absolute, machine-specific path - adjust to the local dataset location
filename = '/Users/hasanuzz/Desktop/review_data/txt_sentoken/neg/cv000_29416.txt'
text = load_doc(filename)
# split into tokens by white space
tokens = text.split()
# remove punctuation from each token: the translation table deletes every
# character listed in string.punctuation
table = str.maketrans('', '', string.punctuation)
tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not purely alphabetic (this also drops
# tokens left empty after punctuation removal, since ''.isalpha() is False)
tokens = [word for word in tokens if word.isalpha()]
# filter out stop words; a set makes each membership test cheap
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]
# filter out short tokens (single characters)
tokens = [word for word in tokens if len(word) > 1]
# show the cleaned tokens
print(tokens)

['plot', 'two', 'teen', 'couples', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guys', 'dies', 'girlfriend', 'continues', 'see', 'life', 'nightmares', 'whats', 'deal', 'watch', 'movie', 'sorta', 'find', 'critique', 'mindfuck', 'movie', 'teen', 'generation', 'touches', 'cool', 'idea', 'presents', 'bad', 'package', 'makes', 'review', 'even', 'harder', 'one', 'write', 'since', 'generally', 'applaud', 'films', 'attempt', 'break', 'mold', 'mess', 'head', 'lost', 'highway', 'memento', 'good', 'bad', 'ways', 'making', 'types', 'films', 'folks', 'didnt', 'snag', 'one', 'correctly', 'seem', 'taken', 'pretty', 'neat', 'concept', 'executed', 'terribly', 'problems', 'movie', 'well', 'main', 'problem', 'simply', 'jumbled', 'starts', 'normal', 'downshifts', 'fantasy', 'world', 'audience', 'member', 'idea', 'whats', 'going', 'dreams', 'characters', 'coming', 'back', 'dead', 'others', 'look', 'like', 'dead', 'strange', 'apparitions', 'disappearances', 'looooot', 'chase', 'scen

In [12]:
#Step3 Continues:
# We can put the above block into a function called clean_doc() and test it on another review, this time a positive review.

from nltk.corpus import stopwords
import string

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: path of the file to load.

	Returns:
		The full text of the file.

	Raises:
		OSError: if the file cannot be opened or read.
	"""
	# the with-block guarantees the handle is closed even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# turn a doc into clean tokens
def clean_doc(doc):
	"""Tokenize a raw document and keep only the useful word tokens.

	The text is split on white space and every character in
	string.punctuation is stripped from each token. A token is kept only if
	what remains is purely alphabetic, is not in the NLTK English stop-word
	list, and is longer than one character.

	Note: stop words are matched after punctuation removal, so a contraction
	such as "don't" is compared in its stripped form ('dont').

	Args:
		doc: the document text as a single string.

	Returns:
		List of surviving tokens, in document order.
	"""
	raw_tokens = doc.split()
	strip_punct = str.maketrans('', '', string.punctuation)
	english_stops = set(stopwords.words('english'))
	kept = []
	for raw in raw_tokens:
		word = raw.translate(strip_punct)
		# reject anything with digits/symbols left, or emptied by stripping
		if not word.isalpha():
			continue
		if word in english_stops:
			continue
		# single letters carry no signal
		if len(word) <= 1:
			continue
		kept.append(word)
	return kept

# load one positive review as a single string
# NOTE(review): absolute, machine-specific path - adjust to the local dataset location
filename = '/Users/hasanuzz/Desktop/review_data/txt_sentoken/pos/cv000_29590.txt'
text = load_doc(filename)
# run the full cleaning pipeline and show the resulting tokens
tokens = clean_doc(text)
print(tokens)

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'getting', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anythi

In [None]:
# Step4: Develop Vocabulary 

# We need to develop a new function to process a document and add it to the vocabulary. The function needs to load a
# document by calling the previously developed load_doc() function. It needs to clean the loaded document using the 
# previously developed clean_doc() function, then it needs to add all the tokens to the Counter (a dict subclass for counting), 
# and update counts. We can do this last step by calling the update() function on the counter object.

#  add_doc_to_vocab() a function that takes as arguments a document filename and a Counter vocabulary.

# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	"""Count the cleaned tokens of one document into `vocab`.

	Args:
		filename: path of the document, read with load_doc().
		vocab: counter object; its update() method is called with the
			tokens returned by clean_doc(), so it is modified in place.
	"""
	# load -> clean -> count, in one expression
	vocab.update(clean_doc(load_doc(filename)))


In [None]:
#  Step4: Develop Vocabulary (continues)
# Finally, we can use our template above for processing all documents in a directory called process_docs() and update 
# it to call add_doc_to_vocab()

# load all docs in a directory
def process_docs(directory, vocab):
	"""Add every '.txt' document found directly in `directory` to `vocab`.

	Entries whose names do not end in ".txt" are ignored; each remaining
	file is passed to add_doc_to_vocab().

	Args:
		directory: path of the folder to scan.
		vocab: vocabulary counter handed on to add_doc_to_vocab().
	"""
	# keep only the entries that carry the expected extension
	txt_names = [name for name in listdir(directory) if name.endswith(".txt")]
	for name in txt_names:
		# full path is the directory, a slash and the file name
		add_doc_to_vocab(directory + '/' + name, vocab)
        

In [13]:
# We can put all of this together and develop a full vocabulary from all documents in the dataset.

from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: path of the file to load.

	Returns:
		The full text of the file.

	Raises:
		OSError: if the file cannot be opened or read.
	"""
	# the with-block guarantees the handle is closed even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# turn a doc into clean tokens
def clean_doc(doc):
	"""Tokenize a raw document and keep only the useful word tokens.

	The text is split on white space and every character in `punctuation`
	is stripped from each token. A token is kept only if what remains is
	purely alphabetic, is not in the NLTK English stop-word list, and is
	longer than one character.

	Note: stop words are matched after punctuation removal, so a contraction
	such as "don't" is compared in its stripped form ('dont').

	Args:
		doc: the document text as a single string.

	Returns:
		List of surviving tokens, in document order.
	"""
	raw_tokens = doc.split()
	strip_punct = str.maketrans('', '', punctuation)
	english_stops = set(stopwords.words('english'))
	kept = []
	for raw in raw_tokens:
		word = raw.translate(strip_punct)
		# reject anything with digits/symbols left, or emptied by stripping
		if not word.isalpha():
			continue
		if word in english_stops:
			continue
		# single letters carry no signal
		if len(word) <= 1:
			continue
		kept.append(word)
	return kept

# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	"""Count the cleaned tokens of one document into `vocab`.

	Args:
		filename: path of the document, read with load_doc().
		vocab: counter object; its update() method is called with the
			tokens returned by clean_doc(), so it is modified in place.
	"""
	# load -> clean -> count, in one expression
	vocab.update(clean_doc(load_doc(filename)))

# load all docs in a directory
def process_docs(directory, vocab):
	"""Add every '.txt' document found directly in `directory` to `vocab`.

	Entries whose names do not end in ".txt" are ignored; each remaining
	file is passed to add_doc_to_vocab().

	Args:
		directory: path of the folder to scan.
		vocab: vocabulary counter handed on to add_doc_to_vocab().
	"""
	# keep only the entries that carry the expected extension
	txt_names = [name for name in listdir(directory) if name.endswith(".txt")]
	for name in txt_names:
		# full path is the directory, a slash and the file name
		add_doc_to_vocab(directory + '/' + name, vocab)

# start from an empty token counter
vocab = Counter()
# count the tokens of every negative and then every positive review into the same counter
# NOTE(review): absolute, machine-specific paths - adjust to the local dataset location
process_docs('/Users/hasanuzz/Desktop/review_data/txt_sentoken/neg', vocab)
process_docs('/Users/hasanuzz/Desktop/review_data/txt_sentoken/pos', vocab)
# print the size of the vocab (number of distinct tokens)
print(len(vocab))
# print the 50 most frequent tokens with their counts
print(vocab.most_common(50))

46557
[('film', 8860), ('one', 5521), ('movie', 5440), ('like', 3553), ('even', 2555), ('good', 2320), ('time', 2283), ('story', 2118), ('films', 2102), ('would', 2042), ('much', 2024), ('also', 1965), ('characters', 1947), ('get', 1921), ('character', 1906), ('two', 1825), ('first', 1768), ('see', 1730), ('well', 1694), ('way', 1668), ('make', 1590), ('really', 1563), ('little', 1491), ('life', 1472), ('plot', 1451), ('people', 1420), ('movies', 1416), ('could', 1395), ('bad', 1374), ('scene', 1373), ('never', 1364), ('best', 1301), ('new', 1277), ('many', 1268), ('doesnt', 1267), ('man', 1266), ('scenes', 1265), ('dont', 1210), ('know', 1207), ('hes', 1150), ('great', 1141), ('another', 1111), ('love', 1089), ('action', 1078), ('go', 1075), ('us', 1065), ('director', 1056), ('something', 1048), ('end', 1047), ('still', 1038)]


In [14]:
# Least common words, those that only appear once across all reviews, are not predictive. Perhaps some of the most 
# common words are not useful too.

# keep tokens that occur at least min_occurane times (the comparison is >=,
# so a token seen exactly 5 times is kept)
min_occurane = 5
tokens = [k for k,c in vocab.items() if c >= min_occurane]
# report how many tokens survive the threshold
print(len(tokens))

14803


In [None]:
# We can then save the chosen vocabulary of words to a new file.
def save_list(lines, filename):
	"""Write the strings in `lines` to `filename`, one per line.

	The items are joined with a newline; no trailing newline is added. An
	existing file at `filename` is overwritten.

	Args:
		lines: iterable of strings to write.
		filename: path of the file to create or overwrite.

	Raises:
		TypeError: if an item in `lines` is not a string.
		OSError: if the file cannot be opened or written.
	"""
	# join before opening so a bad item fails without truncating an existing file
	data = '\n'.join(lines)
	# the with-block guarantees the handle is flushed and closed even if write() raises
	with open(filename, 'w') as file:
		file.write(data)

In [15]:
# The complete example for defining and saving the vocabulary is listed below.
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: path of the file to load.

	Returns:
		The full text of the file.

	Raises:
		OSError: if the file cannot be opened or read.
	"""
	# the with-block guarantees the handle is closed even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# turn a doc into clean tokens
def clean_doc(doc):
	"""Tokenize a raw document and keep only the useful word tokens.

	The text is split on white space and every character in `punctuation`
	is stripped from each token. A token is kept only if what remains is
	purely alphabetic, is not in the NLTK English stop-word list, and is
	longer than one character.

	Note: stop words are matched after punctuation removal, so a contraction
	such as "don't" is compared in its stripped form ('dont').

	Args:
		doc: the document text as a single string.

	Returns:
		List of surviving tokens, in document order.
	"""
	raw_tokens = doc.split()
	strip_punct = str.maketrans('', '', punctuation)
	english_stops = set(stopwords.words('english'))
	kept = []
	for raw in raw_tokens:
		word = raw.translate(strip_punct)
		# reject anything with digits/symbols left, or emptied by stripping
		if not word.isalpha():
			continue
		if word in english_stops:
			continue
		# single letters carry no signal
		if len(word) <= 1:
			continue
		kept.append(word)
	return kept

# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
	"""Count the cleaned tokens of one document into `vocab`.

	Args:
		filename: path of the document, read with load_doc().
		vocab: counter object; its update() method is called with the
			tokens returned by clean_doc(), so it is modified in place.
	"""
	# load -> clean -> count, in one expression
	vocab.update(clean_doc(load_doc(filename)))

# load all docs in a directory
def process_docs(directory, vocab):
	"""Add every '.txt' document found directly in `directory` to `vocab`.

	Entries whose names do not end in ".txt" are ignored; each remaining
	file is passed to add_doc_to_vocab().

	Args:
		directory: path of the folder to scan.
		vocab: vocabulary counter handed on to add_doc_to_vocab().
	"""
	# keep only the entries that carry the expected extension
	txt_names = [name for name in listdir(directory) if name.endswith(".txt")]
	for name in txt_names:
		# full path is the directory, a slash and the file name
		add_doc_to_vocab(directory + '/' + name, vocab)

# save list to file
def save_list(lines, filename):
	"""Write the strings in `lines` to `filename`, one per line.

	The items are joined with a newline; no trailing newline is added. An
	existing file at `filename` is overwritten.

	Args:
		lines: iterable of strings to write.
		filename: path of the file to create or overwrite.

	Raises:
		TypeError: if an item in `lines` is not a string.
		OSError: if the file cannot be opened or written.
	"""
	# join before opening so a bad item fails without truncating an existing file
	data = '\n'.join(lines)
	# the with-block guarantees the handle is flushed and closed even if write() raises
	with open(filename, 'w') as file:
		file.write(data)

# start from an empty token counter
vocab = Counter()
# count the tokens of every negative and then every positive review into the same counter
# NOTE(review): absolute, machine-specific paths - adjust to the local dataset location
process_docs('/Users/hasanuzz/Desktop/review_data/txt_sentoken/neg', vocab)
process_docs('/Users/hasanuzz/Desktop/review_data/txt_sentoken/pos', vocab)
# print the size of the vocab (number of distinct tokens)
print(len(vocab))
# print the 50 most frequent tokens with their counts
print(vocab.most_common(50))
# keep tokens that occur at least min_occurane times (the comparison is >=,
# so a token seen exactly 5 times is kept)
min_occurane = 5
tokens = [k for k,c in vocab.items() if c >= min_occurane]
# report how many tokens survive the threshold
print(len(tokens))
# save the surviving tokens to a vocabulary file, one token per line
save_list(tokens, '/Users/hasanuzz/Desktop/review_data/vocab.txt')

46557
[('film', 8860), ('one', 5521), ('movie', 5440), ('like', 3553), ('even', 2555), ('good', 2320), ('time', 2283), ('story', 2118), ('films', 2102), ('would', 2042), ('much', 2024), ('also', 1965), ('characters', 1947), ('get', 1921), ('character', 1906), ('two', 1825), ('first', 1768), ('see', 1730), ('well', 1694), ('way', 1668), ('make', 1590), ('really', 1563), ('little', 1491), ('life', 1472), ('plot', 1451), ('people', 1420), ('movies', 1416), ('could', 1395), ('bad', 1374), ('scene', 1373), ('never', 1364), ('best', 1301), ('new', 1277), ('many', 1268), ('doesnt', 1267), ('man', 1266), ('scenes', 1265), ('dont', 1210), ('know', 1207), ('hes', 1150), ('great', 1141), ('another', 1111), ('love', 1089), ('action', 1078), ('go', 1075), ('us', 1065), ('director', 1056), ('something', 1048), ('end', 1047), ('still', 1038)]
14803


In [16]:
#Step 5 (Final Step): Save Prepared Data

# We can use the data cleaning and chosen vocabulary to prepare each movie review and save the prepared versions of the 
# reviews ready for modeling. This is a good practice as it decouples the data preparation from modeling, allowing you 
# to focus on modeling and circle back to data prep if you have new ideas.

# We can start off by loading the vocabulary from ‘vocab.txt‘.

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: path of the file to load.

	Returns:
		The full text of the file.

	Raises:
		OSError: if the file cannot be opened or read.
	"""
	# the with-block guarantees the handle is closed even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# load the saved vocabulary file as one string
# NOTE(review): absolute, machine-specific path - adjust to where vocab.txt was saved
vocab_filename = '/Users/hasanuzz/Desktop/review_data/vocab.txt'
vocab = load_doc(vocab_filename)
# split on white space to recover the individual tokens
vocab = vocab.split()
# convert to a set for fast membership tests when filtering reviews
vocab = set(vocab)

In [None]:
# Next, we can clean the reviews, use the loaded vocab to filter out unwanted tokens, and save the clean reviews in a 
# new file. One approach could be to save all the positive reviews in one file and all the negative reviews in another 
# file, with the filtered tokens separated by white space for each review on separate lines.

# First, we can define a function to process a document, clean it, filter it, and return it as a single line that could 
# be saved in a file. Below defines the doc_to_line() function to do just that, taking a filename and vocabulary
# (as a set) as arguments. It calls the previously defined load_doc() function to load the document and clean_doc() 
# to tokenize the document.

# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
	"""Turn one document into a single space-separated line of known tokens.

	The file is read with load_doc() and tokenized with clean_doc(); tokens
	that are not members of `vocab` are dropped.

	Args:
		filename: path of the document to convert.
		vocab: container of allowed tokens, tested with `in`.

	Returns:
		The surviving tokens joined by single spaces, in document order.
	"""
	kept = []
	for token in clean_doc(load_doc(filename)):
		# keep only tokens that belong to the chosen vocabulary
		if token in vocab:
			kept.append(token)
	return ' '.join(kept)


In [None]:
# Next, we can define a new version of process_docs() to step through all reviews in a folder and convert them to 
# lines by calling doc_to_line() for each document. A list of lines is then returned.

# load all docs in a directory
def process_docs(directory, vocab):
	"""Convert every '.txt' document found directly in `directory` to a line.

	Entries whose names do not end in ".txt" are ignored; each remaining
	file is converted with doc_to_line().

	Args:
		directory: path of the folder to scan.
		vocab: allowed tokens, handed on to doc_to_line().

	Returns:
		List with one string per processed document, in listdir() order.
	"""
	# path is the directory, a slash and the file name
	return [
		doc_to_line(directory + '/' + name, vocab)
		for name in listdir(directory)
		if name.endswith(".txt")
	]


In [18]:
# We can then call process_docs() for both the directories of positive and negative reviews, then call save_list() 
# from the previous section to save each list of processed reviews to a file.

# The complete code 
from string import punctuation
from os import listdir
from collections import Counter
from nltk.corpus import stopwords

# load doc into memory
def load_doc(filename):
	"""Read a text file and return its entire contents as one string.

	Args:
		filename: path of the file to load.

	Returns:
		The full text of the file.

	Raises:
		OSError: if the file cannot be opened or read.
	"""
	# the with-block guarantees the handle is closed even if read() raises
	with open(filename, 'r') as file:
		return file.read()

# turn a doc into clean tokens
def clean_doc(doc):
	"""Tokenize a raw document and keep only the useful word tokens.

	The text is split on white space and every character in `punctuation`
	is stripped from each token. A token is kept only if what remains is
	purely alphabetic, is not in the NLTK English stop-word list, and is
	longer than one character.

	Note: stop words are matched after punctuation removal, so a contraction
	such as "don't" is compared in its stripped form ('dont').

	Args:
		doc: the document text as a single string.

	Returns:
		List of surviving tokens, in document order.
	"""
	raw_tokens = doc.split()
	strip_punct = str.maketrans('', '', punctuation)
	english_stops = set(stopwords.words('english'))
	kept = []
	for raw in raw_tokens:
		word = raw.translate(strip_punct)
		# reject anything with digits/symbols left, or emptied by stripping
		if not word.isalpha():
			continue
		if word in english_stops:
			continue
		# single letters carry no signal
		if len(word) <= 1:
			continue
		kept.append(word)
	return kept

# save list to file
def save_list(lines, filename):
	"""Write the strings in `lines` to `filename`, one per line.

	The items are joined with a newline; no trailing newline is added. An
	existing file at `filename` is overwritten.

	Args:
		lines: iterable of strings to write.
		filename: path of the file to create or overwrite.

	Raises:
		TypeError: if an item in `lines` is not a string.
		OSError: if the file cannot be opened or written.
	"""
	# join before opening so a bad item fails without truncating an existing file
	data = '\n'.join(lines)
	# the with-block guarantees the handle is flushed and closed even if write() raises
	with open(filename, 'w') as file:
		file.write(data)

# load doc, clean and return line of tokens
def doc_to_line(filename, vocab):
	"""Turn one document into a single space-separated line of known tokens.

	The file is read with load_doc() and tokenized with clean_doc(); tokens
	that are not members of `vocab` are dropped.

	Args:
		filename: path of the document to convert.
		vocab: container of allowed tokens, tested with `in`.

	Returns:
		The surviving tokens joined by single spaces, in document order.
	"""
	kept = []
	for token in clean_doc(load_doc(filename)):
		# keep only tokens that belong to the chosen vocabulary
		if token in vocab:
			kept.append(token)
	return ' '.join(kept)

# load all docs in a directory
def process_docs(directory, vocab):
	"""Convert every '.txt' document found directly in `directory` to a line.

	Entries whose names do not end in ".txt" are ignored; each remaining
	file is converted with doc_to_line().

	Args:
		directory: path of the folder to scan.
		vocab: allowed tokens, handed on to doc_to_line().

	Returns:
		List with one string per processed document, in listdir() order.
	"""
	# path is the directory, a slash and the file name
	return [
		doc_to_line(directory + '/' + name, vocab)
		for name in listdir(directory)
		if name.endswith(".txt")
	]

# load the saved vocabulary, split it on white space and keep it as a set
# for fast membership tests
# NOTE(review): absolute, machine-specific paths throughout this cell - adjust to the local layout
vocab_filename = '/Users/hasanuzz/Desktop/review_data/vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
# prepare negative reviews: one filtered line per review, written to negative.txt
negative_lines = process_docs('/Users/hasanuzz/Desktop/review_data/txt_sentoken/neg', vocab)
save_list(negative_lines, '/Users/hasanuzz/Desktop/review_data/negative.txt')
# prepare positive reviews: one filtered line per review, written to positive.txt
positive_lines = process_docs('/Users/hasanuzz/Desktop/review_data/txt_sentoken/pos', vocab)
save_list(positive_lines, '/Users/hasanuzz/Desktop/review_data/positive.txt')