# Data Preprocessing 

## Importing the libraries

In [39]:
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [40]:
filename='./pg5200.txt'
file=open(filename,'rt')
text=file.read()
file.close()

### Splitting the text corpus into words

In [41]:
tokens=word_tokenize(text)

### Converting the text to lowercase

In [42]:
token=[w.lower() for w in tokens]

### Removing the punctuation characters in our data

In [43]:
re_punc=re.compile('[%s]' % re.escape(string.punctuation))

### Removing punctions inside of words that contain punctions which is not taken care of above

In [44]:
stripped=[re_punc.sub('',w) for w in tokens]

### Removing the words that are not alphabetic 

In [45]:
words=[word for word in stripped if word.isalpha()]

### Removing stop words

In [46]:
stop_words=set(stopwords.words('english'))
words=[w for w in words if not w in stop_words]
print(words[:100])

['ï', 'Project', 'Gutenberg', 'EBook', 'Metamorphosis', 'Franz', 'Kafka', 'Translated', 'David', 'Wyllie', 'This', 'eBook', 'use', 'anyone', 'anywhere', 'cost', 'almost', 'restrictions', 'whatsoever', 'You', 'may', 'copy', 'give', 'away', 'reuse', 'terms', 'Project', 'Gutenberg', 'License', 'included', 'eBook', 'online', 'wwwgutenbergnet', 'This', 'COPYRIGHTED', 'Project', 'Gutenberg', 'eBook', 'Details', 'Below', 'Please', 'follow', 'copyright', 'guidelines', 'file', 'Title', 'Metamorphosis', 'Author', 'Franz', 'Kafka', 'Translator', 'David', 'Wyllie', 'Release', 'Date', 'August', 'EBook', 'First', 'posted', 'May', 'Last', 'updated', 'May', 'Language', 'English', 'START', 'OF', 'THIS', 'PROJECT', 'GUTENBERG', 'EBOOK', 'METAMORPHOSIS', 'Copyright', 'C', 'David', 'Wyllie', 'Metamorphosis', 'Franz', 'Kafka', 'Translated', 'David', 'Wyllie', 'I', 'One', 'morning', 'Gregor', 'Samsa', 'woke', 'troubled', 'dreams', 'found', 'transformed', 'bed', 'horrible', 'vermin', 'He', 'lay', 'armourlike

In [47]:
words=[word.lower() for word in words]

In [48]:
words[:100]

['ï',
 'project',
 'gutenberg',
 'ebook',
 'metamorphosis',
 'franz',
 'kafka',
 'translated',
 'david',
 'wyllie',
 'this',
 'ebook',
 'use',
 'anyone',
 'anywhere',
 'cost',
 'almost',
 'restrictions',
 'whatsoever',
 'you',
 'may',
 'copy',
 'give',
 'away',
 'reuse',
 'terms',
 'project',
 'gutenberg',
 'license',
 'included',
 'ebook',
 'online',
 'wwwgutenbergnet',
 'this',
 'copyrighted',
 'project',
 'gutenberg',
 'ebook',
 'details',
 'below',
 'please',
 'follow',
 'copyright',
 'guidelines',
 'file',
 'title',
 'metamorphosis',
 'author',
 'franz',
 'kafka',
 'translator',
 'david',
 'wyllie',
 'release',
 'date',
 'august',
 'ebook',
 'first',
 'posted',
 'may',
 'last',
 'updated',
 'may',
 'language',
 'english',
 'start',
 'of',
 'this',
 'project',
 'gutenberg',
 'ebook',
 'metamorphosis',
 'copyright',
 'c',
 'david',
 'wyllie',
 'metamorphosis',
 'franz',
 'kafka',
 'translated',
 'david',
 'wyllie',
 'i',
 'one',
 'morning',
 'gregor',
 'samsa',
 'woke',
 'troubled',

## Stemming the words
Stemming is the process of reducing each word to its root or base form. Example, fishing, fished, fisher all are reduced to fish. 
There are many stemming algorithms, but we are going to be using the Porter stemming algorithm. This is available in NLTK via the PorterStemmer class.

In [49]:
from nltk.stem.porter import PorterStemmer
porter=PorterStemmer()
stemmed=[porter.stem(word) for word in words] 

In [50]:
stemmed[:100]

['ï',
 'project',
 'gutenberg',
 'ebook',
 'metamorphosi',
 'franz',
 'kafka',
 'translat',
 'david',
 'wylli',
 'thi',
 'ebook',
 'use',
 'anyon',
 'anywher',
 'cost',
 'almost',
 'restrict',
 'whatsoev',
 'you',
 'may',
 'copi',
 'give',
 'away',
 'reus',
 'term',
 'project',
 'gutenberg',
 'licens',
 'includ',
 'ebook',
 'onlin',
 'wwwgutenbergnet',
 'thi',
 'copyright',
 'project',
 'gutenberg',
 'ebook',
 'detail',
 'below',
 'pleas',
 'follow',
 'copyright',
 'guidelin',
 'file',
 'titl',
 'metamorphosi',
 'author',
 'franz',
 'kafka',
 'translat',
 'david',
 'wylli',
 'releas',
 'date',
 'august',
 'ebook',
 'first',
 'post',
 'may',
 'last',
 'updat',
 'may',
 'languag',
 'english',
 'start',
 'of',
 'thi',
 'project',
 'gutenberg',
 'ebook',
 'metamorphosi',
 'copyright',
 'c',
 'david',
 'wylli',
 'metamorphosi',
 'franz',
 'kafka',
 'translat',
 'david',
 'wylli',
 'i',
 'one',
 'morn',
 'gregor',
 'samsa',
 'woke',
 'troubl',
 'dream',
 'found',
 'transform',
 'bed',
 'horr