# Stemming Words

In [3]:
import nltk
from urllib import request

### Preparing the Text for Processing

In [8]:
# Requesting the text
url = 'https://www.gutenberg.org/files/55463/55463-0.txt'
response = request.urlopen(url)
raw = response.read().decode('utf8')

In [34]:
raw_corpus = raw[raw.find('LUCK ***')+9:raw.find('END OF PROJECT')]

In [35]:
tokens = nltk.word_tokenize(raw_corpus)

In [36]:
text = nltk.Text(tokens)

### Reducing words to their core form

#### Using standardized Stemmer Versions

In [42]:
# Loading the Stemmersb
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

In [44]:
# Craeting lemmatized versions of the text with either stemmer
tokens_porter = [porter.stem(w) for w in tokens]
tokens_lanc = [lancaster.stem(w) for w in tokens]

Investigating the results

In [55]:
tokens_porter[1500:1510]

['.', 'they', 'had', 'given', 'him', 'no', 'firm', 'foothold', 'in', 'alight']

In [54]:
tokens_lanc[1500:1510]

['.', 'they', 'had', 'giv', 'him', 'no', 'firm', 'foothold', 'in', 'alight']

#### Lemmatizer

In [56]:
from nltk.stem import WordNetLemmatizer

# Creating the Lemmatizer
lemma = WordNetLemmatizer()
tokens_lemma = [lemma.lemmatize(w) for w in tokens]

In [59]:
tokens_lemma[:25]

['Produced',
 'by',
 'Demian',
 'Katz',
 ',',
 'Craig',
 'Kirkwood',
 ',',
 'and',
 'the',
 'Online',
 'Distributed',
 'Proofreading',
 'Team',
 'at',
 'http',
 ':',
 '//www.pgdp.net',
 '(',
 'Northern',
 'Illinois',
 'University',
 'Digital',
 'Library',
 'at']

## Creating Features

Manually creating feature representations on strings to work with in Machine Learning Models

In [61]:
[(len(token), token, token.istitle()) for token in tokens_lemma]

[(8, 'Produced', True),
 (2, 'by', False),
 (6, 'Demian', True),
 (4, 'Katz', True),
 (1, ',', False),
 (5, 'Craig', True),
 (8, 'Kirkwood', True),
 (1, ',', False),
 (3, 'and', False),
 (3, 'the', False),
 (6, 'Online', True),
 (11, 'Distributed', True),
 (12, 'Proofreading', True),
 (4, 'Team', True),
 (2, 'at', False),
 (4, 'http', False),
 (1, ':', False),
 (14, '//www.pgdp.net', False),
 (1, '(', False),
 (8, 'Northern', True),
 (8, 'Illinois', True),
 (10, 'University', True),
 (7, 'Digital', True),
 (7, 'Library', True),
 (2, 'at', False),
 (4, 'http', False),
 (1, ':', False),
 (22, '//digital.lib.niu.edu/', False),
 (1, ')', False),
 (11, 'Transcriber', True),
 (1, '’', False),
 (1, 's', False),
 (5, 'Notes', True),
 (1, ':', False),
 (4, 'Text', True),
 (8, 'enclosed', False),
 (2, 'by', False),
 (10, 'underscore', False),
 (2, 'is', False),
 (2, 'in', False),
 (6, 'italic', False),
 (1, '(', False),
 (9, '_italics_', False),
 (1, ')', False),
 (1, ',', False),
 (3, 'and', Fa

In [64]:
# Counting the longest word in a sequence
maxlen = max(len(token) for token in text)
[word for word in text if len(word) == maxlen]

['//www.gutenberg.org/5/5/4/6/55463/']

In [78]:
# Creating bigrams from text
word_token_sents = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(raw_corpus)]
n = 3
trigrams = [[sent[i:i+n] for i in range(len(sent)-n+1)] for sent in word_token_sents]

In [79]:
len(trigrams)

2370