#### Import Libraries

In [1]:
import os
import string
import urllib
import urllib.request

import nltk
from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources used below: WordNet (needed by WordNetLemmatizer)
# and the stopword lists (read through nltk.corpus.stopwords).
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shava\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shava\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# read in the data
def print_some_url(source='alice_in_wonderland.txt', encoding='ISO-8859-1'):
    """Read a text resource and return its decoded contents.

    Despite the name, nothing is printed; the text is returned to the caller.

    Args:
        source: Path of a local file, or a URL understood by
            ``urllib.request.urlopen`` (e.g. ``http://`` or ``file://``).
        encoding: Character encoding used to decode the raw bytes.

    Returns:
        The full contents of ``source`` as a ``str``.

    Raises:
        ValueError / urllib.error.URLError: if ``source`` is neither an
            existing local file nor a URL that can be opened.
    """
    # A bare filename has no URL scheme, so urlopen() would reject it with
    # "unknown url type"; read existing local files directly instead.
    if os.path.exists(source):
        with open(source, 'rb') as f:
            return f.read().decode(encoding)

    with urllib.request.urlopen(source) as f:
        return f.read().decode(encoding)

# Load the whole text into `data`, then preview only its first 863 characters.
data = print_some_url()
print(data[:863])

Alice's Adventures in Wonderland

                ALICE'S ADVENTURES IN WONDERLAND

                          Lewis Carroll

               THE MILLENNIUM FULCRUM EDITION 3.0




                            CHAPTER I

                      Down the Rabbit-Hole


  Alice was beginning to get very tired of sitting by her sister
on the bank, and of having nothing to do:  once or twice she had
peeped into the book her sister was reading, but it had no
pictures or conversations in it, `and what is the use of a book,'
thought Alice `without pictures or conversation?'

  So she was considering in her own mind (as well as she could,
for the hot day made her feel very sleepy and stupid), whether
the pleasure of making a daisy-chain would be worth the trouble
of getting up and picking the daisies, when suddenly a White
Rabbit with pink eyes ran close by her.

 


### Convert to lowercase and remove punctuation 

In [3]:
def remove_punctuation(words):
    """Lower-case a string and delete every ASCII punctuation character.

    Args:
        words: The text to clean (a single string, not a list of words).

    Returns:
        The lower-cased text with all characters from ``string.punctuation``
        removed. Punctuation is deleted, not replaced by a space, so
        hyphenated words are joined together.
    """
    # A translation table whose third argument lists the characters to drop.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return words.lower().translate(drop_punctuation)

In [4]:
# Replace `data` with its lower-cased, punctuation-free form; the raw text is not kept.
data = remove_punctuation(data)

### Creating a bag of words and assigning stemmer and lemmatizer

In [6]:
# Snowball stemmer object configured for English (used by find_roots below)
stemmer = SnowballStemmer('english')

# split the cleaned text into word tokens with the Treebank tokeniser
tokeniser = TreebankWordTokenizer()
tokens = tokeniser.tokenize(data)

# WordNet lemmatiser object (used by find_roots below)
lemmatizer = WordNetLemmatizer()

# bag of words
def bag_of_words_count(words, word_dict=None):
    """Count how many times each word occurs.

    Args:
        words: Iterable of hashable tokens to count.
        word_dict: Optional dictionary of existing counts. When given, it is
            updated in place and returned, so counts accumulate on top of it.
            When omitted, a new empty dictionary is used for every call.

    Returns:
        A dictionary mapping each word to the number of times it appeared.
    """
    # A `{}` default would be created once and shared by every call, so
    # re-running the cell would keep adding to the previous counts.
    if word_dict is None:
        word_dict = {}

    for word in words:
        word_dict[word] = word_dict.get(word, 0) + 1
    return word_dict

# remove stopwords
# Build the stopword collection once, as a set: calling stopwords.words()
# inside the comprehension would rebuild the list for every single token and
# then scan it linearly.
english_stopwords = set(stopwords.words('english'))
tokens_less_stopwords = [word for word in tokens if word not in english_stopwords]

# create bag of words
# Pass a fresh dict explicitly so that re-running this cell never adds to
# counts left over from an earlier run.
bag_of_words = bag_of_words_count(tokens_less_stopwords, {})

### Find nth word stem and lemma

In [7]:
def find_roots(token_list, n):
    """Return the n-th token together with its stem and its lemma.

    Uses the module-level ``stemmer`` and ``lemmatizer`` objects.

    Args:
        token_list: Sequence of word tokens.
        n: 1-based position of the token to look up.

    Returns:
        A dict with the keys 'original', 'stem' and 'lemma'.

    Raises:
        IndexError: if ``n`` is not between 1 and ``len(token_list)``.
    """
    if n < 1:
        # n - 1 would be a negative index and silently pick a token counted
        # from the end of the list instead of failing.
        raise IndexError(
            'n must be between 1 and {}, got {}'.format(len(token_list), n)
        )

    word = token_list[n - 1]
    return {
        'original': word,
        'stem': stemmer.stem(word),
        'lemma': lemmatizer.lemmatize(word),
    }

In [8]:
# Original form, stem and lemma of the 120th token (n is a 1-based position).
find_roots(tokens, 120) 

{'original': 'daisies', 'stem': 'daisi', 'lemma': 'daisy'}

### Count stopwords

In [9]:
def count_stopwords(token_list, stop_words=None):
    """Count the tokens that are stopwords (every occurrence is counted).

    Args:
        token_list: Iterable of word tokens.
        stop_words: Optional iterable of words to treat as stopwords.
            Defaults to NLTK's English stopword list.

    Returns:
        The number of tokens in ``token_list`` found in the stopword collection.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once as a set, instead of re-creating and scanning the
    # stopword list for every token.
    stop_words = set(stop_words)
    return sum(1 for word in token_list if word in stop_words)

In [10]:
# Number of tokens in the cleaned text that are English stopwords (repeats included).
count_stopwords(tokens)

13774

### Count unique words

In [13]:
def unique_words(token_list):
    """Return the number of distinct tokens in ``token_list``.

    Args:
        token_list: Iterable of hashable tokens (e.g. strings).

    Returns:
        The count of different tokens; 0 for an empty input.
    """
    # A set gives constant-time membership checks, replacing the quadratic
    # "scan the list of seen words for every token" approach.
    return len(set(token_list))

In [14]:
# Number of distinct tokens in the cleaned text (stopwords included).
unique_words(tokens)

2749

### k-th most common word

In [15]:
def most_common_word(bag, k):
    """Return the k-th most frequent word of a bag of words.

    Args:
        bag: Dictionary mapping each word to its count.
        k: 1-based rank; 1 is the most frequent word.

    Returns:
        The word ranked k-th by descending count. Words with equal counts
        keep their insertion order in ``bag``, so each of them gets its own
        rank.

    Raises:
        IndexError: if ``k`` is not between 1 and ``len(bag)``.
    """
    if not 1 <= k <= len(bag):
        raise IndexError(
            'k must be between 1 and {}, got {}'.format(len(bag), k)
        )

    # Rank the (word, count) pairs themselves. Looking a count up again with
    # values.index() always finds the first word with that count, so tied
    # words would be returned repeatedly and others never.
    ranked = sorted(bag.items(), key=lambda item: item[1], reverse=True)
    return ranked[k - 1][0]

In [16]:
# Word with the third-highest count in the stopword-free bag of words.
most_common_word(bag_of_words, 3)

'little'

### How many words appear n times in the text

In [17]:
def word_frequency_count(bag, n):
    """Return how many words in ``bag`` occur exactly ``n`` times.

    Args:
        bag: Dictionary mapping each word to its count.
        n: The count to look for.

    Returns:
        The number of keys whose value equals ``n``.
    """
    matching = 0
    for count in bag.values():
        if count == n:
            matching += 1
    return matching

In [18]:
# Number of distinct non-stopword tokens that occur exactly 5 times.
word_frequency_count(bag_of_words, 5)

97