### Phase 1 : Import required libraries and write user-defined functions

In [18]:
#Import required libraries.
import pandas as pd
import numpy as np

In [14]:
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.tag.stanford import StanfordNERTagger
from collections import Counter

In [3]:
def display_text(token_list, number):
    """Print the first `number` items of `token_list`, one per line.

    Parameters
    ----------
    token_list : sequence
        Sliceable sequence of tokens (e.g. a list of words or sentences).
    number : int
        Maximum number of leading items to print.

    If the sequence holds fewer than `number` items, every item is printed
    instead of raising IndexError. A zero or negative `number` prints nothing.
    """
    # Clamp at zero so a negative count does not turn into a "drop the tail" slice.
    for token in token_list[:max(number, 0)]:
        print(token)

In [4]:
def token_length(token_list):
    """Print how many items `token_list` contains."""
    size = len(token_list)
    print('Length of token list is : ', size)

### Phase 2 : Import dataset

In [5]:
#Specify the directory (relative to this notebook) where the raw data lives.
DATA_PATH = '../data/raw/'

#Specify the name of the source file.
DATA_FILE = 'hp1.txt'

In [6]:
#Read the whole source file as UTF-8 text.
#The with-block closes the file even if reading raises.
with open(DATA_PATH + DATA_FILE, 'rt', encoding='utf8') as file:
    text = file.read()

### Phase 3 : Tokenize

In [7]:
#Tokenize by sentences.
#Split the raw text into sentences with NLTK and preview the first five.
sentences = sent_tokenize(text)
display_text(sentences, 5)

Harry Potter and the Sorcerer's Stone 

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.
They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense.
Mr. Dursley was the director of a firm called Grunnings, which made drills.
He was a big, beefy man with hardly any neck, although he did have a very large mustache.
Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.


In [8]:
#Tokenize by words.
#Split the raw text into word tokens with NLTK, preview the first five
#and report the total token count.
words = word_tokenize(text)
display_text(words, 5)
token_length(words)

Harry
Potter
and
the
Sorcerer
Length of token list is :  98841


### Phase 4 : Text Preprocessing

#### Part A : Switch to same case

In [9]:
#Change to lowercase.
#word_copy preserves the original casing of the tokens; it is the list
#that is later passed to the NER tagger.
word_copy = list(words)
words = list(map(str.lower, words))
display_text(words, 5)
token_length(words)

harry
potter
and
the
sorcerer
Length of token list is :  98841


#### Part B : Remove numbers and special characters

In [10]:
#Taking only words. Remove numbers and special characters.
#Keep only purely alphabetic tokens, in both the lowercased list and
#the original-case copy.
words = list(filter(str.isalpha, words))
word_copy = list(filter(str.isalpha, word_copy))
display_text(words, 5)
token_length(words)

harry
potter
and
the
sorcerer
Length of token list is :  77031


#### Part C : Remove stopwords

In [11]:
#Remove stopwords
stop_words = set(stopwords.words('english'))
words = [word for word in words if word not in stop_words]
#word_copy still has its original casing, so compare its lowercase form;
#otherwise capitalised stopwords such as "The" or "He" are never removed.
word_copy = [word for word in word_copy if word.lower() not in stop_words]
display_text(words, 5)
token_length(words)

harry
potter
sorcerer
stone
chapter
Length of token list is :  40291


#### Part D : Named Entity Recognition

In [12]:
#Build the Stanford NER tagger from the 7-class model and jar shipped
#under ../dependencies, then tag the original-case tokens.
st = StanfordNERTagger(
    model_filename='../dependencies/english.muc.7class.distsim.crf.ser.gz',
    path_to_jar='../dependencies/stanford-ner.jar',
)
#NOTE: despite its name, `person` holds the tagger output for every token
#in word_copy; filtering to the PERSON tag happens in a later cell.
person = st.tag(word_copy)

In [13]:
#Keep only the tagged entries whose second element is the 'PERSON' tag.
person_list = [tagged for tagged in person if tagged[1] == 'PERSON']

In [15]:
#Count how often each PERSON entry occurs, most frequent first.
Counter(person_list).most_common()

[(Harry Potter, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Potter, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Harry, 1),
 (Dursley, 1),
 (Harry, 1),
 (Harry, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Next Door, 1),
 (Dursley, 1),
 (Jim McGuffin, 1),
 (Jim, 1),
 (Ted, 1),
 (Yorkshire, 1),
 (Dundee, 1),
 (Dursley, 1),
 (Mysterious, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Howard, 1),
 (Harry, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Dursley, 1),
 (Albus Dumbledore, 1),
 (Albus Dumbledore, 1),
 (the Put-Outer, 1),
 (Dursley, 1),
 (McGonagall, 1),
 (McGonagall, 1