Text cleanup and word extraction exercise
-----------------------------------------

**Step 1.** Read story from URL `http://sixty-north.com/c/t.txt`

In [1]:
from urllib.request import urlopen
url = 'http://sixty-north.com/c/t.txt'

# Fetch the story. The `with` block closes the HTTP response even when reading
# or decoding raises; a trailing explicit close() call would be skipped then.
with urlopen(url) as story:
    # Iterating the response yields one bytes object per line; decode each as UTF-8.
    story_lines = [line.decode('utf-8') for line in story]

**Step 2.** Create list of words

In [2]:
# Break every line on single-space characters, giving one list of words per
# line (so the result is a list of lists).
word_list = [
    text_line.split(' ')
    for text_line in story_lines
]

**Step 3.** Flatten the nested list

In [3]:
# Flatten the per-line word lists into one flat list, keeping the original
# word order. Equivalent to the nested comprehension
# [word for words in word_list for word in words].
story_words = []
for line_words in word_list:
    story_words.extend(line_words)

**Step 4.** Remove newline (`\n`) characters from words

In [4]:
# Delete every '\n' character from each word; words without one are unchanged.
story_words = [
    token.replace('\n', '')
    for token in story_words
]

**Step 5.** Remove common words 

In [5]:
# Words too common to be interesting; compared case-insensitively below.
common_words = ['a', 'an', 'the', 'it', 'in', 'is', 'was', 'are', 'were', 'of', 'on', 'its', 'or', 'for', 'have', 'has', 'had', 'to', 'we', 'us', 'so', 'this', 'that', 'all']

# Frozen copy for constant-time membership checks while filtering.
common_word_lookup = frozenset(common_words)

# Keep only the meaningful words. The `word and` guard also discards empty
# strings (left by repeated spaces, or by blank lines once newlines are
# removed), which would otherwise be tallied as a "word" when counting.
story_words = [
    word
    for word in story_words
    if word and word.lower() not in common_word_lookup
]

**Step 6.** Count occurrences of each word

In [6]:
# Tally how often each word appears, ignoring letter case.
from collections import defaultdict

# Missing keys start at int() == 0, so first sightings need no special case.
word_count = defaultdict(int)

for word in story_words:
    normalized = word.lower()
    word_count[normalized] = word_count[normalized] + 1

**Step 7.** Get list of words in lexical order

In [7]:
# Iterating a dict yields its keys, so this sorts the distinct words.
sorted(word_count)

['age',
 'authorities',
 'before',
 'being',
 'belief',
 'best',
 'comparison',
 'darkness',
 'degree',
 'despair',
 'direct',
 'epoch',
 'everything',
 'evil',
 'far',
 'foolishness',
 'going',
 'good',
 'heaven',
 'hope',
 'incredulity',
 'insisted',
 'light',
 'like',
 'noisiest',
 'nothing',
 'only',
 'other',
 'period',
 'present',
 'received',
 'season',
 'short',
 'some',
 'spring',
 'superlative',
 'times',
 'way',
 'winter',
 'wisdom',
 'worst']

**Step 8.** Get list of words sorted by occurrence count in descending order

In [8]:
# Order (word, count) pairs from most to least frequent. The sort is stable,
# so words with equal counts keep their first-seen order.
sorted(
    word_count.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

[('times', 2),
 ('age', 2),
 ('epoch', 2),
 ('season', 2),
 ('before', 2),
 ('going', 2),
 ('direct', 2),
 ('period', 2),
 ('best', 1),
 ('worst', 1),
 ('wisdom', 1),
 ('foolishness', 1),
 ('belief', 1),
 ('incredulity', 1),
 ('light', 1),
 ('darkness', 1),
 ('spring', 1),
 ('hope', 1),
 ('winter', 1),
 ('despair', 1),
 ('everything', 1),
 ('nothing', 1),
 ('heaven', 1),
 ('other', 1),
 ('way', 1),
 ('short', 1),
 ('far', 1),
 ('like', 1),
 ('present', 1),
 ('some', 1),
 ('noisiest', 1),
 ('authorities', 1),
 ('insisted', 1),
 ('being', 1),
 ('received', 1),
 ('good', 1),
 ('evil', 1),
 ('superlative', 1),
 ('degree', 1),
 ('comparison', 1),
 ('only', 1)]

**Step 9.** List of words occurring more than once

In [9]:
# Keep only the words whose tally is greater than one.
words_occuring_more_than_once = [
    word
    for word in word_count
    if word_count[word] > 1
]
words_occuring_more_than_once

['times', 'age', 'epoch', 'season', 'before', 'going', 'direct', 'period']

**Step 10.** Words occurring more than once and also having at least 5 letters.

In [10]:
# Repeated words (count above one) that are at least five characters long.
word_subset = [
    word
    for word, count in word_count.items()
    if len(word) >= 5 and count > 1
]
word_subset

['times', 'epoch', 'season', 'before', 'going', 'direct', 'period']