In [None]:
import sys

print('Python info', sys.version)

In [None]:
import os

# Report the working directory of the running kernel.
print('This is the current directory', os.getcwd())

In [None]:
import datetime
import pytz

# Today's date and the current timestamp; both are naive (no tzinfo attached).
current_date = datetime.date.today()
current_time = datetime.datetime.now()
# The current time expressed in the America/New_York zone. astimezone() on a
# naive datetime presumes it is in the system's local zone before converting.
local_time = datetime.datetime.today().astimezone(pytz.timezone("America/New_York"))

print("System date/time", current_time)
print("Local date/time", local_time)

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

print('Pandas version', pd.__version__)
print('Numpy version', np.__version__)
print('Matplotlib version', mpl.__version__)

In [None]:
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.precision', 2)
pd.set_option('max_rows', 250)
pd.set_option('max_columns', 250)

from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 5, 4
plt.style.use('seaborn-whitegrid')

In [None]:
import re
import string
from pprint import pprint
import collections
from collections import Counter

In [None]:
!pip install nltk

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.util import ngrams

nltk.download('punkt')

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud

In [None]:
!pip install wget

In [None]:
import wget

In [None]:
URL = 'http://www.gutenberg.org/files/120/120-0.txt'

In [None]:
print(URL)

In [None]:
# Reuse a copy that is already on disk so that re-running the notebook does not
# hit the network again or leave extra renamed copies beside the first download.
book = os.path.basename(URL)
if not os.path.exists(book):
    book = wget.download(URL, out=book)

In [None]:
with open(book, encoding='utf-8') as f:
    data = f.read()

type(data)

In [None]:
pprint(data[0:1000])

In [None]:
data.rfind('END')

In [None]:
pprint(data[381000:])

In [None]:
def regex(pattern, string):
    """Return every non-overlapping match of ``pattern`` found in ``string``.

    The result is a list of ``re.Match`` objects in order of occurrence, so
    callers can read both the matched text and where it was found.
    """
    return [hit for hit in re.compile(pattern).finditer(string)]

In [None]:
# Number of lines?
len(regex(r'\n', data))

In [None]:
# Phone Numbers?
regex(r'\d{3}[.-]\d{3}[.-]\d{4}', data)

In [None]:
# Emails?
email = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
regex(email, data)

In [None]:
# Websites?
regex(r'https?://(www\.)?(\w+)(\.\w+)', data)

In [None]:
# Mr/Ms/Mrs?
regex(r'(Mr|Ms|Mrs)\.?\s[A-Z]\w*', data)

In [None]:
# Names?
regex(r"[A-Z][a-z]{1,20}\s[A-Z][a-z]{1,20}\s", data)

In [None]:
# Titles?
regex(r"([A-Z][a-z]{1,20}\s){5,}", data)

In [None]:
# Blackbeard?
regex(r'Blackbeard', data)

In [None]:
# Scan around "Blackbeard"
pprint(data[63000:64000])

In [None]:
# Long John Silver?
regex(r'Long John Silver', data)

In [None]:
# Get sentences with "Long John Silver"
get_sentence = re.findall(r"[^.]*Long John Silver[^.]*\.", data)

for sentence in get_sentence:
    print(sentence.strip())
    print("-"*80)

In [None]:
# https://stackoverflow.com/questions/4576077/python-split-text-on-sentences

# Regex fragments used by split_into_sentences. Raw strings keep the "\s"
# escapes from being read as (invalid) Python string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

def split_into_sentences(text):
    """Split ``text`` into a list of sentences using punctuation heuristics.

    Periods that do not end a sentence (titles such as "Dr.", single-letter
    initials, dotted acronyms, company suffixes, web domains) are shielded
    with a ``<prd>`` placeholder, every remaining ".", "?" or "!" is treated
    as a sentence end, and the placeholders are then restored.

    Notes:
        * Newlines are turned into spaces first.
        * Sentence punctuation directly before a closing quote (straight or
          curly) is moved after the quote, so the returned text differs from
          the input at those spots.
        * Text after the last terminator is returned as a final item rather
          than dropped; an empty or whitespace-only tail is omitted.

    Returns:
        list[str]: the sentences, each stripped of surrounding whitespace.
    """
    text = text.replace("\n", " ")
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move the terminator outside a closing quote so the quote stays attached
    # to its own sentence. Curly and straight quotes get the same treatment.
    for quote in ("”", "\""):
        for mark in ".!?":
            text = text.replace(mark + quote, quote + mark)

    for mark in ".?!":
        text = text.replace(mark, mark + "<stop>")
    text = text.replace("<prd>", ".")

    # Only the piece after the final <stop> can be empty; dropping empties keeps
    # a real trailing fragment that has no closing punctuation.
    pieces = (piece.strip() for piece in text.split("<stop>"))
    return [piece for piece in pieces if piece]

sentence_list = split_into_sentences(data)

for s in sentence_list[900:911]:
    print(s)
    print("-"*80)

In [None]:
# Split into words (raw string so "\w" reaches the regex engine untouched
# instead of being parsed as an invalid Python string escape).
word_search = re.findall(pattern=r'\w+', string=data)
type(word_search)

In [None]:
words = re.findall(r'\w+', data.lower())
Counter(words).most_common(20)

In [None]:
# '+' rather than '*': a '*' quantifier also matches the empty string wherever
# there is no letter, so '' would come out as the most common "word".
words = re.findall(r'[a-z]+', data.lower())
Counter(words).most_common(20)

In [None]:
letters = re.findall('[a-zA-Z]', data.lower())
Counter(letters).most_common(10)

In [None]:
numbers = re.findall(r'\d+', data)
Counter(numbers).most_common(10)

In [None]:
def draw_wordcloud(dict):
    """Draw and show a word cloud built from a word -> frequency mapping.

    ``dict`` is passed unchanged to ``WordCloud.generate_from_frequencies``.
    The cloud is capped at 50 words on a white 600x600 canvas with the
    'Dark2' colormap and is displayed with the axes turned off.
    """
    cloud_settings = {
        'max_words': 50,
        'background_color': 'white',
        'colormap': 'Dark2',
        'height': 600,
        'width': 600,
    }
    cloud = WordCloud(**cloud_settings).generate_from_frequencies(dict)

    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
# \b...\b restricts the search to whole words: unanchored, 'sea' also fires
# inside 'search' or 'season' and 'sand' inside 'thousand'. The optional
# trailing 's' still counts simple plurals under the singular key.
features = re.findall(r'\b(mountain|lake|forest|river|ocean|crater|hills|sea|jungle|sand)s?\b', data.lower())
Counter(features)

In [None]:
draw_wordcloud(Counter(features))

In [None]:
features = re.findall('(Spain|England|France|America|Portugal)',data)
Counter(features)

In [None]:
draw_wordcloud(Counter(features))

In [None]:
# Re-read the book as a list of lines with surrounding whitespace removed.
# newline='\r\n' means only CRLF is recognised as a line terminator.
with open(book, mode='r', encoding='utf-8', newline='\r\n') as f:
    datalines = list(map(str.strip, f))

type(datalines)

In [None]:
datalines[0:50]

In [None]:
for i, x in enumerate(datalines):
    if 'blackbeard' in x.lower():
        print(i, '>>>', x)

In [None]:
# Fetch the stop-word corpus. The download status gets its own name so it does
# not overwrite `stopwords`, the corpus reader imported from nltk.corpus.
stopwords_downloaded = nltk.download('stopwords')
type(stopwords_downloaded)

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
type(stopwords)

In [None]:
ps = nltk.PorterStemmer()
type(ps)

In [None]:
ps.stem('Surely')

In [None]:
stopwords[0:20]

In [None]:
string.punctuation

In [None]:
string.printable

In [None]:
def clean_text(text):
    """Turn raw text into a list of stemmed tokens with stop words removed.

    Steps:
        1. Lower-case the text.
        2. Delete (not replace with a space) every character that is ASCII
           punctuation or is not in ``string.printable``.
        3. Split on runs of non-word characters.
        4. Drop empty tokens and stop words, then stem what is left.

    Relies on two notebook-level names: ``stopwords`` (an iterable of words
    to drop) and ``ps`` (an object exposing ``stem(word)``).

    Returns:
        list[str]: the stemmed tokens in their original order.
    """
    allowed = set(string.printable) - set(string.punctuation)
    text = "".join(ch for ch in text.lower() if ch in allowed)

    tokens = re.split(r'\W+', text)

    # A set gives constant-time lookups; `if token` discards the empty strings
    # re.split yields when the text starts or ends with a separator.
    stop_set = set(stopwords)
    return [ps.stem(token) for token in tokens if token and token not in stop_set]

In [None]:
clean_book = clean_text(data)
clean_book[0:20]

In [None]:
words = re.findall(r'\w+', " ".join(clean_book))
Counter(words).most_common(20)

In [None]:
wc = dict(Counter(words).most_common(20))

draw_wordcloud(wc)

In [None]:
mywords = ['treasure', 'island', 'ship', 'ships', 'rum', 'pirates',
           'matey', 'captain', 'cannon', 'wenches', 'sailors',
           'gold', 'booty', 'yarr', 'poopdeck', 'port',
           'plank', 'sea', 'blackbeard', 'ale', 'slave']

# `words` was rebuilt from clean_book, so it holds Porter stems; compare against
# the stems of mywords, otherwise entries such as 'treasure' or 'pirates' can
# never match. Iterating in reverse makes the first mywords entry win when two
# entries share a stem ('ship' / 'ships'), and hits are reported under that entry.
stem_to_word = {ps.stem(entry): entry for entry in reversed(mywords)}

special_words = [stem_to_word[word] for word in words if word in stem_to_word]
Counter(special_words)

In [None]:
wc = Counter(special_words)

draw_wordcloud(wc)

In [None]:
?plt.imshow