# TOC

  __Chapter 1 - Introduction to natural language processing__

1. [Import](#Import)
1. [Basic tokenization](#Basic-tokenization)

# Import

<a id = 'Import'></a>

In [1]:
# Standard libary and settings
import os
import sys
import importlib
import itertools
import warnings

warnings.simplefilter("ignore")
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:95% !important; }</style>"))

# Data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# Modeling extensions
import nltk

# Visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set_style("whitegrid")

# Basic tokenization


<a id = 'Basic-tokenization'></a>

In [7]:
# read text of python's homepage
from urllib.request import urlopen

# Use the response as a context manager so the underlying HTTP connection is
# closed as soon as the body has been read, even if read/decode raises.
with urlopen("http://python.org") as response:
    html = response.read().decode("utf-8")

# Report the size of the downloaded page in characters.
print(len(html))

49350


In [8]:
# Break the page into tokens on runs of whitespace; str.split() with no
# arguments already returns a fresh list, so no extra copy is needed.
tokens = html.split()

# Preview the first 100 tokens.
print(tokens[:100])

['<!doctype', 'html>', '<!--[if', 'lt', 'IE', '7]>', '<html', 'class="no-js', 'ie6', 'lt-ie7', 'lt-ie8', 'lt-ie9">', '<![endif]-->', '<!--[if', 'IE', '7]>', '<html', 'class="no-js', 'ie7', 'lt-ie8', 'lt-ie9">', '<![endif]-->', '<!--[if', 'IE', '8]>', '<html', 'class="no-js', 'ie8', 'lt-ie9">', '<![endif]-->', '<!--[if', 'gt', 'IE', '8]><!--><html', 'class="no-js"', 'lang="en"', 'dir="ltr">', '<!--<![endif]-->', '<head>', '<meta', 'charset="utf-8">', '<meta', 'http-equiv="X-UA-Compatible"', 'content="IE=edge">', '<link', 'rel="prefetch"', 'href="//ajax.googleapis.com/ajax/libs/jquery/1.8.2/jquery.min.js">', '<meta', 'name="application-name"', 'content="Python.org">', '<meta', 'name="msapplication-tooltip"', 'content="The', 'official', 'home', 'of', 'the', 'Python', 'Programming', 'Language">', '<meta', 'name="apple-mobile-web-app-title"', 'content="Python.org">', '<meta', 'name="apple-mobile-web-app-capable"', 'content="yes">', '<meta', 'name="apple-mobile-web-app-status-bar-style"', 'c

In [9]:
# use regex to parse input
import re

# Split on runs of non-word characters. The pattern is a raw string because
# "\W" in a plain string literal is an invalid escape sequence that Python
# warns about. Note that the result can contain empty strings when the text
# starts or ends with a non-word character.
tokens = re.split(r"\W+", html)
print(tokens[:100])

['', 'doctype', 'html', 'if', 'lt', 'IE', '7', 'html', 'class', 'no', 'js', 'ie6', 'lt', 'ie7', 'lt', 'ie8', 'lt', 'ie9', 'endif', 'if', 'IE', '7', 'html', 'class', 'no', 'js', 'ie7', 'lt', 'ie8', 'lt', 'ie9', 'endif', 'if', 'IE', '8', 'html', 'class', 'no', 'js', 'ie8', 'lt', 'ie9', 'endif', 'if', 'gt', 'IE', '8', 'html', 'class', 'no', 'js', 'lang', 'en', 'dir', 'ltr', 'endif', 'head', 'meta', 'charset', 'utf', '8', 'meta', 'http', 'equiv', 'X', 'UA', 'Compatible', 'content', 'IE', 'edge', 'link', 'rel', 'prefetch', 'href', 'ajax', 'googleapis', 'com', 'ajax', 'libs', 'jquery', '1', '8', '2', 'jquery', 'min', 'js', 'meta', 'name', 'application', 'name', 'content', 'Python', 'org', 'meta', 'name', 'msapplication', 'tooltip', 'content', 'The', 'official']


In [21]:
# use BeautifulSoup to parse html
from bs4 import BeautifulSoup

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, which emits a warning and can produce different
# trees on different machines. "html.parser" ships with Python itself.
soup = BeautifulSoup(html, "html.parser")

# Extract only the human-readable text, dropping all markup.
text2 = soup.get_text()

# Preview the first 200 characters.
text2[:200]

' \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nWelcome to Python.org\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n     {\n       "@context": "https://schema.org",\n       "@type": "WebSite",\n       "url": "https://www.python.org/",\n       "potentialA'

In [20]:
# Count how often each token occurs, print the distribution summary, then
# list the first 25 (token, count) pairs in the order the distribution yields them.
freq_dist = nltk.FreqDist(tokens)
print(freq_dist)
for token, occurrences in itertools.islice(freq_dist.items(), 25):
    print("{0} : {1}".format(token, occurrences))

<FreqDist with 1042 samples and 6229 outcomes>
 : 2
doctype : 1
html : 19
if : 10
lt : 8
IE : 10
7 : 14
class : 363
no : 20
js : 21
ie6 : 1
ie7 : 2
ie8 : 3
ie9 : 3
endif : 9
8 : 20
gt : 8
lang : 1
en : 1
dir : 1
ltr : 1
head : 2
meta : 31
charset : 4
utf : 4
