# Natural Language Processing

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#lines below let allow multiple results from a line of code to be shown e.g. df.head() + df.columns
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#this allows us to see all of our columns or rows in jupyter notebook
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

For more information on F-string literal, go here https://www.geeksforgeeks.org/formatted-string-literals-f-strings-python/

In [3]:
# f string literal formatting

person = 'Jose'
#old way: print('my name is {}').format(person))

#new way
print(f' my name is {person}')

 my name is Jose


In [7]:
#you can also use the {} inside the print statement for operations 
d = {'a':123, 'b':456}

print(f"my number is {d['a']}")

my number is 123


In [8]:
#we can also use a list to reference within the f-string
library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)]

In [12]:
#creating a loop for using the indexed [0] (first) item within each tuple above 
for book in library:
    print(f'Author is {book[0]}')

Author is Author
Author is Twain
Author is Feynman
Author is Hamilton


In [17]:
#we can also use tuple unpacking to do the following , but notice the alignment of everything is off and super close together, we can change that by adding
# what is called 'padding' to the text through a number of ways
for author, topic, pages in library:
    print(f"{author} {topic} {pages}")

Author Topic Pages
Twain Rafting 601
Feynman Physics 95
Hamilton Mythology 144


To set the alignment, use the character `<` for left-align,  `^` for center, `>` for right.<br>
To set padding, precede the alignment character with the padding character (`-` and `.` are common choices).

In [18]:
#don't get too in the weeds here, just including it to become familiar with the idea
for book in library:
    print(f'{book[0]:{10}} {book[1]:<{10}} {book[2]:.>{7}}') # here .> was added

Author     Topic      ..Pages
Twain      Rafting    ....601
Feynman    Physics    .....95
Hamilton   Mythology  ....144


In [21]:
#date formatting, look up strftime codes at http://strftime.org/ for the date codes
from datetime import datetime
today = datetime(year = 2001, month = 6, day = 6)

print(f'{today: %B %d, %Y}')

 June 06, 2001


## Working with text files 

In [23]:
%%writefile test.txt
hello, this is a quick text file.
this is the second line of the file.

Writing test.txt


In [24]:
#can't have any comments in the cell above for it to work, (BTW %% are called 'magic commands')

In [29]:
#opening text files=, you must have your working directory set to where the file is!
myfile = open('test.txt')
myfile

<_io.TextIOWrapper name='test.txt' mode='r' encoding='cp1252'>

In [30]:
#read the file
myfile.read()

'hello, this is a quick text file.\nthis is the second line of the file.\n'

In [31]:
#you cannot just call read on a file multiple times! it will return blank ''
myfile.read()

''

In [32]:
#you have to return the cursor to the beginning of the file and rerun the read code again
myfile.seek(0)
myfile.read()

0

'hello, this is a quick text file.\nthis is the second line of the file.\n'

In [33]:
myfile.seek(0)

content = myfile.read()

print(content)

0

hello, this is a quick text file.
this is the second line of the file.



In [34]:
#ALWAYS CLOSE A FILE WHEN YOU ARE DONE WORKING WITH IT
myfile.close()

In [35]:
myfile = open('test.txt')
myfile.readlines() #reads in each line as a seperate thing

['hello, this is a quick text file.\n',
 'this is the second line of the file.\n']

In [37]:
myfile.seek(0)

mylines = myfile.readlines()

mylines

0

['hello, this is a quick text file.\n',
 'this is the second line of the file.\n']

In [38]:
#you can run through using loops when you use the readlines method 
for line in mylines:
    print(line[0])

h
t


In [39]:
myfile = open('test.txt', 'w+') #allows us to read and write to the file, you can restrict the file's access in many ways, w+ overwrites original file 

#look at the mode = argument inside of open() to see the options for permissions

In [40]:
myfile.read()

''

In [41]:
myfile.write('new text')

8

In [42]:
myfile.seek(0)

0

In [43]:
myfile.read()

'new text'

In [44]:
myfile.close()

In [47]:
#appending to a file, keeps old information and allows you to add new lines
myfile = open('whoops.txt', 'a+') #allows me to append to the file, creates a new one if it doesn't already exist

myfile.write('my first line in a+ opening')
myfile.close()

newfile = open('whoops.txt')

newfile.read()

27

'my first line in a+ openingmy first line in a+ opening'

In [48]:
newfile.close()

In [49]:
#using a context manager to close the file for us! using this 'with' operator, we don't have to worry about manually closing the file, WE SHOULD ALWAYS OPEN FILES 
# USING THIS METHOD
with open('test.txt', 'r') as mynewfile:
    myvariable = mynewfile.readlines()

In [50]:
myvariable

['new text']

## Working with PDF Files

In [53]:
#reading in text data from PDF
import PyPDF2

In [55]:
pwd

'C:\\Users\\Sam Cannon\\Desktop\\Python\\Udemy Courses\\NLP\\UPDATED_NLP_COURSE'

In [56]:
import os 
os.chdir('C:\\Users\\Sam Cannon\\Desktop\\Python\\Udemy Courses\\NLP\\UPDATED_NLP_COURSE\\00-Python-Text-Basics')

In [57]:
myfile = open('US_Declaration.pdf', mode = 'rb') #using 'rb' (Read binary) here becuase this is a PDF not a text file

In [58]:
#we have to convert this file into a pdf file reader object
pdf_reader = PyPDF2.PdfFileReader(myfile)

In [60]:
#there are a number of methods for pdf reader, we can extract a lot of different information using this
pdf_reader.numPages

5

In [62]:
#here we are grabbing the text from the first page
page_one = pdf_reader.getPage(0)

page_one.extractText()

"Declaration of IndependenceIN CONGRESS, July 4, 1776. The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the\npolitical bands which have connected them with another, and to assume among the powers of the\nearth, the separate and equal station to which the Laws of Nature and of Nature's God entitle\n\nthem, a decent respect to the opinions of mankind requires that they should declare the causes\n\nwhich impel them to the separation. \nWe hold these truths to be self-evident, that all men are created equal, that they are endowed by\n\ntheir Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit\nof Happiness.ŠThat to secure these rights, Governments are instituted among Men, deriving\n\ntheir just powers from the consent of the governed,ŠThat whenever any Form of Government\nbecomes destructive of these ends, it is the Right of the People to alter or 

In [63]:
print(page_one.extractText())

Declaration of IndependenceIN CONGRESS, July 4, 1776. The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the
political bands which have connected them with another, and to assume among the powers of the
earth, the separate and equal station to which the Laws of Nature and of Nature's God entitle

them, a decent respect to the opinions of mankind requires that they should declare the causes

which impel them to the separation. 
We hold these truths to be self-evident, that all men are created equal, that they are endowed by

their Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit
of Happiness.ŠThat to secure these rights, Governments are instituted among Men, deriving

their just powers from the consent of the governed,ŠThat whenever any Form of Government
becomes destructive of these ends, it is the Right of the People to alter or to abolish it,

In [64]:
myfile.close()

We can also add text to a pdf file, but I am not going to cover that here since I see no use case for it in my own life thus far

In [67]:
#we can also grab all of the pages and text from a pdf file
f = open('US_Declaration.pdf','rb')

# List of every page's text.
# The index will correspond to the page number.
pdf_text = [0]  # zero is a placehoder to make page 1 = index 1

pdf_reader = PyPDF2.PdfFileReader(f)

for p in range(pdf_reader.numPages):
    
    page = pdf_reader.getPage(p)
    
    pdf_text.append(page.extractText())

f.close()

In [69]:
print(pdf_text)

["Declaration of IndependenceIN CONGRESS, July 4, 1776. The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the\npolitical bands which have connected them with another, and to assume among the powers of the\nearth, the separate and equal station to which the Laws of Nature and of Nature's God entitle\n\nthem, a decent respect to the opinions of mankind requires that they should declare the causes\n\nwhich impel them to the separation. \nWe hold these truths to be self-evident, that all men are created equal, that they are endowed by\n\ntheir Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit\nof Happiness.ŠThat to secure these rights, Governments are instituted among Men, deriving\n\ntheir just powers from the consent of the governed,ŠThat whenever any Form of Government\nbecomes destructive of these ends, it is the Right of the People to alter or

# Regular Expressions

Regular Expressions (sometimes called regex for short) allow a user to search for strings using almost any sort of rule they can come up with. For example, finding all capital letters in a string, or finding a phone number in a document. 

Regular expressions are notorious for their seemingly strange syntax. This strange syntax is a byproduct of their flexibility. Regular expressions have to be able to filter out any string pattern you can imagine, which is why they have a complex string pattern format.

Regular expressions are handled using Python's built-in **re** library. See [the docs](https://docs.python.org/3/library/re.html) for more information.

In [81]:
text = 'call me at phone number 555-552-3345'

In [82]:
'phone' in text

True

In [83]:
import re #regular expressions library

pattern = 'phone'

re.search(pattern, text)

<re.Match object; span=(11, 16), match='phone'>

In [84]:
my_match = re.search(pattern, text) #returns the first match of the pattern

In [85]:
#gives you the position of the pattern you are telling it to look for
my_match.span()

(11, 16)

In [86]:
text = 'my phone is a new phone'

match = re.search(pattern, text)

match.span() #see how it only gives us the position of the first instance of 'phone' within our string above? we need to use findall

(3, 8)

In [88]:
re.findall('phone', text)

['phone', 'phone']

# Patterns

So far we've learned how to search for a basic string. What about more complex examples? Such as trying to find a telephone number in a large string of text? Or an email address?

We could just use search method if we know the exact phone or email, but what if we don't know it? We may know the general format, and we can use that along with regular expressions to search the document for strings that match a particular pattern.

## Identifiers for Characters in Patterns
​
Characters such as a digit or a single string have different codes that represent them. You can use these to build up a pattern string. Notice how these make heavy use of the backwards slash \ . Because of this when defining a pattern string for regular expression we use the format:
​
    r'mypattern'
    
placing the r in front of the string allows python to understand that the \ in the pattern string are not meant to be escape slashes.
​
Below you can find a table of all the possible identifiers:

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Exammple Match</th></tr>

<tr ><td><span >\d</span></td><td>A digit</td><td>file_\d\d</td><td>file_25</td></tr>

<tr ><td><span >\w</span></td><td>Alphanumeric</td><td>\w-\w\w\w</td><td>A-b_1</td></tr>



<tr ><td><span >\s</span></td><td>White space</td><td>a\sb\sc</td><td>a b c</td></tr>



<tr ><td><span >\D</span></td><td>A non digit</td><td>\D\D\D</td><td>ABC</td></tr>

<tr ><td><span >\W</span></td><td>Non-alphanumeric</td><td>\W\W\W\W\W</td><td>*-+=)</td></tr>

<tr ><td><span >\S</span></td><td>Non-whitespace</td><td>\S\S\S\S</td><td>Yoyo</td></tr></table>

In [90]:
text = 'my phone number is 555-666-7777'

In [91]:
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'

In [92]:
phone_number = re.search(pattern, text) #could also use findall, or re.finditer

In [95]:
phone_number
phone_number.group()

<re.Match object; span=(19, 31), match='555-666-7777'>

'555-666-7777'

## Quantifiers
​
Now that we know the special character designations, we can use them along with quantifiers to define how many we expect.

<table ><tr><th>Character</th><th>Description</th><th>Example Pattern Code</th><th >Exammple Match</th></tr>

<tr ><td><span >+</span></td><td>Occurs one or more times</td><td>	Version \w-\w+</td><td>Version A-b1_1</td></tr>

<tr ><td><span >{3}</span></td><td>Occurs exactly 3 times</td><td>\D{3}</td><td>abc</td></tr>



<tr ><td><span >{2,4}</span></td><td>Occurs 2 to 4 times</td><td>\d{2,4}</td><td>123</td></tr>



<tr ><td><span >{3,}</span></td><td>Occurs 3 or more</td><td>\w{3,}</td><td>anycharacters</td></tr>

<tr ><td><span >\*</span></td><td>Occurs zero or more times</td><td>A\*B\*C*</td><td>AAACC</td></tr>

<tr ><td><span >?</span></td><td>Once or none</td><td>plurals?</td><td>plural</td></tr></table>

In [102]:
pattern = r'\d{3}-\d{3}-\d{4}' #easier to search this way using the {}

re.search(pattern, text).group()

'555-666-7777'

In [103]:
#search for groups
phone_pattern = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

In [104]:
results = re.search(phone_pattern,text)

In [105]:
# The entire result
results.group()

'555-666-7777'

In [106]:
# Can then also call by group position.
# remember groups were separated by parentheses ()
# Something to note is that group ordering starts at 1. Passing in 0 returns everything
results.group(1)

'555'

In [107]:
results.group(2)

'666'

In [108]:
results.group(3)

'7777'

In [110]:
# We only had three groups of parentheses
results.group(4)

IndexError: no such group

In [111]:
#we can also search for multiple words inside of a string
re.search('man|woman', 'this woman was here') #this looked for man or woman using |

<re.Match object; span=(5, 10), match='woman'>

In [112]:
re.findall(r'.at', 'the cat in the hat sat') #findall found everything with that pattern, the '.' is a wildcard that found a pattern for anything
                                                # with one letter first and ending with 'at', we could add more .. for the # of letters before the pattern

['cat', 'hat', 'sat']

In [114]:
re.findall(r'...at', 'splat')

['splat']

In [115]:
#find anything that starts or ends with something
re.findall(r'\d$', 'this ends with a number 2')

['2']

In [118]:
re.findall(r'^\d', '1 is the loneliest number') #these search through the entire string

['1']

In [120]:
#excluding things from text
phrase = 'there are 3 numbers 34 inside 5 of this sentence'

re.findall(r'[^\d]+', phrase)

['there are ', ' numbers ', ' inside ', ' of this sentence']

In [124]:
#removing punctuation from a sentence
test_phrase = 'this is a string but it has punctuation! How, should, we remove it all?'

mylist = re.findall(r'[^!.,?]+', test_phrase) # anything after the carrot ^ symbol, we are going to exclude

' '.join(mylist) # this ' ' joins every item with a space in between it inside of mylist, so that we get a nice sentence back

'this is a string but it has punctuation  How  should  we remove it all'

In [126]:
text = 'only find the hyphen-words. where are the long-ish words?'

re.findall(r'[\w]+-[\w]+', text) #found the hyphenated words

['hyphen-words', 'long-ish']