## Natural Language Processing
#### Table/Structured --> RDBMS, Non-RDBMS
#### Images/Video -> Computer Vision
#### Text -> NLP


## There are 3 types of NLP
### Lexical, Syntactic, and Semantic Processing

In [1]:
import re # regular expression
# Sample sentence; note the leading space in front of the name.
text = " Shreyas is exceptional student!"

# Compile the literal pattern, then scan the whole string for its first
# occurrence. The result is a Match object on success and None otherwise.
isthere = re.compile("Shreyas").search(text)
isthere



<re.Match object; span=(1, 8), match='Shreyas'>

In [27]:
# Group 0 is the complete substring covered by the match held in `isthere`.
isthere.group(0)


'Shreyas'

In [26]:
import re
# Address-like sample whose only run of digits is the PIN code at the end.
text = "Text: Google, Domlur, Bangalore, 560071"

# "[0-9]+" means one or more consecutive digits; re.search returns the
# first such run as a Match object (or None when there is no digit).
match = re.search("[0-9]+", text)

# A notebook cell displays only its last expression, so a bare `match`
# line placed before this one would show nothing. Display the matched text.
match.group()


'560071'

In [31]:
import re

logs = [
    "System booted successfully",
    "Warning: low memory",
    "Err: file not found",
    "Error: disk failure",
    "E: unknown exception",
]

# "Er?" reads as: a capital "E", optionally followed by a single "r".
# Any log line containing that anywhere is printed.
e_then_optional_r = re.compile("Er?")
for log in logs:
    match = e_then_optional_r.search(log)
    if match is not None:
        print(log)


Err: file not found
Error: disk failure
E: unknown exception


# Quantifiers

#### '*' : Zero or More
#### '?' : Zero or One
#### '+' : One or More


In [42]:
# Sentence that contains both the "color" and the "colour" spelling.
text = "The color scheme is different from the colour used in UK."

# "u?" makes the "u" optional, so a single pattern covers both spellings.
re.compile("colou?r").findall(text)



['color', 'colour']

# Anchors
- '^' : Indicates start of the string
- '$' : End of the string


In [49]:
emails = ["emp123@company.com", "john@company.com", "emp456@company.com"]

# '^' anchors the pattern to the start of the string, so only addresses
# that begin with "emp" are printed.
starts_with_emp = re.compile('^emp')
for email in emails:
    match = starts_with_emp.search(email)
    if match is None:
        continue
    print(email)

emp123@company.com
emp456@company.com


In [54]:
files = ["report.pdf", "data.xlsx", "summary.pdf"]

# Print only the names that end in ".pdf".
# The pattern is a raw string so the backslash reaches the regex engine
# untouched; in a normal string "\." is an invalid escape sequence that
# Python warns about. "\." is a literal dot and "$" anchors to the end.
for file in files:
    match = re.search(r"\.pdf$", file)
    if match:
        print(file)

report.pdf
summary.pdf


# Wildcards 
-  '.': Matches any single character except a newline (unless the `re.DOTALL` flag is used)

In [56]:
# '.' matches one character at a time, so findall yields every character separately.
re.compile('.').findall('hello')

['h', 'e', 'l', 'l', 'o']

## Character Sets
| Pattern  | Matches                                                                                    |
|----------|--------------------------------------------------------------------------------------------|
| [abc]    | Matches either an a, b or c character                                                      |
| [abcABC] | Matches either an a, A, b, B, c or C character                                             |
| [a-z]    | Matches any characters between a and z, including a and z                                  |
| [A-Z]    | Matches any characters between A and Z, including A and Z                                  |
| [a-zA-Z] | Matches any letter from a to z in either case (lowercase or uppercase), including a and z  |
| [0-9]    | Matches any character which is a number between 0 and 9                                    |

### Meta sequences

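| Pattern  | Equivalent to (exact only with the `re.ASCII` flag; by default, `str` patterns also match Unicode whitespace, digits, and word characters) |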
|----------|------------------|
| \s       | [ \t\n\r\f\v]    |
| \S       | [^ \t\n\r\f\v]   |
| \d       | [0-9]            |
| \D       | [^0-9]           |
| \w       | [a-zA-Z0-9_]     |
| \W       | [^a-zA-Z0-9_]    |

In [58]:
# Sample sentence with several separate runs of digits.
text = "My phone number is 123-456-7890 and my PIN is 4321."

# "[0-9]+" collects each maximal run of digits; the hyphens split the
# phone number into three separate matches.
re.compile("[0-9]+").findall(text)

['123', '456', '7890', '4321']

In [59]:
# Same sentence as the character-set example, now using the \d shorthand.
text = "My phone number is 123-456-7890 and my PIN is 4321."

# Raw string: in a normal string "\d" is an invalid escape sequence that
# Python warns about. r"\d+" passes the backslash to the regex engine as-is.
re.findall(r"\d+", text)

['123', '456', '7890', '4321']

In [None]:
url = "http://www.telegraph.co.uk/formula-1/2017/10/28/mexican-grand-prix-2017-time-does-start-tv-channel-odds-lewisl/2017/05/12"

# Extract year/month/day triples: a 4-digit year, then a 1-2 digit month
# and a 1-2 digit day, separated by slashes.
# The pattern is a raw string, and the quantifier is a plain "{4}": adding
# "+" after "{4}" is a "multiple repeat" error before Python 3.11 and a
# possessive quantifier from 3.11 on, neither of which is wanted here.
re.findall(r"\d{4}/\d{1,2}/\d{1,2}", url)

['2017/10/28', '2017/05/12']

In [87]:
weblink = "http://www.telegraph.co.uk/formula-1/2017/10/28/mexican-grand-prix-2017-time-does-start-tv-channel-odds-lewisl/2017/05/12"

# Stricter variant: exactly 4 digits for the year and exactly 2 each for
# month and day. Written as a raw string so "\d" is not treated as an
# (invalid) string escape sequence.
re.findall(r"\d{4}/\d{2}/\d{2}", weblink)

['2017/10/28', '2017/05/12']

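- `re.match`: matches only at the start of the string
- `re.search`: finds the first match anywhere in the string
- `re.findall`: returns all non-overlapping matches as a list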

In [90]:

emails = ["test.email@example.com", "invalid-email.com", "user@domain.org"]

# Loose e-mail shape: word characters, "@", word characters, one literal
# dot, then one or more word characters.
# The "+" belongs on the final \w, not on the dot: "\.+\w" would accept any
# number of dots followed by a single character (e.g. "a@b...c").
# Raw string so "\w" and "\." are not treated as string escape sequences.
pattern = r"\w+@\w+\.\w+"

# re.search looks for the pattern anywhere in the string, so this is a
# "contains something e-mail-like" check rather than full validation.
for email in emails:
    match = re.search(pattern, email)
    if match:
        print(email)
    


test.email@example.com
user@domain.org


### Ignore case and replace character with *

In [101]:
# Sample sentence containing both an uppercase "T" and lowercase "t"s.
text = "This is a test string. I'm strong"

# Replace every "t" with "*"; IGNORECASE makes the set "[t]" match "T" too.
re.compile("[t]", flags=re.IGNORECASE).sub("*", text)

"*his is a *es* s*ring. I'm s*rong"