# Text Procecssing
## Capturing text data
### Plain text

In [1]:
import pandas as pd

In [5]:
text=''
with open('text.txt','r') as f:
    text = f.read()
    print(text)

I would love to try or hear the sample audio your app can produce. I do not want to purchase, because I've purchased so many apps that say they do something and do not deliver.  

Can you please add audio samples with text you've converted? I'd love to see the end results.

Thanks!



### Tabular data 

In [3]:
df = pd.read_csv('tabular.csv')
df.head()

Unnamed: 0,Start Date,Start Time,End Date,End Time,Event Title,All Day Event,No End Time,Event Description,Contact,Contact Email,Contact Phone,Location,Category,Mandatory,Registration,Maximum,Last Date To Register
0,9/5/2011,3:00:00 PM,9/5/2011,,Social Studies Dept. Meeting,N,Y,Department meeting,Chris Gallagher,cgallagher@schoolwires.com,814-555-5179,High School,2,N,N,25,9/2/2011
1,9/5/2011,6:00:00 PM,9/5/2011,8:00:00 PM,Curriculum Meeting,N,N,Curriculum Meeting,Chris Gallagher,cgallagher@schoolwires.com,814-555-5179,High School,2,N,N,25,9/2/2011


### Online data

In [4]:
import requests

In [12]:
r=requests.get('http://www.example.com')
print(r.text)

<html>
<head>
<title></title>
.
		</BODY>
		</html>



## Cleaning

In [6]:
text

"I would love to try or hear the sample audio your app can produce. I do not want to purchase, because I've purchased so many apps that say they do something and do not deliver.  \n\nCan you please add audio samples with text you've converted? I'd love to see the end results.\n\nThanks!\n"

In [7]:
text = text.replace('\n',' ')

In [8]:
import re
text = re.sub(' +',' ',text)
text

"I would love to try or hear the sample audio your app can produce. I do not want to purchase, because I've purchased so many apps that say they do something and do not deliver. Can you please add audio samples with text you've converted? I'd love to see the end results. Thanks! "

## Normalization 

In [9]:
# Case Normalization
text = text.lower()

In [10]:
# Punctuation removal
text = re.sub(r'[^a-zA-Z0-9]'," ",text)

In [11]:
print(text)

i would love to try or hear the sample audio your app can produce  i do not want to purchase  because i ve purchased so many apps that say they do something and do not deliver  can you please add audio samples with text you ve converted  i d love to see the end results  thanks  


## Tokenization 

In [15]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PR369694\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [16]:
from nltk.tokenize import word_tokenize

In [17]:
tokens = word_tokenize(text)

In [18]:
print(tokens)

['i', 'would', 'love', 'to', 'try', 'or', 'hear', 'the', 'sample', 'audio', 'your', 'app', 'can', 'produce', 'i', 'do', 'not', 'want', 'to', 'purchase', 'because', 'i', 've', 'purchased', 'so', 'many', 'apps', 'that', 'say', 'they', 'do', 'something', 'and', 'do', 'not', 'deliver', 'can', 'you', 'please', 'add', 'audio', 'samples', 'with', 'text', 'you', 've', 'converted', 'i', 'd', 'love', 'to', 'see', 'the', 'end', 'results', 'thanks']


## Stop words removal

In [21]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PR369694\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [23]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [24]:
tokens = [w for w in tokens if w not in stopwords.words('english')]

In [25]:
print(tokens)

['would', 'love', 'try', 'hear', 'sample', 'audio', 'app', 'produce', 'want', 'purchase', 'purchased', 'many', 'apps', 'say', 'something', 'deliver', 'please', 'add', 'audio', 'samples', 'text', 'converted', 'love', 'see', 'end', 'results', 'thanks']


## POS Tagging

In [28]:
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PR369694\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [29]:
print(pos_tag(tokens))

[('would', 'MD'), ('love', 'VB'), ('try', 'NN'), ('hear', 'JJ'), ('sample', 'JJ'), ('audio', 'JJ'), ('app', 'NN'), ('produce', 'NN'), ('want', 'VBP'), ('purchase', 'NN'), ('purchased', 'VBD'), ('many', 'JJ'), ('apps', 'NNS'), ('say', 'VBP'), ('something', 'NN'), ('deliver', 'JJ'), ('please', 'NN'), ('add', 'VB'), ('audio', 'JJ'), ('samples', 'NNS'), ('text', 'RB'), ('converted', 'VBN'), ('love', 'NN'), ('see', 'VBP'), ('end', 'JJ'), ('results', 'NNS'), ('thanks', 'NNS')]


## Stemming and Lemmatization

In [31]:
from nltk.stem.porter import PorterStemmer

In [33]:
tokens1 = [PorterStemmer().stem(w) for w in tokens]

In [34]:
print(tokens1)

['would', 'love', 'tri', 'hear', 'sampl', 'audio', 'app', 'produc', 'want', 'purchas', 'purchas', 'mani', 'app', 'say', 'someth', 'deliv', 'pleas', 'add', 'audio', 'sampl', 'text', 'convert', 'love', 'see', 'end', 'result', 'thank']


In [38]:
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PR369694\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [39]:
tokens2 = [WordNetLemmatizer().lemmatize(w) for w in tokens]

In [40]:
print(tokens2)

['would', 'love', 'try', 'hear', 'sample', 'audio', 'app', 'produce', 'want', 'purchase', 'purchased', 'many', 'apps', 'say', 'something', 'deliver', 'please', 'add', 'audio', 'sample', 'text', 'converted', 'love', 'see', 'end', 'result', 'thanks']
