# Tokenization

In [41]:
import nltk
# Download the 'punkt' tokenizer data used by the tokenization cells in this notebook.
nltk.download('punkt')
# Sample sentence for the word-tokenization example.
sent = "Tokenization is the task of splitting a text into meaningful segments, called tokens"

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [42]:
# Split the sentence into word-level tokens; punctuation (the comma) becomes its own token.
word = nltk.word_tokenize(sent)
word

['Tokenization',
 'is',
 'the',
 'task',
 'of',
 'splitting',
 'a',
 'text',
 'into',
 'meaningful',
 'segments',
 ',',
 'called',
 'tokens']

In [43]:
# Three short sentences used to demonstrate sentence-level tokenization.
# NOTE(review): "body" in the first sentence is presumably a typo for "boy" — confirm before
# changing it, since the recorded output reproduces the string as written.
paragraph = "he is a good body. she is a good girl. Boy and Girl are good."
# Produces one string per sentence, each keeping its trailing period.
nltk.sent_tokenize(paragraph)

['he is a good body.', 'she is a good girl.', 'Boy and Girl are good.']

In [44]:
# Sample message for the text-cleaning examples; the Stopwords section assigns this same
# string again before it is actually used.
email = "Hi, you won $40 worth of lottery. You can buy a fa cup coupon using this card."

# Stopwords

Words like "is", "the", "of", "a", "an", etc. do not add much meaning or value to the text, so we use a stopword list to remove them all.

In [45]:
import nltk

In [46]:
# Fetch the NLTK data the following cells rely on: the stopword lists and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [47]:
# English stopwords from NLTK's stopwords corpus; used below to filter the tokenized message.
en_stopwords = nltk.corpus.stopwords.words('english')


In [48]:
# Build the sample message already lowercased, so that case differences (e.g. "You" vs "you")
# do not affect the stopword comparison further down.
email = "Hi, you won $40 worth of lottery. You can buy a fa cup coupon using this card.".lower()

In [49]:
# Tokenize the lowercased message; '$', '40' and the punctuation marks come out as separate tokens.
words = nltk.word_tokenize(email)
print(words)

['hi', ',', 'you', 'won', '$', '40', 'worth', 'of', 'lottery', '.', 'you', 'can', 'buy', 'a', 'fa', 'cup', 'coupon', 'using', 'this', 'card', '.']


In [50]:
# Keep only the tokens that are not English stopwords.
# The stopword collection is turned into a set once, so each membership test is a hash
# lookup instead of a linear scan of the whole list for every token.
en_stopword_set = set(en_stopwords)
cleaned_words = [token for token in words if token not in en_stopword_set]

In [51]:
# Re-assemble the surviving tokens into one space-separated string.
# Punctuation tokens were not filtered out, so they end up surrounded by spaces.
" ".join(cleaned_words)

'hi , $ 40 worth lottery . buy fa cup coupon using card .'

# **Stemming VS Lemmatization**

Stemming: converts words into their root word (stem or base word).
- The resulting stem may not be a meaningful word (e.g. "chang").
- Very fast.

In [52]:
# Porter stemmer demo: apply the stemmer to three related word forms.
# The stems it produces need not be dictionary words.
stemmer = nltk.stem.PorterStemmer()
words = ["change", "changes", "changer"]
list(map(stemmer.stem, words))

['chang', 'chang', 'changer']

# **Lemmatization: Lemmatization deals with reducing the word to its canonical dictionary form. The root word is called a 'lemma' and the method is called lemmatization. -Slow**

In [53]:
# Download the WordNet data used by the WordNetLemmatizer cell below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
# Same three word forms as in the stemming example, this time reduced with the WordNet lemmatizer
# so the two approaches can be compared side by side.
lemmatizer = nltk.stem.WordNetLemmatizer()
words = ["change", "changes", "changer"]
list(map(lemmatizer.lemmatize, words))

['change', 'change', 'changer']

# Removing HTML

In [55]:
# Sample sentence wrapped in <h1>/<b> tags, used by both HTML-stripping approaches below.
text = "<h1><b> I've been recently having more conversations with participants in our self-awareness courses on their marriages. </b></h1>"

In [56]:
from bs4 import BeautifulSoup

# Parse the snippet with the 'html.parser' backend and print only its text content,
# i.e. the sentence without the surrounding tags.
soup = BeautifulSoup(text, 'html.parser')
print(soup.get_text())

 I've been recently having more conversations with participants in our self-awareness courses on their marriages. 


In [57]:
import re

In [58]:
# Strip anything that looks like an HTML tag: '<', any run of characters other than '>', then '>'.
# The negated class also matches newlines (which '.' does not by default), so a tag that is
# broken across lines is removed as well.
# NOTE: a regex is only an approximation of HTML parsing; the BeautifulSoup approach above
# is the more robust choice for real-world markup.
compiler = re.compile(r'<[^>]*>')
compiler.sub('', text)

" I've been recently having more conversations with participants in our self-awareness courses on their marriages. "

# Regular Expression

In [59]:
import re

In [60]:
# Anchored pattern: an 'a', any three characters (other than newlines), then an 's' —
# exactly five characters in total.
pattern = "^a...s$"

In [61]:
# Six-letter test string for the five-character pattern.
string = 'abacus'

In [62]:
# 'abacus' has six characters but the pattern requires exactly five, so this returns None
# and the cell shows no output.
re.match(pattern, string)

In [63]:
# 'abyss' is 'a' + three characters + 's', so the whole string matches.
string = 'abyss'
re.match(pattern, string)

<re.Match object; span=(0, 5), match='abyss'>

In [64]:
# Character class: matches a single 'a', 'b' or 'c'.
pattern = '[abc]'
string = "hey Jude"

In [65]:
# re.match only tries to match at the start of the string; "hey Jude" starts with 'h',
# which is not in the class, so the result is None (no output).
re.match(pattern, string)

In [66]:
# findall scans the whole string and returns every non-overlapping match, in order.
string = "abacus"
re.findall(pattern, string)

['a', 'b', 'a', 'c']

# Extracting email addresses using RegEx

In [67]:
# Multi-paragraph sample text with five Gmail addresses embedded in the prose;
# it is the input for the address-extraction regex below.
string = """
In the bustling digital realm, where communication bridges vast distances with mere clicks, email addresses serve as gateways to connectivity. Among the myriad of addresses, some stand out, like beacons in the night. Take, for instance, example1@gmail.com, a simple yet eloquent address, embodying the essence of clarity. Then there's user1234@gmail.com, a testament to the universality of usernames in the online world. Venturing further into the realms of passion, we encounter gamingfanatic007@gmail.com, where pixels collide and dreams are crafted in virtual worlds.

But let's not forget the whimsical touch of iamawesome42@gmail.com, a declaration of self-assurance amidst the digital cacophony. And finally, nestled among the trials of testing and exploration, we find testaccount555@gmail.com, a beacon of experimentation in the vast ocean of cyberspace.

Each address tells a story, a narrative woven into the fabric of the internet, awaiting connections, conversations, and collaborations yet to unfold.

"""

In [68]:
# Gmail address: a run of letters, digits, '.', '_' or '+' (the local part), then the literal
# domain '@gmail.com', delimited by word boundaries.
# The dot in the domain is escaped so it matches only a real '.', not any character
# (an unescaped '.' would also accept e.g. 'user@gmailXcom'). '@' needs no escaping.
pattern = r"\b[0-9a-zA-Z._+]+@gmail\.com\b"

In [69]:
# Collect every substring of the sample text that matches the Gmail pattern, in order of appearance.
re.findall(pattern, string)

['example1@gmail.com',
 'user1234@gmail.com',
 'gamingfanatic007@gmail.com',
 'iamawesome42@gmail.com',
 'testaccount555@gmail.com']

# Converting Text to vectors


*   Bag of words / CountVectorizer()

*   Tfidf


In [70]:
# Toy corpus of four short documents; each string becomes one row of the document-term matrix.
corpus = [
"he is a good boy",
"She is a good girl",
"boy and girl are good",
"good boy and good girl are good for society"]

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

In [72]:
# Bag-of-words vectorizer with scikit-learn's built-in English stopword list enabled,
# which is why only 'boy', 'girl', 'good' and 'society' remain in the vocabulary below.
vectorizer = CountVectorizer(stop_words='english')

In [73]:
# Learn the vocabulary from the corpus and build the sparse document-term count matrix
# (one row per document, one column per vocabulary term).
X = vectorizer.fit_transform(corpus)
X

<4x4 sparse matrix of type '<class 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [74]:
# Convert the sparse matrix to a dense array so the counts can be read directly
# (fine for a corpus this small).
X.toarray()

array([[1, 0, 1, 0],
       [0, 1, 1, 0],
       [1, 1, 1, 0],
       [1, 1, 3, 1]])

In [75]:
# Learned vocabulary terms; used below as the column labels for X.
vocab = vectorizer.get_feature_names_out() #vocab
vocab

array(['boy', 'girl', 'good', 'society'], dtype=object)

In [76]:
import pandas as pd
# Show the count matrix as a table, labelling each column with its vocabulary term.
pd.DataFrame(X.toarray(), columns = vocab)

Unnamed: 0,boy,girl,good,society
0,1,0,1,0
1,0,1,1,0
2,1,1,1,0
3,1,1,3,1


Advantages:
* Simple

Disadvantages:
* Sparsity

* Out of vocabulary Problem

* Ordering of words is not preserved

* Semantic meaning is lost

* Focuses on most frequent words

# Tfidf
TF-IDF stands for "Term Frequency - Inverse Document Frequency". We will learn what this means mathematically.

$$tf = \frac{\text{No. of repetitions of the word in the sentence}}{\text{No. of words in the sentence}}$$

$$idf = \log\left(\frac{\text{No. of sentences}}{\text{No. of sentences containing the word}}\right)$$

#tf*idf

In [77]:
# Same four-document toy corpus as in the bag-of-words section, redefined here so the
# TF-IDF example does not depend on the earlier cell.
corpus = [
"he is a good boy",
"She is a good girl",
"boy and girl are good",
"good boy and good girl are good for society"]

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [79]:
import pandas as pd

# Fit a TF-IDF model with default settings. No stopword list is passed here, so words such as
# 'and', 'are', 'he' and 'is' stay in the vocabulary (unlike the CountVectorizer example).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Print the learned vocabulary; the same terms label the columns of the table below.
vocab = vectorizer.get_feature_names_out()
print(vocab)

# One row per document, one TF-IDF weight per vocabulary term.
pd.DataFrame(X.toarray(), columns=vocab)

['and' 'are' 'boy' 'for' 'girl' 'good' 'he' 'is' 'she' 'society']


Unnamed: 0,and,are,boy,for,girl,good,he,is,she,society
0,0.0,0.0,0.420753,0.0,0.0,0.343993,0.659191,0.519714,0.0,0.0
1,0.0,0.0,0.0,0.0,0.420753,0.343993,0.0,0.519714,0.659191,0.0
2,0.51647,0.51647,0.418127,0.0,0.418127,0.341846,0.0,0.0,0.0,0.0
3,0.30903,0.30903,0.250186,0.391965,0.250186,0.613631,0.0,0.0,0.0,0.391965


# How to preserve semantic meaning?

_For ex: ngram_range of (1,1) means only unigram, (1,2) means unigram and bigram, (2, 2) means bigram only._