In [2]:
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer



In [3]:
# Fetch the NLTK resources this notebook relies on (stored under the local nltk_data directory).
nltk.download('punkt')      # tokenizer models (unzipped under tokenizers/)
nltk.download('stopwords')  # stop-word lists read later via stopwords.words('english')
nltk.download('wordnet')    # presumably the data behind WordNetLemmatizer — confirm
nltk.download('omw-1.4')    # NOTE(review): presumably a companion resource for wordnet — confirm it is required
nltk.download('punkt_tab')  # tokenizer tables (unzipped under tokenizers/)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Sample sentence used by the basic preprocessing walkthrough.
# The misspelling 'futrue' is kept verbatim: it flows through every later step.
our_text = (
    'Artificial Intelligence and Machine Learning are going to change '
    'the futrue of IT industry and technology !'
)

In [5]:
# Tokenization: split the raw sentence into word and punctuation tokens.
tokens = word_tokenize(our_text)
print(tokens)

['Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'are', 'going', 'to', 'change', 'the', 'futrue', 'of', 'IT', 'industry', 'and', 'technology', '!']


In [7]:
# Cleaning: keep only purely alphabetic tokens, which drops punctuation such as '!'.
# The isalpha() filter already guarantees that every kept token consists of letters,
# so the former re.sub(r'[^a-zA-Z0-9]', '', ...) pass did nothing for ASCII words and
# would mangle accented ones (e.g. 'café' -> 'caf'). Filtering alone gives the same
# result for ASCII input and matches the isalpha()-only filter used later in the notebook.
clean_tokens = [token for token in tokens if token.isalpha()]
print(clean_tokens)

['Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'are', 'going', 'to', 'change', 'the', 'futrue', 'of', 'IT', 'industry', 'and', 'technology']


In [8]:
# English stop words, held in a set for the membership checks in the next step.
# Note: a set has no defined order, so the printed listing is in arbitrary order.
stop_words = {word for word in stopwords.words('english')}
print(stop_words)

{"she's", "should've", 'while', 'are', 'no', 'in', "they'll", 'that', 'we', "she'd", 'themselves', 'very', 'their', 'theirs', 'ours', 'about', 'can', 're', "you'll", 't', 'our', 'into', "they'd", 'or', 'off', 'between', 'before', 'why', 'from', "hadn't", 'ourselves', 'should', "we'll", "haven't", 'hadn', 'myself', 'after', 'y', "shan't", 'out', "don't", 'not', "they're", 'having', 'you', 'because', 'just', 'down', 'few', 'which', 'during', "needn't", 'most', 'hasn', 'again', "won't", "he's", 'its', "we'd", "he'll", 'himself', 'his', "shouldn't", 'any', 'own', 'those', 'over', 'ma', 'than', 'been', 'at', 'him', 'the', 'm', 'mustn', 'below', "you've", 'ain', 'through', 'shan', 'aren', 'above', "i'll", 'by', "isn't", 'yourself', 'hers', 'both', "wasn't", 'further', 'weren', 'all', "you're", "they've", 'here', 'did', "wouldn't", 'have', "it'd", 'they', 'll', 'until', "i'd", 'didn', "hasn't", "i've", 'and', 'your', 'where', 'she', 'some', 'be', 'shouldn', 'doing', 'to', "weren't", "didn't",

In [9]:
# Stop-word removal: compare in lowercase, but keep each surviving token's original casing.
# Gotcha: the acronym 'IT' lowercases to the stop word 'it', so it is removed here as well.
filtered_tokens = list(
    filter(lambda token: token.lower() not in stop_words, clean_tokens)
)
print(filtered_tokens)

['Artificial', 'Intelligence', 'Machine', 'Learning', 'going', 'change', 'futrue', 'industry', 'technology']


In [10]:
# Stemming: reduce each word to its root form (the stem is not always a dictionary word,
# e.g. 'artifici'); the stems in the output are lowercased.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print(stemmed_tokens)

['artifici', 'intellig', 'machin', 'learn', 'go', 'chang', 'futru', 'industri', 'technolog']


In [11]:
# Lemmatization: map words to their dictionary form (lemma).
# NOTE: no part-of-speech tag is passed, so lemmatize() falls back to its default POS;
# presumably that is why verb forms such as 'going' come back unchanged in the output.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
print(lemmatized_tokens)

['Artificial', 'Intelligence', 'Machine', 'Learning', 'going', 'change', 'futrue', 'industry', 'technology']


In [12]:

# The vectorizer is fed a list of sentences (strings), not a list of tokens,
# so join the tokens back into one sentence and wrap it in a single-element list.
text = [" ".join(lemmatized_tokens)]
print(text)

['Artificial Intelligence Machine Learning going change futrue industry technology']


In [14]:
# Vectorization: convert the text into numeric token counts so an ML model can consume it.
vectorizer = CountVectorizer()
vectorized_text = vectorizer.fit_transform(text)
print(vectorized_text.toarray())

[[1 1 1 1 1 1 1 1 1]]


In [16]:
# Vocabulary learned by the vectorizer; the output shows the terms lowercased.
print(vectorizer.get_feature_names_out())

['artificial' 'change' 'futrue' 'going' 'industry' 'intelligence'
 'learning' 'machine' 'technology']


# Count Vectorizer

In [28]:
import pandas as pd

In [29]:
# Toy corpus of short email texts for the CountVectorizer demo.
# The misspelling 'metting' is kept verbatim; it ends up as its own vocabulary entry.
emails = [
    "Win money now",
    "Limited offer win cash",
    "Meeting scheduled for tomorrow",
    "Project discussion metting",
    "Win free tickets now",
    "Tomorrow is the project deadline",
    "Meeting scheduled for next week",
]

In [30]:
# Fit a fresh CountVectorizer on the emails and build their count matrix.
# Note: this rebinds `vectorizer`, replacing the one fitted on the earlier sentence.
vectorizer = CountVectorizer()
vectorized_emails = vectorizer.fit_transform(emails)

In [31]:
# Vocabulary learned from the emails; the DataFrame built below uses it as column labels.
vectorizer.get_feature_names_out()

array(['cash', 'deadline', 'discussion', 'for', 'free', 'is', 'limited',
       'meeting', 'metting', 'money', 'next', 'now', 'offer', 'project',
       'scheduled', 'the', 'tickets', 'tomorrow', 'week', 'win'],
      dtype=object)

In [32]:
# Tabulate the counts: one row per email, one column per vocabulary term.
df = pd.DataFrame(
    data=vectorized_emails.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [33]:
# Display the email count matrix: one row per email, one column per vocabulary term.
df

Unnamed: 0,cash,deadline,discussion,for,free,is,limited,meeting,metting,money,next,now,offer,project,scheduled,the,tickets,tomorrow,week,win
0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
3,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1
5,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0
6,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0


# NLP
* Basics: one more example

In [34]:
# Small corpus of AI-related sentences for the second bag-of-words example.
# The misspellings 'Learing' and 'importent' in the third sentence are kept verbatim.
dataset = [
    "Artificial Intelligence is future.",
    "Machine Learning is a subset of Artificial Intelligence",
    "Deep Learning is a subset of Machine Learning and Machine Learing is importent for Artificial Intelligence",
    "Natural Language Processing is also a part of Machine Learning and Artificial Intelligence",
    "Computer Vision is also a part of Machine Learning and Artificial Intelligence",
]


In [35]:
# Tokenization: lowercase every sentence, then split it into tokens (one token list per sentence).
tokens = [word_tokenize(lowered) for lowered in map(str.lower, dataset)]
print(tokens)

[['artificial', 'intelligence', 'is', 'future', '.'], ['machine', 'learning', 'is', 'a', 'subset', 'of', 'artificial', 'intelligence'], ['deep', 'learning', 'is', 'a', 'subset', 'of', 'machine', 'learning', 'and', 'machine', 'learing', 'is', 'importent', 'for', 'artificial', 'intelligence'], ['natural', 'language', 'processing', 'is', 'also', 'a', 'part', 'of', 'machine', 'learning', 'and', 'artificial', 'intelligence'], ['computer', 'vision', 'is', 'also', 'a', 'part', 'of', 'machine', 'learning', 'and', 'artificial', 'intelligence']]


In [38]:
# Stop-word removal: per sentence, keep only alphabetic tokens that are not English stop words.
# Tokens were lowercased during tokenization, so they are compared to the stop-word set directly.
stop_words = set(stopwords.words('english'))
filtered_tokens = [
    list(filter(lambda token: token.isalpha() and token not in stop_words, sentence))
    for sentence in tokens
]
print(filtered_tokens)

[['artificial', 'intelligence', 'future'], ['machine', 'learning', 'subset', 'artificial', 'intelligence'], ['deep', 'learning', 'subset', 'machine', 'learning', 'machine', 'learing', 'importent', 'artificial', 'intelligence'], ['natural', 'language', 'processing', 'also', 'part', 'machine', 'learning', 'artificial', 'intelligence'], ['computer', 'vision', 'also', 'part', 'machine', 'learning', 'artificial', 'intelligence']]


In [40]:
# Lemmatization: lemmatize every token of every sentence.
# No part-of-speech tag is passed, so lemmatize() uses its default POS.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [
    list(map(lemmatizer.lemmatize, sentence)) for sentence in filtered_tokens
]
print(lemmatized_tokens)

[['artificial', 'intelligence', 'future'], ['machine', 'learning', 'subset', 'artificial', 'intelligence'], ['deep', 'learning', 'subset', 'machine', 'learning', 'machine', 'learing', 'importent', 'artificial', 'intelligence'], ['natural', 'language', 'processing', 'also', 'part', 'machine', 'learning', 'artificial', 'intelligence'], ['computer', 'vision', 'also', 'part', 'machine', 'learning', 'artificial', 'intelligence']]


In [41]:
# Re-assemble each token list into a space-separated sentence for the vectorizer.
clean_dataset = list(map(" ".join, lemmatized_tokens))
print(clean_dataset)

['artificial intelligence future', 'machine learning subset artificial intelligence', 'deep learning subset machine learning machine learing importent artificial intelligence', 'natural language processing also part machine learning artificial intelligence', 'computer vision also part machine learning artificial intelligence']


In [42]:
# Fit a fresh CountVectorizer on the cleaned sentences and build the bag-of-words matrix.
vectorizer = CountVectorizer()
vectorized_dataset = vectorizer.fit_transform(clean_dataset)

In [43]:
# Vocabulary learned from the cleaned dataset; the input misspellings 'learing' and
# 'importent' appear as separate features in the output.
print(vectorizer.get_feature_names_out())

['also' 'artificial' 'computer' 'deep' 'future' 'importent' 'intelligence'
 'language' 'learing' 'learning' 'machine' 'natural' 'part' 'processing'
 'subset' 'vision']


In [44]:
# Print a heading followed by the dense bag-of-words matrix (one row per sentence).
print('Bag of words matrix', vectorized_dataset.toarray(), sep='\n')

Bag of words matrix
[[0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0]
 [0 1 0 1 0 1 1 0 1 2 2 0 0 0 1 0]
 [1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 0]
 [1 1 1 0 0 0 1 0 0 1 1 0 1 0 0 1]]


In [45]:
# Convert the bag-of-words matrix to a DataFrame labelled with the vocabulary terms, then display it.
df = pd.DataFrame(
    data=vectorized_dataset.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,also,artificial,computer,deep,future,importent,intelligence,language,learing,learning,machine,natural,part,processing,subset,vision
0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,1,0
2,0,1,0,1,0,1,1,0,1,2,2,0,0,0,1,0
3,1,1,0,0,0,0,1,1,0,1,1,1,1,1,0,0
4,1,1,1,0,0,0,1,0,0,1,1,0,1,0,0,1


### **Day-17 Exercise -01**
* Imagine you are working in a company and you receive customer feedback. You want to convert the text into numbers so that an ML model can understand it.
* The sample customer feedback dataset is as follows.


In [46]:
# Sample customer feedback for the exercise (texts kept verbatim, including their typos).
customer_feedback = [
    "The product quality is very good",
    "I love this phone, camera is very nice",
    "Worst product ever please dont buy",
    "Customer support is pathetic not statisfied",
    "Excellent Customer Support , The have resolved my issue",
    "Very disappointing experience",
    "Fast delivery good packaging",
]

**TF-IDF**: Term Frequency × Inverse Document Frequency
* It gives high importance to meaningful words and low importance to common words.

In [47]:
# Customer feedback corpus for the TF-IDF demo.
# This repeats the exercise dataset defined in the previous cell, with the same texts verbatim.
customer_feedback = [
    "The product quality is very good",
    "I love this phone, camera is very nice",
    "Worst product ever please dont buy",
    "Customer support is pathetic not statisfied",
    "Excellent Customer Support , The have resolved my issue",
    "Very disappointing experience",
    "Fast delivery good packaging",
]

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
# Fit a TF-IDF vectorizer on the feedback texts and build the weighted document-term matrix.
vectorizer = TfidfVectorizer()
vectorized_feedback = vectorizer.fit_transform(customer_feedback)

In [50]:
# Show the TF-IDF weights as a table: one row per feedback text, one column per term.
df = pd.DataFrame(
    data=vectorized_feedback.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,buy,camera,customer,delivery,disappointing,dont,ever,excellent,experience,fast,...,please,product,quality,resolved,statisfied,support,the,this,very,worst
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.411257,0.495439,0.0,0.0,0.0,0.411257,0.0,0.351529,0.0
1,0.0,0.408015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408015,0.289499,0.0
2,0.419257,0.0,0.0,0.0,0.0,0.419257,0.419257,0.0,0.0,0.0,...,0.419257,0.348019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419257
3,0.0,0.0,0.375704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.452608,0.375704,0.0,0.0,0.0,0.0
4,0.0,0.0,0.312249,0.0,0.0,0.0,0.0,0.376165,0.0,0.0,...,0.0,0.0,0.0,0.376165,0.0,0.312249,0.312249,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.632022,0.0,0.0,0.0,0.632022,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.448438,0.0
6,0.0,0.0,0.0,0.520647,0.0,0.0,0.0,0.0,0.0,0.520647,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
