In [20]:
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
our_text="Artificial Intelligence and Machine Learning are going to change the future of technology!"

In [6]:
# Tokenization
tokens=word_tokenize(our_text)
print(tokens)

['Artificial', 'Intelligence', 'and', 'Machine', 'Learning', 'are', 'going', 'to', 'change', 'the', 'future', 'of', 'technology', '!']


In [8]:
#cleaning
clean_tokens=[re.sub('[^a-zA-Z0-9]','',token.lower()) for token in tokens if token.isalpha()]
print(clean_tokens)

['artificial', 'intelligence', 'and', 'machine', 'learning', 'are', 'going', 'to', 'change', 'the', 'future', 'of', 'technology']


In [14]:
#Stop Words Removal
#1. Stop Word set of english
stop_words=set(stopwords.words('english'))
print(stop_words)



{'it', 'own', "mightn't", 't', 'ma', "shouldn't", "you're", "we'd", "wasn't", 'over', "they'll", 'there', "we've", 'to', 'whom', 'themselves', 'further', 'some', 've', 'mustn', "you've", 'my', 'myself', 'itself', 'most', "she'd", 'these', 'them', 'won', "i'll", 'hers', 'now', "we'll", 'between', 'each', 'her', "hadn't", "she's", "shan't", 'from', "it's", "i'm", "we're", 'wouldn', 'd', 'his', "wouldn't", 'below', "isn't", 'off', 'that', 'him', 'shouldn', 'on', 'weren', 'after', 'is', 'don', 'haven', 'himself', 'then', 'isn', 'should', 'was', 'yourselves', 'o', 'for', 'at', "you'd", 'wasn', 'again', 'ain', 'herself', 'are', 're', 'ourselves', "they're", "aren't", 'm', "you'll", "that'll", 'its', 'those', 'here', 'what', 'not', 'both', 'while', 'because', 'did', 'aren', 'had', 'hadn', 'too', "he's", 'with', 'a', 'he', 'will', 'being', 'can', 'didn', "mustn't", 'if', 'above', "he'd", 'against', 'nor', 'until', "won't", 'few', 'how', 'an', 'very', "i've", 'does', "should've", 'only', 'ours'

In [15]:
filtered_tokens=[token for token in clean_tokens if token not in stop_words] #2. stop  word removal example
print(filtered_tokens)

['artificial', 'intelligence', 'machine', 'learning', 'going', 'change', 'future', 'technology']


In [16]:
#Stemming: Reduce the words to their root form (It's not always a dictionary word)
stemmer=PorterStemmer()
stemmed_tokens=[stemmer.stem(token) for token in filtered_tokens]
print(stemmed_tokens)
#

['artifici', 'intellig', 'machin', 'learn', 'go', 'chang', 'futur', 'technolog']


In [17]:
#Lemmatization: Convert words to their dictionary form (lemma) using POS: part of speech
lemmatizer=WordNetLemmatizer()
lemmatized_tokens=[lemmatizer.lemmatize(token) for token in filtered_tokens]
print(lemmatized_tokens)

['artificial', 'intelligence', 'machine', 'learning', 'going', 'change', 'future', 'technology']


In [19]:
#Vectorization: Converting text tokens into numbers so ML models can understand them.
#Vectorization expect sentences not token list
text=[" ".join(lemmatized_tokens)]
print (text)


['artificial intelligence machine learning going change future technology']


In [21]:
vectorizer=CountVectorizer()
X=vectorizer.fit_transform(text)
print(vectorizer.get_feature_names_out())


['artificial' 'change' 'future' 'going' 'intelligence' 'learning'
 'machine' 'technology']


In [22]:
print(X.toarray())

[[1 1 1 1 1 1 1 1]]


In [23]:
dataset=[
    'Artificial Intelligence is future.',
    'Machine Learning is a subset of Artificial Intelligence',
    'Deep Learning is a subset of Machine Learning and Machine Learing is importent for Artificial Intelligence',
    'Natural Language Processing is also a part of Machine Learning and Artificial Intelligence',
    'Computer Vision is also a part of Machine Learning and Artificial Intelligence'
]



In [28]:
# Tokenization
tokens=[word_tokenize(sentence.lower()) for sentence in dataset]
print(tokens)

[['artificial', 'intelligence', 'is', 'future', '.'], ['machine', 'learning', 'is', 'a', 'subset', 'of', 'artificial', 'intelligence'], ['deep', 'learning', 'is', 'a', 'subset', 'of', 'machine', 'learning', 'and', 'machine', 'learing', 'is', 'importent', 'for', 'artificial', 'intelligence'], ['natural', 'language', 'processing', 'is', 'also', 'a', 'part', 'of', 'machine', 'learning', 'and', 'artificial', 'intelligence'], ['computer', 'vision', 'is', 'also', 'a', 'part', 'of', 'machine', 'learning', 'and', 'artificial', 'intelligence']]


In [30]:
#Stop Words Removal
stop_words=set(stopwords.words('english'))
filtered_tokens=[[token for token in sentence if token.isalpha() and token not in stop_words] for sentence in tokens]
print(filtered_tokens)

[['artificial', 'intelligence', 'future'], ['machine', 'learning', 'subset', 'artificial', 'intelligence'], ['deep', 'learning', 'subset', 'machine', 'learning', 'machine', 'learing', 'importent', 'artificial', 'intelligence'], ['natural', 'language', 'processing', 'also', 'part', 'machine', 'learning', 'artificial', 'intelligence'], ['computer', 'vision', 'also', 'part', 'machine', 'learning', 'artificial', 'intelligence']]


In [31]:
#Lemmatization
lemmatizer=WordNetLemmatizer()
lemmatized_tokens=[[lemmatizer.lemmatize(token) for token in sentence] for sentence in filtered_tokens]
print(lemmatized_tokens)

[['artificial', 'intelligence', 'future'], ['machine', 'learning', 'subset', 'artificial', 'intelligence'], ['deep', 'learning', 'subset', 'machine', 'learning', 'machine', 'learing', 'importent', 'artificial', 'intelligence'], ['natural', 'language', 'processing', 'also', 'part', 'machine', 'learning', 'artificial', 'intelligence'], ['computer', 'vision', 'also', 'part', 'machine', 'learning', 'artificial', 'intelligence']]


In [32]:
clean_dataset=[" ".join(sentence) for sentence in lemmatized_tokens]
print(clean_dataset)
#

['artificial intelligence future', 'machine learning subset artificial intelligence', 'deep learning subset machine learning machine learing importent artificial intelligence', 'natural language processing also part machine learning artificial intelligence', 'computer vision also part machine learning artificial intelligence']


In [33]:
bow_vectorizer=CountVectorizer()
X=bow_vectorizer.fit_transform(clean_dataset)


In [34]:
print(bow_vectorizer.get_feature_names_out())

['also' 'artificial' 'computer' 'deep' 'future' 'importent' 'intelligence'
 'language' 'learing' 'learning' 'machine' 'natural' 'part' 'processing'
 'subset' 'vision']


In [37]:
print('Bag of Words Matrix')
print(X.toarray())
#

Bag of Words Matrix
[[0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 1 0 0 1 1 0 0 0 1 0]
 [0 1 0 1 0 1 1 0 1 2 2 0 0 0 1 0]
 [1 1 0 0 0 0 1 1 0 1 1 1 1 1 0 0]
 [1 1 1 0 0 0 1 0 0 1 1 0 1 0 0 1]]


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
print(text)


['artificial intelligence machine learning going change future technology']


In [40]:
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(text)
print(vectorizer.get_feature_names_out())
print(X.toarray())

['artificial' 'change' 'future' 'going' 'intelligence' 'learning'
 'machine' 'technology']
[[0.35355339 0.35355339 0.35355339 0.35355339 0.35355339 0.35355339
  0.35355339 0.35355339]]


In [41]:
movie_review=[
    'I love this movie',
    'This movie is excellent',
    'I hate this movie because too much fight',
    'I am a fan of this product',
    'I hate this product',
    'It is truly a masterpiece ,great movie'

]

In [42]:
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(movie_review)


In [43]:
import pandas as pd
df=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names_out())
print(df)

         am   because  excellent  ...      this       too     truly
0  0.000000  0.000000   0.000000  ...  0.403215  0.000000  0.000000
1  0.000000  0.000000   0.661272  ...  0.338787  0.000000  0.000000
2  0.000000  0.434912   0.000000  ...  0.222817  0.434912  0.000000
3  0.504119  0.000000   0.000000  ...  0.258274  0.000000  0.000000
4  0.000000  0.000000   0.000000  ...  0.404106  0.000000  0.000000
5  0.000000  0.000000   0.000000  ...  0.000000  0.000000  0.446127

[6 rows x 18 columns]
