### Stemming stopwords and Word2Vec

<!-- Do web Scraping  -->

sentence = "This is my computer I do programming and computing in it. Programs I generate can be used for generating synthetic images"

In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Stemming

In [2]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [3]:
ps = PorterStemmer()
sentence="This is my computer I do programming and computing in it. Programs I generate can be used for generating synthetic images"
words = word_tokenize(sentence)

In [4]:
for w in words:
    print(w," : ",ps.stem(w))

This  :  thi
is  :  is
my  :  my
computer  :  comput
I  :  i
do  :  do
programming  :  program
and  :  and
computing  :  comput
in  :  in
it  :  it
.  :  .
Programs  :  program
I  :  i
generate  :  gener
can  :  can
be  :  be
used  :  use
for  :  for
generating  :  gener
synthetic  :  synthet
images  :  imag


### Stopwords

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
stopWords = set(stopwords.words('english'))
wordsfiltered = [w for w in words if w not in stopWords]

In [7]:
wordsfiltered

['This',
 'computer',
 'I',
 'programming',
 'computing',
 '.',
 'Programs',
 'I',
 'generate',
 'used',
 'generating',
 'synthetic',
 'images']

### Word2Vec

In [8]:
from gensim.models import Word2Vec
model = Word2Vec([wordsfiltered], vector_size=100, window=5, min_count=1, workers=4)
word_vectors = model.wv


In [9]:
for word in wordsfiltered:
    vector = word_vectors[word]
    print(f"Word: {word}, Vector: {vector}")

Word: This, Vector: [ 9.7702928e-03  8.1651136e-03  1.2809718e-03  5.0975787e-03
  1.4081288e-03 -6.4551616e-03 -1.4280510e-03  6.4491653e-03
 -4.6173059e-03 -3.9930656e-03  4.9244044e-03  2.7130984e-03
 -1.8479753e-03 -2.8769434e-03  6.0107317e-03 -5.7167388e-03
 -3.2367026e-03 -6.4878250e-03 -4.2346325e-03 -8.5809948e-03
 -4.4697891e-03 -8.5112294e-03  1.4037776e-03 -8.6181965e-03
 -9.9166557e-03 -8.2016252e-03 -6.7726658e-03  6.6805850e-03
  3.7845564e-03  3.5616636e-04 -2.9579818e-03 -7.4283206e-03
  5.3341867e-04  4.9989222e-04  1.9561886e-04  8.5259555e-04
  7.8633073e-04 -6.8160298e-05 -8.0070542e-03 -5.8702733e-03
 -8.3829118e-03 -1.3120425e-03  1.8206370e-03  7.4171280e-03
 -1.9634271e-03 -2.3252917e-03  9.4871549e-03  7.9704521e-05
 -2.4045217e-03  8.6048469e-03  2.6870037e-03 -5.3439722e-03
  6.5881060e-03  4.5101536e-03 -7.0544672e-03 -3.2317400e-04
  8.3448651e-04  5.7473574e-03 -1.7176545e-03 -2.8065301e-03
  1.7484308e-03  8.4717153e-04  1.1928272e-03 -2.6342822e-03
 -5.

### Webscrape a news website

In [10]:
import requests
from bs4 import BeautifulSoup

In [11]:
url = 'https://indianexpress.com/'

In [12]:
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
text = soup.get_text()

### Tokenizing

In [13]:
text_words=word_tokenize(text)

In [14]:
text_words

['Latest',
 'News',
 'Today',
 ':',
 'Breaking',
 'News',
 'and',
 'Top',
 'Headlines',
 'from',
 'India',
 ',',
 'Entertainment',
 ',',
 'Business',
 ',',
 'Politics',
 'and',
 'Sports',
 '|',
 'The',
 'Indian',
 'Express',
 'Sections',
 'English‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¥Æ‡¥≤‡¥Ø‡¥æ‡¥≥‡¥Ç‡™ó‡´Å‡™ú‡™∞‡™æ‡™§‡´Ä‡§π‡§ø‡§Ç‡§¶‡•Ä‡§Æ‡§∞‡§æ‡§†‡•ÄBusiness‡§¨‡§ø‡•õ‡§®‡•á‡§∏',
 'Newsletters',
 'Tuesday',
 ',',
 'Feb',
 '20',
 ',',
 '2024',
 'ePaper',
 'Today',
 '‚Äô',
 's',
 'Paper',
 'Journalism',
 'of',
 'Courage',
 'HomeCitiesIndiaExplainedOpinionBusinessEntertainmentSportsPoliticsUPSC',
 'EssentialsLifestyleTechInvestigationsResearchVideos',
 'Subscribe',
 'Sign',
 'In',
 'TrendingUPSC',
 'PackPlay',
 'CrosswordExpress',
 'ShortsüéôÔ∏è',
 'PodcastPremium',
 'StoriesHealth',
 '&',
 'WellnessTake',
 'Our',
 'Reader',
 'Survey',
 'Advertisement',
 'SC',
 'sets',
 'aside',
 'Chandigarh',
 'mayoral',
 'poll',
 'result',
 ',',
 'declares',
 'AAP',
 'candidate',
 'the',
 'winnerThe',
 'Suprem

### Stemming

In [15]:
for w in text_words:
    print(w," : ",ps.stem(w))

Latest  :  latest
News  :  new
Today  :  today
:  :  :
Breaking  :  break
News  :  new
and  :  and
Top  :  top
Headlines  :  headlin
from  :  from
India  :  india
,  :  ,
Entertainment  :  entertain
,  :  ,
Business  :  busi
,  :  ,
Politics  :  polit
and  :  and
Sports  :  sport
|  :  |
The  :  the
Indian  :  indian
Express  :  express
Sections  :  section
English‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¥Æ‡¥≤‡¥Ø‡¥æ‡¥≥‡¥Ç‡™ó‡´Å‡™ú‡™∞‡™æ‡™§‡´Ä‡§π‡§ø‡§Ç‡§¶‡•Ä‡§Æ‡§∞‡§æ‡§†‡•ÄBusiness‡§¨‡§ø‡•õ‡§®‡•á‡§∏  :  english‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¥Æ‡¥≤‡¥Ø‡¥æ‡¥≥‡¥Ç‡™ó‡´Å‡™ú‡™∞‡™æ‡™§‡´Ä‡§π‡§ø‡§Ç‡§¶‡•Ä‡§Æ‡§∞‡§æ‡§†‡•Äbusiness‡§¨‡§ø‡•õ‡§®‡•á‡§∏
Newsletters  :  newslett
Tuesday  :  tuesday
,  :  ,
Feb  :  feb
20  :  20
,  :  ,
2024  :  2024
ePaper  :  epap
Today  :  today
‚Äô  :  ‚Äô
s  :  s
Paper  :  paper
Journalism  :  journal
of  :  of
Courage  :  courag
HomeCitiesIndiaExplainedOpinionBusinessEntertainmentSportsPoliticsUPSC  :  homecitiesindiaexplainedopinionbusinessentertainmentsportspoliticsupsc
Essenti

In [16]:
filtered_text = [w for w in text_words if w not in stopWords]

In [17]:
filtered_text

['Latest',
 'News',
 'Today',
 ':',
 'Breaking',
 'News',
 'Top',
 'Headlines',
 'India',
 ',',
 'Entertainment',
 ',',
 'Business',
 ',',
 'Politics',
 'Sports',
 '|',
 'The',
 'Indian',
 'Express',
 'Sections',
 'English‡Æ§‡ÆÆ‡Æø‡Æ¥‡Øç‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¥Æ‡¥≤‡¥Ø‡¥æ‡¥≥‡¥Ç‡™ó‡´Å‡™ú‡™∞‡™æ‡™§‡´Ä‡§π‡§ø‡§Ç‡§¶‡•Ä‡§Æ‡§∞‡§æ‡§†‡•ÄBusiness‡§¨‡§ø‡•õ‡§®‡•á‡§∏',
 'Newsletters',
 'Tuesday',
 ',',
 'Feb',
 '20',
 ',',
 '2024',
 'ePaper',
 'Today',
 '‚Äô',
 'Paper',
 'Journalism',
 'Courage',
 'HomeCitiesIndiaExplainedOpinionBusinessEntertainmentSportsPoliticsUPSC',
 'EssentialsLifestyleTechInvestigationsResearchVideos',
 'Subscribe',
 'Sign',
 'In',
 'TrendingUPSC',
 'PackPlay',
 'CrosswordExpress',
 'ShortsüéôÔ∏è',
 'PodcastPremium',
 'StoriesHealth',
 '&',
 'WellnessTake',
 'Our',
 'Reader',
 'Survey',
 'Advertisement',
 'SC',
 'sets',
 'aside',
 'Chandigarh',
 'mayoral',
 'poll',
 'result',
 ',',
 'declares',
 'AAP',
 'candidate',
 'winnerThe',
 'Supreme',
 'Court',
 'deprecated',
 'conduct',
 'pre