In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [69]:
# Sample Data

corpus = [
    "This movie is scray and long.",
    "This movie is not scray but it is long.",
    "Overall, the movie is fine.",
    "This movie is overall good."
]

## **Bag-of-Words(BOW)**

In [None]:
vect = CountVectorizer()
X = vect.fit_transform(corpus)

In [None]:
print(vect.get_feature_names_out())
print(X.toarray())

['and' 'but' 'fine' 'good' 'is' 'it' 'long' 'movie' 'not' 'overall'
 'scray' 'the' 'this']
[[1 0 0 0 1 0 1 1 0 0 1 0 1]
 [0 1 0 0 2 1 1 1 1 0 1 0 1]
 [0 0 1 0 1 0 0 1 0 1 0 1 0]
 [0 0 0 1 1 0 0 1 0 1 0 0 1]]


## **TF-IDF**

In [None]:
tf_vect = TfidfVectorizer()
X_tf = tf_vect.fit_transform(corpus)

In [None]:
print(tf_vect.get_feature_names_out())
print(X_tf.toarray())

['and' 'but' 'fine' 'good' 'is' 'it' 'long' 'movie' 'not' 'overall'
 'scray' 'the' 'this']
[[0.55943386 0.         0.         0.         0.2919358  0.
  0.44106408 0.2919358  0.         0.         0.44106408 0.
  0.35707939]
 [0.         0.40783426 0.         0.         0.42564968 0.40783426
  0.32154122 0.21282484 0.40783426 0.         0.32154122 0.
  0.26031533]
 [0.         0.         0.56199026 0.         0.29326983 0.
  0.         0.29326983 0.         0.44307958 0.         0.56199026
  0.        ]
 [0.         0.         0.         0.62334157 0.32528549 0.
  0.         0.32528549 0.         0.49144966 0.         0.
  0.39787085]]


## **Web Scrapping & Word2Vec**

In [99]:
import bs4 as bs
import urllib.request
import re
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Natural_Language_Processing')
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [100]:
article_text

'Natural language processing (NLP) is an interdisciplinary subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.  The goal is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.\nChallenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation.\nNatural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time t

In [101]:
# Cleaing the text
processed_article = article_text.lower()
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
processed_article = re.sub(r'\s+', ' ', processed_article)

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [102]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=2)

In [103]:
v1 = word2vec.wv['natural']

In [104]:
v1

array([-0.00869064,  0.00378438,  0.00538187,  0.00553134,  0.00771393,
       -0.007157  ,  0.00165784,  0.00728173, -0.00318505, -0.00691072,
       -0.00043627, -0.00952769, -0.00581128,  0.0075982 ,  0.00348346,
        0.00722565,  0.00690645,  0.00705527, -0.00382736, -0.00165207,
        0.00273543, -0.00413541,  0.00844212, -0.01003117,  0.00665031,
        0.0031199 , -0.00515758,  0.00406614, -0.00180159,  0.00661922,
        0.01041669, -0.00427885, -0.0002148 , -0.00626602,  0.0035467 ,
        0.0034898 ,  0.00751932,  0.00620364,  0.00964136,  0.00879245,
        0.00826574, -0.00710364, -0.00942547, -0.00044008, -0.00274929,
        0.00791391,  0.00553018, -0.00144595,  0.00165137,  0.00202249,
        0.00769151, -0.00980415,  0.00010188,  0.00342743, -0.00080541,
        0.00857465,  0.00941438,  0.00665714, -0.00095007,  0.00826924,
       -0.00867033,  0.00291828, -0.00435947, -0.00508096,  0.003319  ,
        0.00578243,  0.00810919, -0.00542858,  0.00707849,  0.00

In [105]:
sim_words = word2vec.wv.most_similar('intelligence')

In [106]:
sim_words

[('standing', 0.2857658267021179),
 ('weights', 0.20401710271835327),
 ('generation', 0.1854391247034073),
 ('behaviour', 0.1752675175666809),
 ('system', 0.16358298063278198),
 ('proposed', 0.15643902122974396),
 ('understanding', 0.15008430182933807),
 ('different', 0.13415870070457458),
 ('intermediate', 0.12377858906984329),
 ('paradigm', 0.10384783148765564)]