In [15]:
import nltk
import pandas as pd 
import numpy as np 

In [2]:
# Fetch the NLTK resources used by the cells below: the Punkt tokenizer data
# (for word_tokenize) and the English stop-word list.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' models; without it word_tokenize raises LookupError there.
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Toy corpus of four short English sentences. Every later step (cleaning,
# bag-of-words, TF-IDF, Word2Vec) is derived from this list.
corpus = [
    "Natural language processing is a part of artificial intelligence",
    "Machine learning and deep learning are subsets of AI",
    "NLP uses machine learning techniques",
    "Deep learning is powerful for NLP tasks"
]


In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

stop_words = set(stopwords.words("english"))
# Individual punctuation characters, for per-character membership tests.
_punctuation_chars = frozenset(string.punctuation)

def preprocess(text):
    """Lower-case and tokenize ``text``, then drop stop words and punctuation.

    Parameters
    ----------
    text : str
        Raw sentence or document.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    # Drop a token when *every* character is punctuation. A substring test
    # against string.punctuation only matches tokens that happen to be a
    # contiguous run of that string, so "...", "--", "``" and "''" would
    # survive it.
    tokens = [
        w for w in tokens
        if w not in stop_words
        and not all(ch in _punctuation_chars for ch in w)
    ]
    return " ".join(tokens)

processed_corpus = [preprocess(text) for text in corpus]

processed_corpus


['natural language processing part artificial intelligence',
 'machine learning deep learning subsets ai',
 'nlp uses machine learning techniques',
 'deep learning powerful nlp tasks']

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words: learn the vocabulary from the cleaned corpus and count how
# often each term occurs in each document.
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(processed_corpus)

# One row per document, one column per vocabulary term.
bow_df = pd.DataFrame(data=bow_matrix.toarray(),
                      columns=count_vectorizer.get_feature_names_out())

bow_df


Unnamed: 0,ai,artificial,deep,intelligence,language,learning,machine,natural,nlp,part,powerful,processing,subsets,tasks,techniques,uses
0,0,1,0,1,1,0,0,1,0,1,0,1,0,0,0,0
1,1,0,1,0,0,2,1,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,1
3,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0


In [7]:
# Turn raw counts into per-document term frequencies by dividing each row by
# its total count. A document with no vocabulary terms has a total of 0, and
# dividing by that would fill the row with NaN; dividing by 1 instead leaves
# the row as all zeros.
normalized_bow = bow_df.div(bow_df.sum(axis=1).replace(0, 1), axis=0)
normalized_bow


Unnamed: 0,ai,artificial,deep,intelligence,language,learning,machine,natural,nlp,part,powerful,processing,subsets,tasks,techniques,uses
0,0.0,0.166667,0.0,0.166667,0.166667,0.0,0.0,0.166667,0.0,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0
1,0.166667,0.0,0.166667,0.0,0.0,0.333333,0.166667,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.2
3,0.0,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.2,0.0,0.0


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF: fit on the same cleaned corpus used for the bag-of-words table.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus)

# One row per document, one column per vocabulary term.
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(),
                        columns=tfidf_vectorizer.get_feature_names_out())

tfidf_df


Unnamed: 0,ai,artificial,deep,intelligence,language,learning,machine,natural,nlp,part,powerful,processing,subsets,tasks,techniques,uses
0,0.0,0.408248,0.0,0.408248,0.408248,0.0,0.0,0.408248,0.0,0.408248,0.0,0.408248,0.0,0.0,0.0,0.0
1,0.453012,0.0,0.35716,0.0,0.0,0.578303,0.35716,0.0,0.0,0.0,0.0,0.0,0.453012,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.334067,0.41264,0.0,0.41264,0.0,0.0,0.0,0.0,0.0,0.523381,0.523381
3,0.0,0.0,0.41264,0.0,0.0,0.334067,0.0,0.0,0.41264,0.0,0.523381,0.0,0.0,0.523381,0.0,0.0


In [9]:
# Split each cleaned document back into a list of tokens; these token lists
# are what gets passed to Word2Vec as `sentences`.
tokenized_corpus = list(map(word_tokenize, processed_corpus))
tokenized_corpus


[['natural', 'language', 'processing', 'part', 'artificial', 'intelligence'],
 ['machine', 'learning', 'deep', 'learning', 'subsets', 'ai'],
 ['nlp', 'uses', 'machine', 'learning', 'techniques'],
 ['deep', 'learning', 'powerful', 'nlp', 'tasks']]

In [11]:
from gensim.models import Word2Vec

# Train word vectors on the tokenized corpus.
# Multi-threaded training processes jobs in a non-deterministic order, so the
# learned vectors can differ between runs. One worker plus an explicit seed
# keeps a Restart & Run All reproducible (for exact repeats across
# interpreter launches, PYTHONHASHSEED must be fixed as well).
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=50,  # length of each word vector
    window=5,
    min_count=1,     # keep every word; nothing in this tiny corpus repeats much
    workers=1,
    seed=1,          # gensim's default seed, made explicit
)


In [12]:
# Look up the trained 50-dimensional vector for the word "learning".
word2vec_model.wv["learning"]


array([-1.0729393e-03,  4.7271003e-04,  1.0208104e-02,  1.8019602e-02,
       -1.8605592e-02, -1.4234448e-02,  1.2918263e-02,  1.7946383e-02,
       -1.0030984e-02, -7.5273751e-03,  1.4760993e-02, -3.0677442e-03,
       -9.0746665e-03,  1.3108633e-02, -9.7204596e-03, -3.6306570e-03,
        5.7542399e-03,  1.9837192e-03, -1.6570160e-02, -1.8896744e-02,
        1.4623734e-02,  1.0140221e-02,  1.3516697e-02,  1.5253476e-03,
        1.2702212e-02, -6.8102744e-03, -1.8939794e-03,  1.1537288e-02,
       -1.5042618e-02, -7.8722741e-03, -1.5023078e-02, -1.8603110e-03,
        1.9076865e-02, -1.4639303e-02, -4.6668286e-03, -3.8753832e-03,
        1.6155630e-02, -1.1862010e-02,  9.1137852e-05, -9.5066773e-03,
       -1.9205842e-02,  1.0013717e-02, -1.7519865e-02, -8.7845018e-03,
       -7.1028109e-05, -5.9188530e-04, -1.5321815e-02,  1.9229794e-02,
        9.9641699e-03,  1.8467069e-02], dtype=float32)

In [13]:
# List the vocabulary words the model scores as most similar to "learning".
# NOTE(review): the model was trained on only four short sentences and the
# scores shown are all close to zero, so this ranking is presumably not
# meaningful — confirm on a larger corpus before drawing conclusions.
word2vec_model.wv.most_similar("learning")


[('language', 0.2105702906847),
 ('artificial', 0.16704507172107697),
 ('processing', 0.1501905620098114),
 ('uses', 0.1320112943649292),
 ('deep', 0.1267675906419754),
 ('subsets', 0.09985639154911041),
 ('nlp', 0.04236132651567459),
 ('part', 0.04067830368876457),
 ('powerful', 0.01243599783629179),
 ('intelligence', -0.012584488838911057)]

In [16]:
def document_embedding(doc, model=None):
    """Average the word vectors of ``doc`` into one document vector.

    Parameters
    ----------
    doc : iterable of str
        Tokens of the document. Tokens missing from the model vocabulary
        are ignored.
    model : optional
        Object exposing a ``wv`` mapping from word to vector with a
        ``vector_size`` attribute. Defaults to the notebook-level
        ``word2vec_model``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the in-vocabulary word vectors, or a zero
        vector of length ``vector_size`` when no token is in the vocabulary.
    """
    if model is None:
        model = word2vec_model
    keyed_vectors = model.wv
    vectors = [keyed_vectors[word] for word in doc if word in keyed_vectors]
    if not vectors:
        # np.mean of an empty list is NaN (with a RuntimeWarning); return a
        # well-defined zero vector instead so downstream math stays finite.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Embed the first document of the corpus as the mean of its word vectors.
doc_embedding = document_embedding(tokenized_corpus[0])
doc_embedding


array([ 0.00045587, -0.00846928,  0.00525375,  0.00124259, -0.01240102,
        0.00091135,  0.00447051, -0.00423807, -0.00955425, -0.00812436,
        0.00554483, -0.00305503,  0.00275594, -0.00015909, -0.00178209,
        0.00275119, -0.00223738,  0.00276977, -0.0077352 , -0.01319914,
       -0.0014146 ,  0.00527331,  0.00963619,  0.00276856, -0.00388846,
        0.00280395,  0.00035993, -0.00125929, -0.00166374,  0.00516055,
        0.00418978, -0.00339041, -0.00539889, -0.0053815 , -0.00339299,
       -0.002424  ,  0.00099349,  0.00302327, -0.00278685, -0.00287947,
        0.01144264, -0.00337562, -0.00665261, -0.00104969,  0.006179  ,
        0.0019327 , -0.00347674, -0.00811787,  0.00316158, -0.00293876],
      dtype=float32)