In [20]:
import nltk
import re
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [21]:
# Sample text used as the toy corpus; written one sentence per source line
# (adjacent literals are concatenated by the parser into a single string).
paragraph = (
    "Natural Language Processing (NLP) tasks encompass a wide range of applications aimed at understanding and processing human language data. "
    "From sentiment analysis to machine translation, NLP tasks play a pivotal role in enabling machines to comprehend and generate human language. "
    "One key NLP task involves Named Entity Recognition (NER), where the goal is to identify and classify named entities such as people, organizations, locations, dates, and more within a given text. "
    "NER is crucial for information extraction, search engines, and various other applications. "
    "By accurately identifying named entities, NER systems help extract valuable information from unstructured text data, enabling further analysis and decision-making. "
    "Through advanced machine learning algorithms and linguistic techniques, NER systems can achieve high precision and recall, making them indispensable tools for extracting structured information from vast amounts of textual data."
)

In [22]:
# Clean the paragraph sentence by sentence: keep letters only, lowercase,
# drop English stop words, lemmatize, and store one space-joined string per
# sentence in `corpus`.
# NOTE(review): presumably requires the NLTK 'punkt', 'stopwords' and 'wordnet'
# resources to be downloaded already — confirm for a fresh environment.
wordnet = WordNetLemmatizer()

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# comprehension re-evaluated it for every single word, and membership tests
# against a list are linear; a set gives the same answers in constant time.
stop_words = set(stopwords.words('english'))

corpus = []

sentence = nltk.sent_tokenize(paragraph)

for raw_sentence in sentence:
    # Replace every non-letter with a space so punctuation and digits vanish,
    # then normalise case and split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_sentence).lower().split()
    review = ' '.join(
        wordnet.lemmatize(word) for word in tokens if word not in stop_words
    )
    corpus.append(review)

In [23]:
# Display the cleaned corpus: one lowercased, stop-word-free, lemmatized string per sentence.
corpus

['natural language processing nlp task encompass wide range application aimed understanding processing human language data',
 'sentiment analysis machine translation nlp task play pivotal role enabling machine comprehend generate human language',
 'one key nlp task involves named entity recognition ner goal identify classify named entity people organization location date within given text',
 'ner crucial information extraction search engine various application',
 'accurately identifying named entity ner system help extract valuable information unstructured text data enabling analysis decision making',
 'advanced machine learning algorithm linguistic technique ner system achieve high precision recall making indispensable tool extracting structured information vast amount textual data']

In [33]:
# Train Word2Vec model.
# Word2Vec expects an iterable of sentences where each sentence is a list of
# tokens. `corpus` holds one space-joined string per sentence; passing those
# strings directly would make the model iterate them character by character
# and learn a vocabulary of single letters (so model.wv['nlp'] would fail).
# Split every sentence back into words before training.
tokenized_corpus = [sentence_text.split() for sentence_text in corpus]

# min_count=1 keeps every word: words whose total frequency is lower than
# min_count are ignored. vector_size is the dimensionality of each word vector.
model = Word2Vec(tokenized_corpus, min_count=1, vector_size=100)

In [34]:
# Show every key the trained model knows, in the order stored by the model.
vocabulary = model.wv.index_to_key
print("Vocabulary:", vocabulary)

Vocabulary: ['ner', 'data', 'language', 'nlp', 'task', 'information', 'machine', 'entity', 'named', 'human', 'system', 'making', 'analysis', 'enabling', 'text', 'application', 'processing', 'generate', 'recognition', 'goal', 'involves', 'key', 'one', 'textual', 'translation', 'comprehend', 'role', 'pivotal', 'play', 'classify', 'sentiment', 'understanding', 'aimed', 'range', 'wide', 'encompass', 'identify', 'location', 'people', 'decision', 'vast', 'structured', 'extracting', 'tool', 'indispensable', 'recall', 'precision', 'high', 'achieve', 'technique', 'linguistic', 'algorithm', 'learning', 'advanced', 'unstructured', 'organization', 'valuable', 'extract', 'help', 'identifying', 'accurately', 'various', 'engine', 'search', 'extraction', 'crucial', 'given', 'within', 'date', 'amount', 'natural']


In [35]:
# Look up the learned embedding for a single vocabulary entry.
keyed_vectors = model.wv
word_vector = keyed_vectors['nlp']
print("Vector representation of 'nlp':", word_vector)

Vector representation of 'nlp': [-8.2355151e-03  9.2978338e-03 -1.9957332e-04 -1.9653966e-03
  4.6042944e-03 -4.0997970e-03  2.7476645e-03  6.9402982e-03
  6.0647279e-03 -7.5113252e-03  9.3847588e-03  4.6674493e-03
  3.9651403e-03 -6.2321611e-03  8.4552076e-03 -2.1526464e-03
  8.8279434e-03 -5.3664022e-03 -8.1329467e-03  6.8238406e-03
  1.6732425e-03 -2.1887177e-03  9.5216343e-03  9.4913086e-03
 -9.7724926e-03  2.5007348e-03  6.1534229e-03  3.8736451e-03
  2.0154754e-03  4.2987103e-04  6.9201383e-04 -3.8225681e-03
 -7.1394327e-03 -2.0975021e-03  3.9236234e-03  8.8142520e-03
  9.2565268e-03 -5.9713172e-03 -9.4056912e-03  9.7533856e-03
  3.4162311e-03  5.1639555e-03  6.2743747e-03 -2.8054365e-03
  7.3266625e-03  2.8337566e-03  2.8635475e-03 -2.3820915e-03
 -3.1220356e-03 -2.3655412e-03  4.2762361e-03  7.2884570e-05
 -9.5907124e-03 -9.6783927e-03 -6.1411401e-03 -1.2653318e-04
  1.9998190e-03  9.4300117e-03  5.5796970e-03 -4.2910534e-03
  2.7187349e-04  4.9597253e-03  7.7080247e-03 -1.1405

In [36]:
# Ask the trained vectors for the five entries closest to 'nlp';
# the result is a list of (word, similarity score) pairs.
keyed_vectors = model.wv
similar_words = keyed_vectors.most_similar('nlp', topn=5)
print("Words similar to 'nlp':", similar_words)

Words similar to 'nlp': [('text', 0.17861540615558624), ('identify', 0.16389790177345276), ('recall', 0.1499251425266266), ('valuable', 0.13510604202747345), ('information', 0.13150909543037415)]
