**Foundations
Natural Language Processing (NLP):**

In [None]:
# Sample code for basic text processing with NLTK
import nltk
nltk.download('punkt')  # fetch the Punkt tokenizer data before tokenizing below
from nltk.tokenize import word_tokenize

# Split one sample sentence into tokens and show them
text = "Natural Language Processing is fascinating!"
tokens = word_tokenize(text)
print(tokens)  # punctuation such as '!' comes out as its own token


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Natural', 'Language', 'Processing', 'is', 'fascinating', '!']


# Basic Text Processing

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('sample.csv')  # read with read_csv defaults; the code below expects a 'text' column

# Lowercase the text column with the vectorized .str accessor
df['text'] = df['text'].str.lower()

# Alternatively, using apply with a lambda function
# df['text'] = df['text'].apply(lambda x: x.lower())

# Remove HTML Tags using Regex
import re

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span stripped out.

    The match is non-greedy, so each tag is removed on its own and the text
    between tags is kept.
    """
    return re.compile(r'<.*?>').sub('', text)

# Strip HTML tags from every row of the text column
df['text'] = df['text'].apply(remove_html_tags)

# Display a preview of the modified DataFrame; printing the whole frame
# floods the notebook output.
print(df.head())


# Remove URLs
def remove_url(text):
    """Return ``text`` with http(s):// links and www.-prefixed links deleted."""
    return re.sub(r'https?://\S+|www\.\S+', '', text)
# Series.apply returns a new Series, so assign it back; otherwise the
# URL-stripped text is computed and immediately thrown away.
df['text'] = df['text'].apply(remove_url)


#Remove Punctuation
def remove_punct(text, exclude=None):
    """Return ``text`` with every entry of ``exclude`` removed.

    Args:
        text: The string to clean.
        exclude: Iterable of substrings to delete from ``text``. Defaults to
            ``string.punctuation`` (the ASCII punctuation characters).

    Returns:
        The cleaned string.
    """
    if exclude is None:
        # The previous body read a global `exclude` that this notebook never
        # defines, so every call raised NameError.
        import string
        exclude = string.punctuation
    for char in exclude:
        text = text.replace(char, '')
    return text


# Lookup table: UPPERCASE chat abbreviation -> replacement text.
# It is empty here; presumably entries are filled in before use.
chat_words = {}

# Chat conversion function
def chat_conversion(text):
    """Replace each whitespace-separated word that has an entry in ``chat_words``.

    The lookup uses the upper-cased word; words without an entry are kept
    unchanged. The pieces are re-joined with single spaces.
    """
    return ' '.join(chat_words.get(word.upper(), word) for word in text.split())




    tweet_id        author_id  inbound                      created_at  \
0     119237           105834     True  Wed Oct 11 06:55:44 +0000 2017   
1     119238     ChaseSupport    False  Wed Oct 11 13:25:49 +0000 2017   
2     119239           105835     True  Wed Oct 11 13:00:09 +0000 2017   
3     119240     VirginTrains    False  Tue Oct 10 15:16:08 +0000 2017   
4     119241           105836     True  Tue Oct 10 15:17:21 +0000 2017   
5     119243     VirginTrains    False  Tue Oct 10 15:25:14 +0000 2017   
6     119244           105836     True  Tue Oct 10 15:26:44 +0000 2017   
7     119245     VirginTrains    False  Tue Oct 10 15:33:22 +0000 2017   
8     119242           105836     True  Tue Oct 10 15:09:00 +0000 2017   
9     119246     VirginTrains    False  Tue Oct 10 10:13:19 +0000 2017   
10    119248     AppleSupport    False  Wed Oct 11 13:38:29 +0000 2017   
11    119249           105837     True  Wed Oct 11 07:37:27 +0000 2017   
12    119250           105838     True

In [None]:
#Spell Correction
from textblob import TextBlob
incorrect_text = 'any tezt with for checing'  # sample input containing misspellings
textblob = TextBlob(incorrect_text)
# Last expression of the cell, so the corrected TextBlob is shown as the cell output
textblob.correct()

TextBlob("any test with for checking")

In [None]:
#Remove StopWords
import nltk
from nltk.corpus import stopwords

# The stopwords corpus must be downloaded before it can be read; without this
# the lookup below raises LookupError on a fresh kernel.
nltk.download('stopwords')
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

**Data Preparation
How to Clean Text Manually and with NLTK:**

In [None]:
# Sample code for text cleaning with NLTK
from nltk.corpus import stopwords
# NOTE(review): the cleaning code in the next cell uses the stopword list and
# word_tokenize, not WordNet — confirm this is the intended download.
# `nltk` itself is imported in an earlier cell.
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# English stopwords as a set, for fast membership checks in clean_text
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Tokenize ``text`` and keep only alphabetic, non-stopword tokens.

    Kept tokens are lower-cased and returned as one space-separated string.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept.append(lowered)
    return ' '.join(kept)

# Quick demonstration on a sample sentence
cleaned_text = clean_text("This is a sample text for cleaning.")
print(cleaned_text)


sample text cleaning


# Text Pre-Processing

In [None]:
import nltk
# Fetch the NLTK resources used in this cell, in the same order as before
for resource in ('wordnet', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from nltk.corpus import wordnet

def _show_section(title, value):
    """Print a section title, the value beneath it, and a trailing blank line."""
    print(title)
    print(value)
    print()


# Sample text for demonstration
sample_text = "The quick brown fox jumps over the lazy dog. The dog barks loudly."

# Tokenization
tokens = word_tokenize(sample_text)
_show_section("Tokenization:", tokens)

# Frequency Distribution of Words
fdist = FreqDist(tokens)
_show_section("Frequency Distribution of Words:", fdist)

# Filtering Stop Words (case-insensitive comparison, original casing kept)
stop_words = set(stopwords.words('english'))
filtered_tokens = []
for word in tokens:
    if word.lower() not in stop_words:
        filtered_tokens.append(word)
_show_section("Filtering Stop Words:", filtered_tokens)

# Stemming (applied to all tokens, not the filtered ones)
ps = PorterStemmer()
stemmed_tokens = list(map(ps.stem, tokens))
_show_section("Stemming:", stemmed_tokens)

# Lemmatization (applied to all tokens, lemmatizer defaults)
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, tokens))
_show_section("Lemmatization:", lemmatized_tokens)

# Parts of Speech (POS) Tagging
pos_tags = pos_tag(tokens)
_show_section("Parts of Speech (POS) Tagging:", pos_tags)

# Name Entity Recognition (NER), run on the POS-tagged tokens
ner_result = ne_chunk(pos_tags)
_show_section("Name Entity Recognition (NER):", ner_result)

# WordNet Usage: collect every lemma of every synset of every token once,
# then derive the synonym and antonym lists from that single pass.
all_lemmas = [
    lemma
    for word in tokens
    for syn in wordnet.synsets(word)
    for lemma in syn.lemmas()
]
synonyms = [lemma.name() for lemma in all_lemmas]
# Only the first antonym of a lemma is recorded, and only when it has one.
antonyms = [lemma.antonyms()[0].name() for lemma in all_lemmas if lemma.antonyms()]

# Sets are printed so that each name appears once
print("WordNet Synonyms:", set(synonyms), sep="\n")
print("WordNet Antonyms:", set(antonyms), sep="\n")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Tokenization:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'The', 'dog', 'barks', 'loudly', '.']

Frequency Distribution of Words:
<FreqDist with 12 samples and 15 outcomes>

Filtering Stop Words:
['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', '.', 'dog', 'barks', 'loudly', '.']

Stemming:
['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.', 'the', 'dog', 'bark', 'loudli', '.']

Lemmatization:
['The', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.', 'The', 'dog', 'bark', 'loudly', '.']

Parts of Speech (POS) Tagging:
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ'), ('loudly', 'RB'), ('.', '.')]

Name Entity Recognition (NER):
(S
  The/DT
  quick/JJ
  brown/NN
  fox/NN
  jumps/VBZ
  over/IN
  the/DT
  lazy/JJ
  dog/NN
  ./.
  The/DT
  dog/NN
  barks/VBZ
  lo

# **Bag-of-Words**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Build the vectorizer, then learn the vocabulary and count terms in one step
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Vocabulary terms learned from the documents
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the sparse count matrix, easier to read when printed
dense_matrix = X.toarray()

# Show the vocabulary followed by the per-document counts
print("Feature Names (Unique Words):", feature_names, sep="\n")
print("\nBag-of-Words Representation:", dense_matrix, sep="\n")


Feature Names (Unique Words):
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

Bag-of-Words Representation:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [None]:
from transformers import BertTokenizer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode each document as a list of token ids (special tokens included)
tokenized_documents = [tokenizer.encode(doc, add_special_tokens=True) for doc in documents]

# Flatten the per-document id lists into a single list
flat_tokens = []
for doc_ids in tokenized_documents:
    flat_tokens.extend(doc_ids)

# Distinct token ids found in the corpus
unique_tokens = list(set(flat_tokens))

# Map each token id to its column in the matrix
word_index = dict(zip(unique_tokens, range(len(unique_tokens))))

# One row per document, one column per distinct token id
bow_matrix = np.zeros((len(documents), len(unique_tokens)))

for row, doc_ids in enumerate(tokenized_documents):
    for token_id in doc_ids:
        # Every id has a column: word_index was built from these same documents.
        bow_matrix[row, word_index[token_id]] += 1

# Show the token ids followed by the count matrix
print("Unique Tokens:", unique_tokens, sep="\n")
print("\nBag-of-Words Matrix:", bow_matrix, sep="\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Unique Tokens:
[101, 102, 2023, 2117, 1029, 1996, 2028, 6254, 1998, 2353, 2034, 2003, 1012]

Bag-of-Words Matrix:
[[1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 0. 2. 0. 0. 0. 1. 1.]
 [1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1.]
 [1. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0.]]


# Word Embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Sample words
words = ["apple", "banana", "orange", "grape", "pineapple"]

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Encode all words as one padded batch and run the model with gradient
# tracking disabled
tokenized_input = tokenizer(words, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    output = model(**tokenized_input)

# Keep only the hidden state at position 0 of each sequence (the [CLS] token),
# i.e. one vector per input word rather than the vector of the word's own token
word_embeddings = output.last_hidden_state[:, 0, :].numpy()

# Display each word next to its vector
for word, embedding in zip(words, word_embeddings):
    print(f"{word}: {embedding}")


# Importing Packages

In [None]:
# Import necessary libraries
import nltk
nltk.download('punkt')
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
import pandas as pd

# Additional imports for clustering and visualization
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Import for cosine similarity calculation and topic modeling
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from gensim import corpora

# Set every listed pandas display option to None so DataFrames are shown
# without row, column, width or cell-width limits
for display_option in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Text preprocessing using NLTK
nltk.download('stopwords')  # make the stopword lists available locally
stop_words = set(stopwords.words('english'))  # set of English stopwords

def preprocess_text(text):
    """Tokenize ``text`` and keep only alphanumeric, non-stopword tokens.

    Tokens that are not purely alphanumeric are dropped whole (they are not
    stripped of symbols). Kept tokens are lower-cased and joined with spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        lowered = token.lower()
        if token.isalnum() and lowered not in stop_words:
            kept.append(lowered)
    return " ".join(kept)


def vectorizer(file):
  """Turn the questions held in ``file`` into a TF-IDF matrix.

  Args:
    file: Object indexable by 'interview_question' whose ``.values.tolist()``
      yields the question strings (presumably a pandas DataFrame — confirm
      against the caller).

  Returns:
    A ``(tfidf_matrix, question_list)`` tuple: the matrix fitted on the
    preprocessed questions, and the original, unprocessed questions.
  """
  question_list = file['interview_question'].values.tolist()
  # Clean every question with preprocess_text before vectorizing
  cleaned_questions = list(map(preprocess_text, question_list))
  tfidf_matrix = TfidfVectorizer().fit_transform(cleaned_questions)
  return tfidf_matrix, question_list

def cluster_formation(tfidf_matrix,question_list):
  """Cluster the TF-IDF rows with DBSCAN and pair each question with its label.

  Args:
    tfidf_matrix: Sparse matrix exposing ``toarray()``, one row per question.
    question_list: The questions, in the same order as the matrix rows.

  Returns:
    DataFrame with columns "Question" and "DBSCAN Cluster".
  """
  dense_vectors = tfidf_matrix.toarray()
  # min_samples=1 means a single point is enough to form a cluster
  dbscan_clusters = DBSCAN(eps=0.5, min_samples=1).fit_predict(dense_vectors)
  return pd.DataFrame({"Question": question_list, "DBSCAN Cluster": dbscan_clusters})


def fetch_rows_to_be_dropped(file):
  """Return positions of rows whose DBSCAN cluster already appeared earlier.

  The questions in ``file`` are vectorized and clustered; the first row of
  each cluster is kept and every later row with the same cluster label is
  reported.

  Args:
    file: Input passed straight to ``vectorizer``.

  Returns:
    List of 0-based row positions, in ascending order.
  """
  matrix,ques=vectorizer(file)
  clusters_df=cluster_formation(matrix,ques)

  # A set gives constant-time membership checks; the previous list made the
  # scan quadratic in the number of rows.
  seen_clusters=set()
  rows=[]
  for position,cluster_id in enumerate(clusters_df["DBSCAN Cluster"]):
    if cluster_id in seen_clusters:
      rows.append(position)
    else:
      seen_clusters.add(cluster_id)
  return rows


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import spacy
from sklearn.datasets import fetch_20newsgroups


In [None]:
# Load the dataset: training subset of 20 Newsgroups via scikit-learn
newsgroups = fetch_20newsgroups(subset="train")

In [None]:
# Instantiate the spaCy model ("en_core_web_sm" must already be installed)
nlp = spacy.load("en_core_web_sm")

In [None]:
# Example NLP tasks: run the pipeline on the first post; the cells below inspect `doc`
doc = nlp(newsgroups.data[0])

In [None]:
# Tokenization: print the text of every token, one per line
print("Tokens:")
for tok in doc:
    print(tok.text)

In [None]:
# Sentence Segmentation: print each detected sentence span
print("\nSentences:")
for sentence in doc.sents:
    print(sentence.text)


Sentences:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines:
15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day.
It was a 2-door sports car, looked to be from the late 60s/
early 70s.
It was called a Bricklin.
The doors were really small.
In addition,
the front bumper was separate from the rest of the body.
This is 
all I know.
If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [None]:
# Part-of-Speech (POS) Tagging: token text followed by its coarse POS label
print("\nPOS Tags:")
for tok in doc:
    print(tok.text, tok.pos_)


POS Tags:
From ADP
: PUNCT
lerxst@wam.umd.edu PROPN
( PUNCT
where SCONJ
's AUX
my PRON
thing NOUN
) PUNCT

 SPACE
Subject PROPN
: PUNCT
WHAT DET
car NOUN
is AUX
this PRON
! PUNCT
? PUNCT

 SPACE
Nntp NOUN
- PUNCT
Posting NOUN
- PUNCT
Host PROPN
: PUNCT
rac3.wam.umd.edu NOUN

 SPACE
Organization PROPN
: PUNCT
University PROPN
of ADP
Maryland PROPN
, PUNCT
College PROPN
Park PROPN

 SPACE
Lines NOUN
: PUNCT
15 NUM


  SPACE
I PRON
was AUX
wondering VERB
if SCONJ
anyone PRON
out ADV
there ADV
could AUX
enlighten VERB
me PRON
on ADP
this DET
car NOUN
I PRON
saw VERB

 SPACE
the DET
other ADJ
day NOUN
. PUNCT
It PRON
was AUX
a DET
2 NUM
- PUNCT
door NOUN
sports NOUN
car NOUN
, PUNCT
looked VERB
to PART
be AUX
from ADP
the DET
late ADJ
60s/ NUM

 SPACE
early ADJ
70s NOUN
. PUNCT
It PRON
was AUX
called VERB
a DET
Bricklin PROPN
. PUNCT
The DET
doors NOUN
were AUX
really ADV
small ADJ
. PUNCT
In ADP
addition NOUN
, PUNCT

 SPACE
the DET
front ADJ
bumper NOUN
was AUX
separate ADJ
from ADP
the 

In [None]:
# Named Entity Extraction: entity text followed by its label
print("\nNamed Entities:")
for entity in doc.ents:
    print(entity.text, entity.label_)


Named Entities:
lerxst@wam.umd.edu PERSON
Organization: University of Maryland ORG
College Park
Lines GPE
15 CARDINAL
the other day DATE
2 CARDINAL
early 70s DATE
Bricklin GPE
years DATE
Lerxst PERSON


In [None]:
# Chunking: print the text of every noun chunk
print("\nNoun Chunks:")
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text)


Noun Chunks:
lerxst@wam.umd.edu
my thing
Subject
WHAT car
this
Nntp-Posting-Host
rac3.wam.umd.edu
Organization
University
Maryland
College Park
Lines
I
anyone
me
this car
I
the other day
It
a 2-door sports car
early 70s
It
a Bricklin
The doors
addition
the front bumper
the rest
the body
This
all
I
anyone
a model name
engine specs
years
production
this car
history
whatever info
you
this funky looking car
e
-
mail
Thanks
- IL
you
your neighborhood Lerxst


In [None]:
# Parsing: token text, its dependency label, then its head's text and POS
print("\nDependency Parsing:")
for tok in doc:
    head = tok.head
    print(tok.text, tok.dep_, head.text, head.pos_)

In [None]:
import nltk
# Fetch the NLTK data packages used by the cells that follow, in the same order
for resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Sample text
text = "The quick brown fox jumps over the lazy dog. It was a bright cold day in April, and the clocks were striking thirteen."

# Tokenization at sentence level and at word level
sentences = sent_tokenize(text)
tokens = word_tokenize(text)

# Print word tokens first, then sentences, each behind its label
for label, value in (("Tokens:", tokens), ("Sentences:", sentences)):
    print(label, value)



Tokens: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', 'It', 'was', 'a', 'bright', 'cold', 'day', 'in', 'April', ',', 'and', 'the', 'clocks', 'were', 'striking', 'thirteen', '.']
Sentences: ['The quick brown fox jumps over the lazy dog.', 'It was a bright cold day in April, and the clocks were striking thirteen.']


In [None]:
# Stop words removal (case-insensitive comparison, original casing kept)
stop_words = set(stopwords.words('english'))
filtered_tokens = []
for word in tokens:
    if word.lower() not in stop_words:
        filtered_tokens.append(word)
print("Filtered Tokens:", filtered_tokens)

# Stemming of the tokens that survived the stop-word filter
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("Stemmed Tokens:", stemmed_tokens)



Filtered Tokens: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', '.', 'bright', 'cold', 'day', 'April', ',', 'clocks', 'striking', 'thirteen', '.']
Stemmed Tokens: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '.', 'bright', 'cold', 'day', 'april', ',', 'clock', 'strike', 'thirteen', '.']


In [None]:
import nltk
nltk.download('omw-1.4')
# Normalization: lower-case the stemmed tokens
# NOTE(review): the printed stemmed tokens are already lower-case, so this
# step looks like a no-op on this input — confirm the intended source list.
lowercase_tokens = list(map(str.lower, stemmed_tokens))
print("Lowercase Tokens:", lowercase_tokens)

# Lemmatization of the stop-word-filtered (unstemmed) tokens
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
print("Lemmatized Tokens:", lemmatized_tokens)


[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Lowercase Tokens: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '.', 'bright', 'cold', 'day', 'april', ',', 'clock', 'strike', 'thirteen', '.']
Lemmatized Tokens: ['quick', 'brown', 'fox', 'jump', 'lazy', 'dog', '.', 'bright', 'cold', 'day', 'April', ',', 'clock', 'striking', 'thirteen', '.']


In [None]:
# Parts of speech tagging on the full token list (stop words and punctuation included)
pos_tags = pos_tag(tokens)
print("POS Tags:", pos_tags)

POS Tags: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.'), ('It', 'PRP'), ('was', 'VBD'), ('a', 'DT'), ('bright', 'JJ'), ('cold', 'JJ'), ('day', 'NN'), ('in', 'IN'), ('April', 'NNP'), (',', ','), ('and', 'CC'), ('the', 'DT'), ('clocks', 'NNS'), ('were', 'VBD'), ('striking', 'VBG'), ('thirteen', 'NN'), ('.', '.')]


How to Develop Word Embeddings with Gensim:





In [None]:
# Sample code for training word embeddings with Gensim
from gensim.models import Word2Vec

# Example sentences (each one already split into tokens)
sentences = [
    ["this", "is", "a", "sentence"],
    ["another", "sentence"],
]

# Training the Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

word = 'word'

# Look the vector up only when the word is in the trained vocabulary
if word not in model.wv:
    print(f"The word '{word}' is not present in the vocabulary.")
else:
    word_vector = model.wv[word]
    print(f"Word vector for '{word}': {word_vector}")


The word 'word' is not present in the vocabulary.
