In [4]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Sample vocabulary
vocabulary = ['apple', 'banana', 'orange', 'fruit', 'sweet']

# OneHotEncoder from sklearn to fit on the vocabulary
encoder = OneHotEncoder(sparse=False)
one_hot_encoded = encoder.fit_transform(np.array(vocabulary).reshape(-1, 1))

# Function to convert a word to one-hot encoded vector
def word_to_one_hot(word):
    index = vocabulary.index(word)
    return one_hot_encoded[index]

# Example
word_to_one_hot('apple')  # This will return the one-hot encoded vector for 'apple'


array([1., 0., 0., 0., 0.])

In [5]:
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Sample sentences
sentences = ["Apple and banana are fruits", "Fruits like apple and orange are sweet"]

# Tokenizing words
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Training Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, vector_size=50, window=5, min_count=1, workers=4)

# Function to get vector for a word
def get_word_vector(word):
    return model.wv[word]

# Example
get_word_vector('apple')  # This will return the 50-dimensional vector for 'apple'


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sevan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


array([ 1.56351421e-02, -1.90203730e-02, -4.11062239e-04,  6.93839323e-03,
       -1.87794445e-03,  1.67635437e-02,  1.80215668e-02,  1.30730132e-02,
       -1.42324204e-03,  1.54208085e-02, -1.70686692e-02,  6.41421322e-03,
       -9.27599426e-03, -1.01779103e-02,  7.17923651e-03,  1.07406788e-02,
        1.55390287e-02, -1.15330126e-02,  1.48667218e-02,  1.32509926e-02,
       -7.41960062e-03, -1.74912829e-02,  1.08749345e-02,  1.30195115e-02,
       -1.57510047e-03, -1.34197120e-02, -1.41718509e-02, -4.99412045e-03,
        1.02865072e-02, -7.33047491e-03, -1.87401194e-02,  7.65347946e-03,
        9.76895820e-03, -1.28571270e-02,  2.41711619e-03, -4.14975407e-03,
        4.88066689e-05, -1.97670180e-02,  5.38400887e-03, -9.50021297e-03,
        2.17529293e-03, -3.15244915e-03,  4.39334614e-03, -1.57631524e-02,
       -5.43436781e-03,  5.32639725e-03,  1.06933638e-02, -4.78302967e-03,
       -1.90201886e-02,  9.01175756e-03], dtype=float32)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(word1, word2):
    vector1 = get_word_vector(word1).reshape(1, -1)
    vector2 = get_word_vector(word2).reshape(1, -1)
    return cosine_similarity(vector1, vector2)[0][0]

# Example
calculate_similarity('apple', 'banana')


0.044917308

In [15]:
import pandas as pd
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

# Load the dataset
data = pd.read_csv(r"C:\Users\sevan\Desktop\IIIT-H\SimLex-999\SimLex-999\SimLex-999.txt", delimiter='\t')

# Extract words and their pairs
word_pairs = data[['word1', 'word2']].values

# Flatten the list of pairs and tokenize
all_words = [word for pair in word_pairs for word in pair]

# You might want to process the list to handle duplicates, etc.
sentences = [word_tokenize(" ".join(set(all_words)))]

# Training the Word2Vec model
model = Word2Vec(sentences=sentences, vector_size=50, window=5, min_count=1, workers=4)

# Function to get vector for a word
def get_word_vector(word):
    return model.wv[word]

# Function to calculate similarity between two word vectors
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(word1, word2):
    vector1 = get_word_vector(word1).reshape(1, -1)
    vector2 = get_word_vector(word2).reshape(1, -1)
    return cosine_similarity(vector1, vector2)[0][0]

# Example usage
print(calculate_similarity('fast', 'rapid'))


0.17888929


In [16]:
# Example usage
print(calculate_similarity('fast', 'rapid'))

0.41072547


In [18]:
# Example usage
print(calculate_similarity('apple', 'juice'))

-0.18896222


In [11]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

# Sample text data (you would replace this with your corpus)
sample_text = "Hello, my name is Sevanth Gajula. I'm currently in my final year of my undergraduation. I want to now work on LLM's. The last thing I want to do is sleep."

# Tokenization
tokens = [word_tokenize(doc.lower()) for doc in nltk.sent_tokenize(sample_text)]

# Training a Word2Vec model
model = Word2Vec(tokens, vector_size=100, window=5, min_count=1, workers=4)

# Getting word vectors
vector = model.wv['currently']  # replace 'example' with any word

print(vector)

# Calculating similarity between two words
similarity_score = model.wv.similarity('last', 'final')  # replace 'word1' and 'word2' with actual words
print(similarity_score)

[-0.00949705  0.00957051 -0.00776096 -0.00263575 -0.00490457 -0.00497958
 -0.00801145 -0.00776483 -0.0045593  -0.00129684 -0.00509552  0.00613002
 -0.00952529 -0.00530443  0.00943686  0.00699585  0.00769379  0.00424049
  0.00049711 -0.00599661  0.00603068  0.00263671  0.0077129   0.00638569
  0.00793799  0.00866407 -0.00990478 -0.00674524  0.00133512  0.0064464
  0.00737509  0.0055081   0.00766372 -0.00514982  0.00658349 -0.00411261
 -0.0090392   0.00914917  0.00133302 -0.00275518 -0.00246698 -0.00422915
  0.00480578  0.0044081  -0.00264758 -0.00734272 -0.00357629 -0.00033629
  0.00611082 -0.00282893 -0.00012132  0.00087327 -0.00709177  0.00205667
 -0.00143795  0.00280997  0.00485269 -0.00134754 -0.00278216  0.0077516
  0.00504399  0.00671702  0.00451951  0.00867088  0.00747768 -0.0010623
  0.00875303  0.00461498  0.00543825 -0.00138768 -0.00203692 -0.00442776
 -0.00850421  0.00303949  0.00888956  0.00891639 -0.00193567  0.00609334
  0.00378064 -0.00431056  0.00202129 -0.00543464  0.00

In [12]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

# Sample text data
sample_texts = [
    "In my final year of university, the last semester was the most challenging.",
    "The final decision rests with the last committee meeting.",
    "The last time I saw him, it was at the final match of the season."
]

# Tokenization
tokens = [word_tokenize(doc.lower()) for doc in sample_texts]

# Training a Word2Vec model
model = Word2Vec(tokens, vector_size=100, window=5, min_count=1, workers=4)

# Calculating similarity scores
similar_context_score = model.wv.similarity('last', 'final')
different_context_score = model.wv.similarity('last', 'committee')
contrasting_contexts_score = model.wv.similarity('last', 'time')

print("Similar Context Score:", similar_context_score)
print("Different Context Score:", different_context_score)
print("Contrasting Contexts Score:", contrasting_contexts_score)


Similar Context Score: -0.013516951
Different Context Score: -0.042464238
Contrasting Contexts Score: 0.031036329
