In [1]:
import gensim.downloader as api  # Import the gensim downloader API
import numpy.linalg as la  # Import for norm function
import pandas as pd  # Import for reading/writing the Excel sheets

# Pretrained word-embedding model used by word_similarity below.
# Choose a valid model name from https://radimrehurek.com/gensim/models/word2vec.html#pretrained-vectors
model_name = 'glove-twitter-25'  # Name of the pretrained GloVe model to use
# Download and load the chosen model (presumably cached locally after the first download - TODO confirm).
# `model` is read as a global by word_similarity, so this line must run before that function is called.
model = api.load(model_name)

def word_similarity(word1, word2):
    """
    Compute the cosine similarity between the embedding vectors of two words.

    The vectors are looked up in the module-level ``model`` through its
    ``key_to_index`` mapping and ``vectors`` array.

    Args:
        word1 (str): The first word.
        word2 (str): The second word.

    Returns:
        float: Cosine similarity in the range [-1, 1] (1 = same direction,
        0 = orthogonal, -1 = opposite direction). 0.0 is also returned when
        either word is missing from the model vocabulary or when either
        vector has zero length.
    """
    # Out-of-vocabulary words have no vector; report them as "no similarity"
    if word1 not in model.key_to_index or word2 not in model.key_to_index:
        return 0.0

    # Look up each word's row in the embedding matrix via its vocabulary index
    word1_vector = model.vectors[model.key_to_index[word1]]
    word2_vector = model.vectors[model.key_to_index[word2]]

    # A zero vector has no direction; guard so the division below cannot yield NaN
    norm_product = la.norm(word1_vector) * la.norm(word2_vector)
    if norm_product == 0:
        return 0.0

    # cos(theta) = (a . b) / (|a| * |b|); float() turns the NumPy scalar into
    # the plain Python float promised by the docstring
    return float(word1_vector.dot(word2_vector) / norm_product)


# Demo: score a single word pair
word1, word2 = "king", "queen"  # The two words to compare
similarity = word_similarity(word1, word2)  # Cosine similarity of the pair

# Report the score with four decimal places
print(f"Similarity between '{word1}' and '{word2}': {similarity:.4f}")


Similarity between 'king' and 'queen': 0.9202


In [None]:
""" This the automation of the above code
    It takes as input an Excel file containing the words to be compared.
    The output is stored in a separate Excel file."""


# Function to calculate similarity for each row in the Excel sheet
def calculate_similarity(input_file, output_file, word1_column='roberta', word2_column='Answer'):
    """
    Score every row of an Excel sheet with the similarity of two word columns.

    Reads ``input_file`` into a DataFrame, calls ``word_similarity`` on the
    pair of values found in ``word1_column`` and ``word2_column`` of each row,
    stores the scores in a new 'Similarity' column and writes the resulting
    sheet to ``output_file``.

    Args:
        input_file (str): Path of the Excel file to read.
        output_file (str): Path of the Excel file to write.
        word1_column (str): Column holding the first word of each pair.
            Defaults to 'roberta'.
        word2_column (str): Column holding the second word of each pair.
            Defaults to 'Answer'.

    Raises:
        KeyError: If either word column is missing from the input sheet.
    """
    # Read the input Excel file into a DataFrame
    df = pd.read_excel(input_file)

    # Fail early with a descriptive message rather than a bare KeyError on the first row
    missing = [col for col in (word1_column, word2_column) if col not in df.columns]
    if missing:
        raise KeyError(
            f"Column(s) {missing} not found in {input_file}; "
            f"available columns: {list(df.columns)}"
        )

    # Walk the two columns in lockstep; this avoids building a Series per row
    # as df.iterrows() does, while scoring the rows in the same order
    df['Similarity'] = [
        word_similarity(first_word, second_word)
        for first_word, second_word in zip(df[word1_column], df[word2_column])
    ]

    # Save the scored DataFrame to the output Excel file, without the index column
    df.to_excel(output_file, index=False)
    print("Similarity calculation completed. Results saved to", output_file)

# Demo: run the batch scoring on a spreadsheet
input_file = "/content/t5 vs roberta.xlsx"  # Spreadsheet holding the word pairs
output_file = "output.xlsx"  # Destination for the scored copy
calculate_similarity(input_file=input_file, output_file=output_file)  # Score every row and save the result


Similarity calculation completed. Results saved to output.xlsx
