References:
- https://radimrehurek.com/gensim/downloader.html
- https://www.nltk.org/howto/gensim.html

# Install and import
After installing `gensim`, restart the session via Runtime > Restart session.

In [None]:
!pip install gensim
# !pip install scipy
# !pip install numpy

In [None]:
# !pip show gensim | grep Version # 4.3.3
# !pip show numpy | grep Version # 1.26.4
# !pip show scipy | grep Version  # 1.13.1

In [None]:
import gensim.downloader as api # If this gives you an error, Runtime > Restart session
from scipy.spatial.distance import cosine
from google.colab import files

# Setup variable and model names

In [None]:
# TODO: Replace the value below with YOUR OWN name, formatted as "LASTNAME_FIRSTNAME"
YOUR_NAME = "KIM_SANGHEE"

In [None]:
# Base name (without extension) of the output file; ".txt" is appended when the file is opened
# TODO: Switch to the name that matches the task you are running
_FILE_NAME = "similarity-scores"
# _FILE_NAME = "unRelated-words"

In [None]:
# Load pre-trained model
# The selected name is passed to gensim's downloader (`api.load`) and is also
# written into the header of each output file.

# TODO: Choose one of the three models:
model_name = "glove-twitter-25"
# model_name = "glove-wiki-gigaword-50"
# model_name = "word2vec-google-news-300"

model = api.load(model_name)

# Task 1. Calculate similarity score

In [None]:
# Quick sanity check: ask the loaded model for the words most similar to 'queen'
model.most_similar('queen')

In [None]:
# Choose the two words to compare
word1, word2 = 'queen', 'princess'
# word1, word2 = 'cold', 'hot' # Surprise!

# Look up the embedding of each word
vector1 = model[word1]
vector2 = model[word2]

# Show both embeddings
print(f"Embeddings of '{word1}': {vector1} \n")
print(f"Embeddings of '{word2}': {vector2} \n")

# SciPy's `cosine` is a *distance*, so convert it back to a similarity
cos_distance = cosine(vector1, vector2) # distance = 1 - cosine_similarity; range [0, 2]
cos_similarity = 1 - cos_distance # similarity; range [-1, 1]
print(cos_similarity)

# print('\n')
# print(model.similarity(word1, word2)) # NOTE: this is a built-in function; Don't use this for assignments

## Assign word pairs

In [None]:
# TODO: Add every word pair (sem, phn, unr) that you created by hand

word_pairs = [
    ("shark", "whale"),
    ("shark", "butter"),
    ("mile", "inch"),
    ("mile", "fork"),
]

## Get word pair similarity scores

In [None]:
# Score every pair in `word_pairs` and save the results to "<_FILE_NAME>.txt".
# Each score is printed and written to the file; pairs containing a word that
# the model does not know are reported on screen only and skipped.

# NOTE: Before you run this code
#       Make sure you have set the correct _FILE_NAME
#       Check the variable under the 'Setup' section

# An explicit encoding keeps the output identical on every platform,
# including when a word contains non-ASCII characters.
with open(f"{_FILE_NAME}.txt", "w", encoding="utf-8") as f:

    # Header lines identifying the submission and the model used
    f.write(f"* -- Submission by {YOUR_NAME} -- *\n")
    f.write(f"* -- Selected model: {model_name} -- *\n")

    for word1, word2 in word_pairs:
        # Check if words exist in the vocabulary
        if word1 in model and word2 in model:
            # Get word vectors
            vec1 = model[word1]
            vec2 = model[word2]

            # Compute similarity (the distance is computed once and reused)
            cosine_distance = cosine(vec1, vec2) # 1 - cosine_similarity; range: [0,2]
            cosine_similarity = 1 - cosine_distance  # range: [-1, 1]

            # Print score
            output = f"Similarity score between '{word1}' and '{word2}': {cosine_similarity:.4f}"
            print(output)
            f.write(output + "\n")

        # If words don't exist in the vocabulary
        else:
            print(f"Word '{word1}' or '{word2}' not in dictionary. Skipping...")

# Task 2. Get (un)related words based on score

## Assign target words

In [None]:
# TODO: Add the remaining target word(s) to this list
target_words = [
    "shark",
    "mile",
]

## Assign score threshold

In [None]:
# TODO: Change the similarity score thresholds
# You should play around with them to find
# what you think are the optimal scores.
#
# Related words:   score > score_related_min
# Unrelated words: score_unrelated_min <= score <= score_unrelated_max
#
# The two "unrelated" values are only starting points so that the notebook
# runs; tune them for the model you selected.
score_related_min = 0.80
score_unrelated_max = 0.10
score_unrelated_min = -0.10

## Assign the number of words to generate

In [None]:
# TODO: Define the number of top-n words to list for each target word;
#       replace the number with a different one if you like
#       (it must be greater than or equal to 5).
#       It only needs to be large enough to show you good word candidates.
top_n = 10

## Get (un)related words

In [None]:
# For every target word, score it against the whole vocabulary, then save and
# print (a) the top related words and (b) the top unrelated words to
# "<_FILE_NAME>.txt".

# NOTE: Before you run this code
#       Make sure you have set the correct _FILE_NAME
#       Check the variable under the 'Setup' section

# An explicit encoding keeps the output identical on every platform,
# including when a vocabulary word contains non-ASCII characters.
with open(f"{_FILE_NAME}.txt", "w", encoding="utf-8") as f:
    # Header lines identifying the submission and the model used
    f.write(f"* -- Submission by {YOUR_NAME} -- *\n")
    f.write(f"* -- Selected model: {model_name} -- *\n")

    for tw in target_words:
        # Target words the model does not know cannot be scored
        if tw not in model:
            print(f"'{tw}' not found in vocabulary. Skipping...")
            continue

        word_vec = model[tw]

        # Compute cosine similarity for all words in the vocabulary
        # (the target word itself is left out)
        similarities = {
            other_word: 1 - cosine(word_vec, model[other_word])
            for other_word in model.index_to_key if other_word != tw
        }

        # Get related words: score strictly above the threshold,
        # highest score first, keep at most `top_n`
        related_words = sorted(
            [(w, s) for w, s in similarities.items() if s > score_related_min],
            key=lambda x: x[1],
            reverse=True
        )[:top_n]

        # Get unrelated words: score inside the inclusive band
        # [score_unrelated_min, score_unrelated_max],
        # highest score first, keep at most `top_n`
        unrelated_words = sorted(
            [
                (w, s) for w, s in similarities.items()
                if score_unrelated_min <= s <= score_unrelated_max
            ],
            key=lambda x: x[1],
            reverse=True
        )[:top_n]

        # Save and print related words
        f.write(f"\n--- Top {top_n} related words to '{tw}' (score > {score_related_min}) ---\n")
        print(f"\n--- Top {top_n} related words to '{tw}' (score > {score_related_min}) ---")
        for word, score in related_words:
            line = f"{word}: {score:.4f}"
            print(line)
            f.write(line + "\n")

        # Save and print unrelated words
        unrelated_header = (
            f"\n--- Top {top_n} unrelated words to '{tw}' "
            f"({score_unrelated_min} <= score <= {score_unrelated_max}) ---"
        )
        f.write(unrelated_header + "\n")
        print(unrelated_header)
        for word, score in unrelated_words:
            line = f"{word}: {score:.4f}"
            print(line)
            f.write(line + "\n")
# Save output


In [None]:
# Check your file name: this is the base name of the .txt file offered for download
print(_FILE_NAME)

In [None]:
# TODO: After running this cell, look in your Downloads folder.
# It should contain the .txt file with the scores that were written out.

# NOTE: Make sure you have set the correct _FILE_NAME
#       Check the variable under the 'Setup' section
output_path = f"{_FILE_NAME}.txt"
files.download(output_path)
print("Output has been downloaded.")

# Bonus: Analogy
Example: "man is to woman as king is to ?"
This is analogous to: "king" - "man" + "woman"

In [None]:
# "king" - "man" + "woman": add the positive words, subtract the negative one,
# and keep only the single best match (raise `topn` to see more candidates)
analogy = model.most_similar(
    positive=["king", "woman"],
    negative=["man"],
    topn=1,
)
# analogy = model.most_similar(positive=["king", "woman"], negative=["man"], topn=5)

In [None]:
# Show the first entry of the analogy result
best_match = analogy[0]
print(f"Result of analogy 'man is to woman as king is to ?': {best_match}")
# print(f"Result of analogy 'man is to woman as king is to ?': {analogy}")