<a href="https://colab.research.google.com/github/suinkangme/Word-Embeddings-Experiments/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Experiments with Word Embeddings

In [1]:
# import libraries
import pandas as pd
import numpy as np
import gensim
import gensim.downloader as api

### Mount Google Drive in Colab

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Switch to the Drive folder that holds this notebook and its dataset, so the
# relative file names used by later cells (e.g. 'synonym.csv') resolve there.
import os
os.chdir('/content/drive/MyDrive/comp472/')

Mounted at /content/drive


### Load the dataset

In [3]:
# Load the synonym test set. The evaluation functions defined below read this
# frame positionally: question word, correct answer, then four candidate words.
data_synonym = pd.read_csv('synonym.csv')
# Preview the first rows to confirm the column layout.
data_synonym.head()

Unnamed: 0,question,answer,0,1,2,3
0,enormously,tremendously,appropriately,uniquely,tremendously,decidedly
1,provisions,stipulations,stipulations,interrelations,jurisdictions,interpretations
2,haphazardly,randomly,dangerously,densely,randomly,linearly
3,prominent,conspicuous,battered,ancient,mysterious,conspicuous
4,zenith,pinnacle,completion,pinnacle,outset,decline


In [68]:
# Summary file shared by every evaluation call in this notebook. It is opened
# in 'w' mode, so re-running this cell truncates any earlier results; the
# handle is closed in the notebook's last cell.
analysis_file = open('analysis.csv', 'w')

In [5]:
def find_best_synonym(model, model_name, analysis_file, dataset=None):
  """Run the synonym test against ``model`` and record the results.

  For each row of the dataset, the candidate word with the highest
  ``model.similarity`` to the question word is taken as the model's answer.
  One line per question is written to ``{model_name}-details.csv`` and one
  summary line is written to ``analysis_file``.

  Args:
    model: object exposing ``key_to_index`` (used for vocabulary membership
      tests and vocabulary size) and ``similarity(word_a, word_b)``.
    model_name: label used to build the details file name and to tag the
      summary line.
    analysis_file: writable text file object; receives the line
      ``model_name, vocabulary_size, C, V, accuracy`` and is flushed.
    dataset: optional DataFrame read positionally as question word, correct
      answer, then four candidate words. When omitted, the notebook-level
      ``data_synonym`` frame is used.

  Labels written to the details file:
    ``correct`` / ``wrong``: the question word and at least one candidate are
      in the vocabulary, so the model produced an answer.
    ``guess``: the question word, or every candidate, is out of vocabulary.

  Returns:
    None. Accuracy is ``C / V``; it is reported as ``0.0`` when no question
    could be answered (``V == 0``) instead of raising ZeroDivisionError.
  """
  if dataset is None:
    dataset = data_synonym

  vocabulary = model.key_to_index
  correct_count = 0   # C: answered questions whose guess matches the answer
  answered_count = 0  # V: questions answered without guessing

  with open(f"{model_name}-details.csv", "w") as details_file:
    details_file.write("question-word,correct answer-word, model's guess-word,label\n")

    for row in dataset.itertuples(index=False, name=None):
      question_word, correct_word = row[0], row[1]
      candidates = row[2:6]

      system_guess = ''
      # None means "no candidate scored yet". Starting from 0 would silently
      # discard candidates whose similarity is zero or negative.
      best_similarity = None
      if question_word in vocabulary:
        for word in candidates:
          if word not in vocabulary:
            continue
          similarity = model.similarity(question_word, word)
          if best_similarity is None or similarity > best_similarity:
            best_similarity = similarity
            system_guess = word

      if best_similarity is None:
        # Question word or all four candidates are missing from the model.
        label = 'guess'
      else:
        answered_count += 1
        if system_guess == correct_word:
          label = 'correct'
          correct_count += 1
        else:
          label = 'wrong'

      details_file.write(f"{question_word},{correct_word},{system_guess},{label}\n")

  accuracy = correct_count / answered_count if answered_count else 0.0
  vocabulary_size = len(vocabulary)
  analysis_file.write(f"{model_name}, {vocabulary_size}, {correct_count}, {answered_count}, {accuracy}\n")
  analysis_file.flush()

In [6]:
def find_best_synonym_wv(model, model_name, analysis_file, dataset=None):
  """Run the synonym test against ``model.wv`` and record the results.

  Same procedure as the keyed-vectors evaluation, but for models that expose
  their word vectors through a ``wv`` attribute. For each row of the dataset,
  the candidate word with the highest ``model.wv.similarity`` to the question
  word is taken as the model's answer. One line per question is written to
  ``{model_name}-details.csv`` and one summary line to ``analysis_file``.

  Args:
    model: object whose ``wv`` attribute exposes ``key_to_index`` and
      ``similarity(word_a, word_b)``.
    model_name: label used to build the details file name and to tag the
      summary line.
    analysis_file: writable text file object; receives the line
      ``model_name, vocabulary_size, C, V, accuracy`` and is flushed.
    dataset: optional DataFrame read positionally as question word, correct
      answer, then four candidate words. When omitted, the notebook-level
      ``data_synonym`` frame is used.

  Labels written to the details file:
    ``correct`` / ``wrong``: the question word and at least one candidate are
      in the vocabulary, so the model produced an answer.
    ``guess``: the question word, or every candidate, is out of vocabulary.

  Returns:
    None. Accuracy is ``C / V``; it is reported as ``0.0`` when no question
    could be answered (``V == 0``) instead of raising ZeroDivisionError.
  """
  if dataset is None:
    dataset = data_synonym

  vectors = model.wv
  vocabulary = vectors.key_to_index
  correct_count = 0   # C: answered questions whose guess matches the answer
  answered_count = 0  # V: questions answered without guessing

  with open(f"{model_name}-details.csv", "w") as details_file:
    details_file.write("question-word,correct answer-word, model's guess-word,label\n")

    for row in dataset.itertuples(index=False, name=None):
      question_word, correct_word = row[0], row[1]
      candidates = row[2:6]

      system_guess = ''
      # None means "no candidate scored yet". Starting from 0 would silently
      # discard candidates whose similarity is zero or negative.
      best_similarity = None
      if question_word in vocabulary:
        for word in candidates:
          if word not in vocabulary:
            continue
          similarity = vectors.similarity(question_word, word)
          if best_similarity is None or similarity > best_similarity:
            best_similarity = similarity
            system_guess = word

      if best_similarity is None:
        # Question word or all four candidates are missing from the model.
        label = 'guess'
      else:
        answered_count += 1
        if system_guess == correct_word:
          label = 'correct'
          correct_count += 1
        else:
          label = 'wrong'

      details_file.write(f"{question_word},{correct_word},{system_guess},{label}\n")

  accuracy = correct_count / answered_count if answered_count else 0.0
  vocabulary_size = len(vocabulary)
  analysis_file.write(f"{model_name}, {vocabulary_size}, {correct_count}, {answered_count}, {accuracy}\n")
  analysis_file.flush()

## Task 1

In [30]:
# NOTE(review): `info` is not read by any cell shown in this notebook.
info = api.info()
# Load the pretrained "word2vec-google-news-300" vectors via gensim's downloader.
model = api.load("word2vec-google-news-300")

In [69]:
# Evaluate on the synonym test: writes word2vec-google-news-300-details.csv
# and adds one summary line to analysis_file.
find_best_synonym(model, "word2vec-google-news-300", analysis_file)

## Task 2

### Different corpora but same embedding size

In [32]:
# NOTE(review): `info` is not read by any cell shown in this notebook.
info = api.info()
# Load the pretrained "glove-twitter-50" vectors via gensim's downloader.
model_gt50 = api.load("glove-twitter-50")

In [70]:
# Evaluate on the synonym test: writes glove-twitter-50-details.csv and adds
# one summary line to analysis_file.
find_best_synonym(model_gt50, "glove-twitter-50", analysis_file)

In [34]:
# NOTE(review): `info` is not read by any cell shown in this notebook.
info = api.info()
# Load the pretrained "glove-wiki-gigaword-50" vectors via gensim's downloader.
model_gw50 = api.load("glove-wiki-gigaword-50")

In [71]:
# Evaluate on the synonym test: writes glove-wiki-gigaword-50-details.csv and
# adds one summary line to analysis_file.
find_best_synonym(model_gw50, "glove-wiki-gigaword-50", analysis_file)

### Same corpus, different embedding sizes

In [36]:
# NOTE(review): `info` is not read by any cell shown in this notebook.
info = api.info()
# Load the pretrained "glove-twitter-25" vectors via gensim's downloader.
model_gt25 = api.load("glove-twitter-25")

In [72]:
# Evaluate on the synonym test: writes glove-twitter-25-details.csv and adds
# one summary line to analysis_file.
find_best_synonym(model_gt25, "glove-twitter-25", analysis_file)

In [38]:
# NOTE(review): `info` is not read by any cell shown in this notebook.
info = api.info()
# Load the pretrained "glove-twitter-100" vectors via gensim's downloader.
model_gt100 = api.load("glove-twitter-100")

In [73]:
# Evaluate on the synonym test: writes glove-twitter-100-details.csv and adds
# one summary line to analysis_file.
find_best_synonym(model_gt100, "glove-twitter-100", analysis_file)

## Task 3

In [40]:
import nltk
# Fetch NLTK's 'punkt' package; presumably required by the sent_tokenize /
# word_tokenize calls used when building the corpus — confirm for the
# installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [41]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize

In [58]:
def fetch_and_preprocess_books(book_urls, timeout=30):
    """Download each URL and turn its text into tokenized, lower-cased sentences.

    Args:
        book_urls: iterable of URLs to fetch with ``requests.get``.
        timeout: seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        A list of sentences, each a list of word tokens, concatenated across
        all URLs in the order given. Empty when ``book_urls`` is empty.

    Raises:
        requests.HTTPError: if a URL answers with an error status. Failing
            here keeps an error page from being tokenized into the corpus.
    """
    corpus = []
    for url in book_urls:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Strip any markup and keep only the text content of the response.
        text = BeautifulSoup(response.text, 'html.parser').get_text()

        # One token list per sentence; lower-casing happens before tokenizing.
        corpus.extend(
            word_tokenize(sentence.lower()) for sentence in sent_tokenize(text)
        )
    return corpus

# Project Gutenberg plain-text editions used to build the Task 3 corpus.
book_urls = [
    "https://www.gutenberg.org/files/11/11-0.txt",  # Alice
    "https://www.gutenberg.org/files/1342/1342-0.txt",  # Pride and Prejudice
    "https://www.gutenberg.org/cache/epub/84/pg84.txt",  # Frankenstein
    "https://www.gutenberg.org/cache/epub/2701/pg2701.txt",  # Moby Dick
    "https://www.gutenberg.org/cache/epub/64317/pg64317.txt",  # The Great Gatsby
    "https://www.gutenberg.org/cache/epub/46/pg46.txt",  # A Christmas Carol in Prose; Being a Ghost Story of Christmas (Charles Dickens)
    "https://www.gutenberg.org/cache/epub/1513/pg1513.txt",  # Romeo and Juliet
    "https://www.gutenberg.org/cache/epub/145/pg145.txt",  # Middlemarch
    "https://www.gutenberg.org/cache/epub/2641/pg2641.txt",  # A Room with a View
    "https://www.gutenberg.org/cache/epub/37106/pg37106.txt",  # Little Women
    "https://www.gutenberg.org/cache/epub/25344/pg25344.txt",  # The Scarlet Letter
    "https://www.gutenberg.org/cache/epub/174/pg174.txt",  # Dorian Gray
    "https://www.gutenberg.org/cache/epub/74/pg74.txt"  # Tom Sawyer
]

# Download and tokenize every book; `corpus` is a list of token lists, one per
# sentence, and is the training input for the Word2Vec models below.
corpus = fetch_and_preprocess_books(book_urls)

In [74]:
# Train Word2Vec on the book corpus: 50-dimensional vectors, window 2, 15 epochs.
# NOTE(review): no seed is passed, so results may differ between runs — confirm
# whether reproducibility is required.
model_1 = gensim.models.Word2Vec(sentences = corpus, vector_size = 50, window = 2, epochs = 15)

In [75]:
# Evaluate model_1 (vector_size 50, window 2) under the label "E5-W1": writes
# E5-W1-details.csv and adds one summary line to analysis_file.
find_best_synonym_wv(model_1, "E5-W1", analysis_file)

In [76]:
# Train Word2Vec on the book corpus: 50-dimensional vectors, window 5, 15 epochs.
# NOTE(review): no seed is passed, so results may differ between runs — confirm
# whether reproducibility is required.
model_2 = gensim.models.Word2Vec(sentences = corpus, vector_size = 50, window = 5, epochs = 15)

In [77]:
# Evaluate model_2 (vector_size 50, window 5) under the label "E5-W2": writes
# E5-W2-details.csv and adds one summary line to analysis_file.
find_best_synonym_wv(model_2, "E5-W2", analysis_file)

In [78]:
# Train Word2Vec on the book corpus: 100-dimensional vectors, window 2, 15 epochs.
# NOTE(review): no seed is passed, so results may differ between runs — confirm
# whether reproducibility is required.
model_3 = gensim.models.Word2Vec(sentences = corpus, vector_size = 100, window = 2, epochs = 15)

In [79]:
# Evaluate model_3 (vector_size 100, window 2) under the label "E6-W1": writes
# E6-W1-details.csv and adds one summary line to analysis_file.
find_best_synonym_wv(model_3, "E6-W1", analysis_file)

In [80]:
# Train Word2Vec on the book corpus: 100-dimensional vectors, window 5, 15 epochs.
# NOTE(review): no seed is passed, so results may differ between runs — confirm
# whether reproducibility is required.
model_4 = gensim.models.Word2Vec(sentences=corpus, vector_size = 100, window= 5, epochs = 15)

In [81]:
# Evaluate model_4 (vector_size 100, window 5) under the label "E6-W2": writes
# E6-W2-details.csv and adds one summary line to analysis_file.
find_best_synonym_wv(model_4, "E6-W2", analysis_file)

In [82]:
# Release the analysis.csv handle opened near the top of the notebook; no
# evaluation call may write to it after this point.
analysis_file.close()