# Setup

Run first

In [None]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [1]:
from scipy import spatial
import numpy as np
import pandas as pd

# Sentiment Analysis

In [None]:
# Start VADER

analyzer = SentimentIntensityAnalyzer()
word = 'search find'
vs = analyzer.polarity_scores(word)
print("{:-<65} {}".format(word, str(vs)))

In [None]:
# Converts text to a SA vector through VADER

def text_2_sent_vec(text):
  vs = analyzer.polarity_scores(text)
  sent_vec = np.zeros((len(vs)))
  for idx, key in enumerate(vs):
    sent_vec[idx] = vs[key]

  return sent_vec

In [None]:
text_2_sent_vec("funeral somber")

In [None]:
# Testing analogy pairs with VADER

analyzer = SentimentIntensityAnalyzer()

questions = [["diamond:baseball", "court:poker", "court:jury", "court:grass", "court:squash", 4],
             ["bench:judge", "throne:king", "queen:king", "court:king", "knight:king", 3],
             ["funeral:somber", "tension:festive", "soiree:festive", "eulogy:festive", "sari:festive", 2],
             ["defeat:vanquish", "search:peer", "search:ransack", "search:destroy", "search:find", 4],
             ["slug:land", "shark:seaweed", "shark:ocean", "shark:sky", "shark:slide", 2]]

analogy_predictions = []

for p in range(0, len(questions)):
  print("Q"+ str(p+1) +":")
  q = questions[p]
  
  a = q[0].split(':')
  a_combined = a[0] + ' ' + a[1]
  a_vec = text_2_sent_vec(a_combined)
  print(a_vec)

  sim_scores = []
  for i in range(1,5):
    b = q[i].split(':')
    b_combined = b[0] + ' ' + b[1]
    b_vec = text_2_sent_vec(b_combined)
    print(b_vec)

    sim = 1 - spatial.distance.cosine(a_vec, b_vec)
    sim_scores.append(sim)

  print("Similarities:")
  print(sim_scores)
  analogy_predictions.append(np.argmax(sim_scores)+1)
  print()
  



print("-----")

correct_anal = 0
for i in range (0,len(questions)):
  if questions[i][5] == analogy_predictions[i]:
    print("Correct answer for Question #" + str(i+1))
  else:
    print("Incorrect answer for Question #" + str(i+1))

In [None]:
# Testing with synonyms/antonyms

# Need CSV from Project 3
syntest = pd.read_csv("syntest.csv")
analyzer = SentimentIntensityAnalyzer()

prediction_list = []

for k in range(0,len(syntest)):
  t = syntest.loc[k]
  q = text_2_sent_vec(t.loc["Question"])
  sim_scores = []
  for i in range(1,5):
    sim = 1 - spatial.distance.cosine(q, text_2_sent_vec(t.loc["Answer"+str(i)]))
    sim_scores.append(sim)

  # Use print to see all computed similarity scores
  #print(k+1, sim_scores)
  if t.loc["Type"] == 'synonym':
    prediction = t.loc["Answer"+str(np.argmax(sim_scores) + 1)]
  else:
    prediction = t.loc["Answer"+str(np.argmin(sim_scores) + 1)]

  if prediction == t.loc["Correct"]:
    prediction_list.append(1)
  else:
    prediction_list.append(0)

print("Correct answers: " + str(np.count_nonzero(prediction_list)) +"/"+ str(len(prediction_list)) )

# WordNet

In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
nltk.download('wordnet')
!unzip /root/nltk_data/corpora/wordnet.zip -d /root/nltk_data/corpora/
from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /root/nltk_data...


Archive:  /root/nltk_data/corpora/wordnet.zip
   creating: /root/nltk_data/corpora/wordnet/
  inflating: /root/nltk_data/corpora/wordnet/lexnames  
  inflating: /root/nltk_data/corpora/wordnet/data.verb  
  inflating: /root/nltk_data/corpora/wordnet/index.adv  
  inflating: /root/nltk_data/corpora/wordnet/adv.exc  
  inflating: /root/nltk_data/corpora/wordnet/index.verb  
  inflating: /root/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /root/nltk_data/corpora/wordnet/data.adj  
  inflating: /root/nltk_data/corpora/wordnet/index.adj  
  inflating: /root/nltk_data/corpora/wordnet/LICENSE  
  inflating: /root/nltk_data/corpora/wordnet/citation.bib  
  inflating: /root/nltk_data/corpora/wordnet/noun.exc  
  inflating: /root/nltk_data/corpora/wordnet/verb.exc  
  inflating: /root/nltk_data/corpora/wordnet/README  
  inflating: /root/nltk_data/corpora/wordnet/index.sense  
  inflating: /root/nltk_data/corpora/wordnet/data.noun  
  inflating: /root/nltk_data/corpora/wordnet/data.adv  


In [2]:
import nltk
nltk.download('wordnet')

# Recent NLTK versions require both packages for WordNet
nltk.download('omw-1.4')

# Sometimes necessary to manually unzip
#!unzip /root/nltk_data/corpora/wordnet.zip -d /root/nltk_data/corpora/

from nltk.corpus import wordnet as wn

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
#print(nltk.find('corpora/wordnet.zip'))

/root/nltk_data/corpora/wordnet.zip


In [63]:
def get_hypernyms(text, method = 'first', second_text = ''):

  # Retrieves the first discovered hypernym
  # Ordering based on estimated frequency of usage
  if method == 'first':
    hypernyms = []
    for syn in wn.synsets(text):
      hypernyms = syn.hypernyms()
      if len(hypernyms) > 0:
        break

    result = hypernyms[0].lemmas()[0].name()
    return result

  # Retrieves hypernym that is most relevant to 2nd text
  elif method == 'relevant':

    # Most common synset of second text
    sec_syn = wn.synsets(second_text)[0]

    first_synsets = wn.synsets(text)

    best_score = 0
    best_syn = None
    for syn in first_synsets:
      sim_score = syn.path_similarity(sec_syn)
      if best_score < sim_score:
        best_syn = syn

    if best_syn is not None:
      #print(best_syn)
      hypernyms = best_syn.hypernyms()
      result = hypernyms[0].lemmas()[0].name()
      return result
    else:
      return

  # Method undefined
  else:
    return

In [90]:
# Test

print(get_hypernyms('diamond', method = 'first'))
print(get_hypernyms('diamond', method = 'relevant', second_text = 'baseball'))

print(get_hypernyms('park', method = 'first'))
print(get_hypernyms('park', method = 'relevant', second_text = 'car'))

print(get_hypernyms('kid', method = 'first'))
print(get_hypernyms('kid', method = 'relevant', second_text = 'goat'))

jewel
playing_field
tract
steer
juvenile
tease


In [86]:
for syn in wn.synsets('kid'):
  print(syn.definition())

a young person of either sex
soft smooth leather from the hide of a young goat
English dramatist (1558-1594)
a human offspring (son or daughter) of any age
young goat
tell false information to for fun
be silly or tease one another
