In [None]:
pip install --upgrade gensim

In [None]:
pip install plotly==5.3.1

In [None]:
import gensim.downloader as api

# Load the four pretrained "glove-wiki-gigaword" models through gensim's downloader
# API, in the order 50, 100, 200, 300.
GLOVE_MODEL_NAMES = (
    "glove-wiki-gigaword-50",
    "glove-wiki-gigaword-100",
    "glove-wiki-gigaword-200",
    "glove-wiki-gigaword-300",
)
model_1, model_2, model_3, model_4 = (api.load(name) for name in GLOVE_MODEL_NAMES)



In [None]:
# Collect the pretrained models in one list so later cells can loop over them.
# The order matches the 'Gigaword-50' ... 'Gigaword-300' class labels used when plotting.
pretrainedVectors = [
    model_1,  # glove-wiki-gigaword-50
    model_2,  # glove-wiki-gigaword-100
    model_3,  # glove-wiki-gigaword-200
    model_4,  # glove-wiki-gigaword-300
]

In [None]:
# Plots the vector distances on a radar graph
def plotFeatures(categories, classes, Data, Title):
  """Draw one radar (polar scatter) trace per class and display the figure.

  Args:
    categories: Axis labels placed around the radar graph; one label per value
      in each row of Data.
    classes: Trace names shown in the legend; classes[i] labels Data[i].
    Data: Sequence of rows of radial values, one row per class. The rows are
      NOT modified (closed copies are built internally), so callers can keep
      using them after plotting.
    Title: Title text of the figure.

  Side effects:
    Saves the figure as an HTML file through plotly.offline.plot (with
    auto_open=True) and renders it with fig.show().

  Raises:
    IndexError: If categories or any row of Data is empty, or if Data has
      fewer rows than classes.
  """
  import plotly.graph_objects as go
  import plotly.offline as pyo

  # The radar graph is circular, so every trace must end on its first point to
  # close the outline. Build closed copies instead of appending to the caller's
  # lists: mutating Data in place added a duplicate trailing value to every row
  # the caller still holds.
  closedCategories = [*categories, categories[0]]
  closedData = [[*row, row[0]] for row in Data]

  # Create one trace per class
  plotData = [
    go.Scatterpolar(r=closedData[i], theta=closedCategories, name=className)
    for i, className in enumerate(classes)
  ]

  # Add all data to a single figure
  fig = go.Figure(
      data=plotData,
      layout=go.Layout(
          title=go.layout.Title(text=Title),
          polar={'radialaxis': {'visible': True}},
          showlegend=True
      )
  )

  # Fixed square canvas so all radar graphs are directly comparable
  fig.update_layout(
    autosize=False,
    width=800,
    height=800,
  )

  # Save the image as an html file
  pyo.plot(fig, auto_open=True)

  # Display the image
  fig.show()

In [None]:
# Book 1: The Extraordinary Adventures of Arsène Lupin, Gentleman-Burglar
import numpy as np
import plotly.graph_objects as go
import plotly.offline as pyo
import gensim.models

# For book 1, the antagonists' names are not tokenized in the word vectors, so the
# generic word 'detective' is used to refer to them.
# The first crime scene is not described well.
classes = ['Gigaword-50', 'Gigaword-100', 'Gigaword-200', 'Gigaword-300']
categories = ['detective', 'jewels', 'box']
categoryLabels = ['Lupin-detective', 'Lupin-jewel', 'Lupin-box']
lupinTokens = ['arsène', 'lupin']

# For each Gigaword model (50-300) and each category word, keep the smaller of its
# cosine distances to the two Lupin tokens. Cosine distance is bounded from [0:2].
pretrainedDistances = [
  [min(wv.distances(c, lupinTokens)) for c in categories]
  for wv in pretrainedVectors
]

# Plots the vector distances using a radar graph.
# Copies are passed because plotFeatures may append a closing point to the rows it
# receives, which would add a duplicate trailing column to the minimums below.
Title = 'Vector Distances for Pretrained "Gigaword" Model'
plotFeatures(categoryLabels, classes, [row[:] for row in pretrainedDistances], Title)

# Per-category minimum over the four pretrained models
pretrainedMinimums = np.array(pretrainedDistances).min(axis=0)

###########

# Location of downloaded Lupin Models
path = '/content/'

dimension = [50, 100, 200, 300]
window    = [2, 3, 5, 10]

novelMinimums = []
novelClasses = ['Novel-50', 'Novel-100', 'Novel-200', 'Novel-300']

# Iterate through the models trained on the Lupin book: one radar graph per window
# size, with one trace per vector dimension.
for w in window:
  novelDistances = []
  for d in dimension:
    G = gensim.models.Word2Vec.load(path + 'model_book1_Vec' + str(d) + '_Window' + str(w))
    wv = G.wv
    novelDistances.append([min(wv.distances(c, lupinTokens)) for c in categories])

  # Plots the vector distances using a radar graph (copies passed, see above).
  Title = 'Vector Distances for Novel Trained Model with Window = ' + str(w)
  plotFeatures(categoryLabels, novelClasses, [row[:] for row in novelDistances], Title)

  # Per-category minimum over the four dimensions for this window size
  novelMinimums.append(np.array(novelDistances).min(axis=0).tolist())

# Labelled the same way as the summaries printed for the other books
print('Wiki Trained          ', pretrainedMinimums)
for win, minimums in zip(window, novelMinimums):
  print('Novel Trained Window=' + str(win) , minimums)

In [None]:
# Book 2: Arsene Lupin
import numpy as np
import plotly.graph_objects as go
import plotly.offline as pyo
import gensim.models

# For book 2, the antagonists' names are not tokenized in the word vectors, so the
# generic words 'detective' and 'inspector' are used to refer to them.
classes = ['Gigaword-50', 'Gigaword-100', 'Gigaword-200', 'Gigaword-300']
categories = ['detective', 'inspector', 'picture', 'chalk']
categoryLabels = ['Lupin-detective', 'Lupin-inspector', 'Lupin-picture', 'Lupin-chalk']
# NOTE(review): this cell uses the unaccented token 'arsene', unlike the other book
# cells ('arsène') — presumably matching this novel's text; confirm.
lupinTokens = ['arsene', 'lupin']

# For each Gigaword model (50-300) and each category word, keep the smaller of its
# cosine distances to the two Lupin tokens. Cosine distance is bounded from [0:2].
pretrainedDistances = [
  [min(wv.distances(c, lupinTokens)) for c in categories]
  for wv in pretrainedVectors
]

# Plots the vector distances using a radar graph.
# Copies are passed because plotFeatures may append a closing point to the rows it
# receives, which would add a duplicate trailing column to the minimums below.
Title = 'Vector Distances for Pretrained "Gigaword" Model'
plotFeatures(categoryLabels, classes, [row[:] for row in pretrainedDistances], Title)

# Per-category minimum over the four pretrained models
pretrainedMinimums = np.array(pretrainedDistances).min(axis=0)

###########

# Location of downloaded Lupin Models
path = '/content/'

dimension = [50, 100, 200, 300]
window    = [2, 3, 5, 10]

novelMinimums = []
novelClasses = ['Novel-50', 'Novel-100', 'Novel-200', 'Novel-300']

# Iterate through the models trained on the Lupin book: one radar graph per window
# size, with one trace per vector dimension.
for w in window:
  novelDistances = []
  for d in dimension:
    G = gensim.models.Word2Vec.load(path + 'model_book2_Vec' + str(d) + '_Window' + str(w))
    wv = G.wv
    novelDistances.append([min(wv.distances(c, lupinTokens)) for c in categories])

  # Plots the vector distances using a radar graph (copies passed, see above).
  Title = 'Vector Distances for Novel Trained Model with Window = ' + str(w)
  plotFeatures(categoryLabels, novelClasses, [row[:] for row in novelDistances], Title)

  # Per-category minimum over the four dimensions for this window size
  novelMinimums.append(np.array(novelDistances).min(axis=0).tolist())

print('Wiki Trained          ', pretrainedMinimums)
for win, minimums in zip(window, novelMinimums):
  print('Novel Trained Window=' + str(win) , minimums)

In [None]:
# Book 3: Arsène Lupin versus Herlock Sholmes
import numpy as np
import plotly.graph_objects as go
import plotly.offline as pyo
import gensim.models

# For book 3, the antagonists' names are not tokenized in the word vectors, so the
# generic word 'detective' is used to refer to them.
classes = ['Gigaword-50', 'Gigaword-100', 'Gigaword-200', 'Gigaword-300']
categories = ['detective', 'desk', 'ticket', 'diamond', 'handkerchief']
categoryLabels = ['Lupin-detective', 'Lupin-desk', 'Lupin-ticket', 'Lupin-diamond',
              'Lupin-handkerchief']
lupinTokens = ['arsène', 'lupin']

# For each Gigaword model (50-300) and each category word, keep the smaller of its
# cosine distances to the two Lupin tokens. Cosine distance is bounded from [0:2].
pretrainedDistances = [
  [min(wv.distances(c, lupinTokens)) for c in categories]
  for wv in pretrainedVectors
]

# Plots the vector distances using a radar graph.
# Copies are passed because plotFeatures may append a closing point to the rows it
# receives, which would add a duplicate trailing column to the minimums below.
Title = 'Vector Distances for Pretrained "Gigaword" Model'
plotFeatures(categoryLabels, classes, [row[:] for row in pretrainedDistances], Title)

# Per-category minimum over the four pretrained models
pretrainedMinimums = np.array(pretrainedDistances).min(axis=0)

###########

# Location of downloaded Lupin Models
path = '/content/'

dimension = [50, 100, 200, 300]
window    = [2, 3, 5, 10]

novelMinimums = []
novelClasses = ['Novel-50', 'Novel-100', 'Novel-200', 'Novel-300']

# Iterate through the models trained on the Lupin book: one radar graph per window
# size, with one trace per vector dimension.
for w in window:
  novelDistances = []
  for d in dimension:
    G = gensim.models.Word2Vec.load(path + 'model_book3_Vec' + str(d) + '_Window' + str(w))
    wv = G.wv
    novelDistances.append([min(wv.distances(c, lupinTokens)) for c in categories])

  # Plots the vector distances using a radar graph (copies passed, see above).
  Title = 'Vector Distances for Novel Trained Model with Window = ' + str(w)
  plotFeatures(categoryLabels, novelClasses, [row[:] for row in novelDistances], Title)

  # Per-category minimum over the four dimensions for this window size
  novelMinimums.append(np.array(novelDistances).min(axis=0).tolist())

print('Wiki Trained          ', pretrainedMinimums)
for win, minimums in zip(window, novelMinimums):
  print('Novel Trained Window=' + str(win) , minimums)

In [None]:
# Book 4: The Hollow Needle
import numpy as np
import plotly.graph_objects as go
import plotly.offline as pyo
import gensim.models

# For book 4, the antagonists' names are not tokenized in the word vectors, so the
# generic words 'detective' and 'inspector' are used to refer to them.
classes = ['Gigaword-50', 'Gigaword-100', 'Gigaword-200', 'Gigaword-300']
categories = ['detective', 'inspector', 'knife']
categoryLabels = ['Lupin-detective', 'Lupin-inspector', 'Lupin-knife']
lupinTokens = ['arsène', 'lupin']

# For each Gigaword model (50-300) and each category word, keep the smaller of its
# cosine distances to the two Lupin tokens. Cosine distance is bounded from [0:2].
pretrainedDistances = [
  [min(wv.distances(c, lupinTokens)) for c in categories]
  for wv in pretrainedVectors
]

# Plots the vector distances using a radar graph.
# Copies are passed because plotFeatures may append a closing point to the rows it
# receives, which would add a duplicate trailing column to the minimums below.
Title = 'Vector Distances for Pretrained "Gigaword" Model'
plotFeatures(categoryLabels, classes, [row[:] for row in pretrainedDistances], Title)

# Per-category minimum over the four pretrained models
pretrainedMinimums = np.array(pretrainedDistances).min(axis=0)

###########

# Location of downloaded Lupin Models
path = '/content/'

dimension = [50, 100, 200, 300]
window    = [2, 3, 5, 10]

novelMinimums = []
novelClasses = ['Novel-50', 'Novel-100', 'Novel-200', 'Novel-300']

# Iterate through the models trained on the Lupin book: one radar graph per window
# size, with one trace per vector dimension.
for w in window:
  novelDistances = []
  for d in dimension:
    G = gensim.models.Word2Vec.load(path + 'model_book4_Vec' + str(d) + '_Window' + str(w))
    wv = G.wv
    novelDistances.append([min(wv.distances(c, lupinTokens)) for c in categories])

  # Plots the vector distances using a radar graph (copies passed, see above).
  Title = 'Vector Distances for Novel Trained Model with Window = ' + str(w)
  plotFeatures(categoryLabels, novelClasses, [row[:] for row in novelDistances], Title)

  # Per-category minimum over the four dimensions for this window size
  novelMinimums.append(np.array(novelDistances).min(axis=0).tolist())

# Labelled the same way as the summaries printed for the other books
print('Wiki Trained          ', pretrainedMinimums)
for win, minimums in zip(window, novelMinimums):
  print('Novel Trained Window=' + str(win) , minimums)




In [None]:
# Book 5: The Confessions of Arsène Lupin
import numpy as np
import plotly.graph_objects as go
import plotly.offline as pyo
import gensim.models

# For book 5, the antagonists' names are not tokenized in the word vectors, so the
# generic words 'detectives' and 'inspector' are used to refer to them.
classes = ['Gigaword-50', 'Gigaword-100', 'Gigaword-200', 'Gigaword-300']
# NOTE(review): the category is the plural 'detectives' while its label reads
# 'Lupin-detective' — confirm the mismatch is intentional.
categories = ['detectives', 'inspector', 'francs', 'revolver']
categoryLabels = ['Lupin-detective', 'Lupin-inspector', 'Lupin-francs', 'Lupin-revolver']
lupinTokens = ['arsène', 'lupin']

# For each Gigaword model (50-300) and each category word, keep the smaller of its
# cosine distances to the two Lupin tokens. Cosine distance is bounded from [0:2].
pretrainedDistances = [
  [min(wv.distances(c, lupinTokens)) for c in categories]
  for wv in pretrainedVectors
]

# Plots the vector distances using a radar graph.
# Copies are passed because plotFeatures may append a closing point to the rows it
# receives, which would add a duplicate trailing column to the minimums below.
Title = 'Vector Distances for Pretrained "Gigaword" Model'
plotFeatures(categoryLabels, classes, [row[:] for row in pretrainedDistances], Title)

# Per-category minimum over the four pretrained models
pretrainedMinimums = np.array(pretrainedDistances).min(axis=0)

###########

# Location of downloaded Lupin Models
path = '/content/'

dimension = [50, 100, 200, 300]
window    = [2, 3, 5, 10]

novelMinimums = []
novelClasses = ['Novel-50', 'Novel-100', 'Novel-200', 'Novel-300']

# Iterate through the models trained on the Lupin book: one radar graph per window
# size, with one trace per vector dimension.
for w in window:
  novelDistances = []
  for d in dimension:
    G = gensim.models.Word2Vec.load(path + 'model_book5_Vec' + str(d) + '_Window' + str(w))
    wv = G.wv
    novelDistances.append([min(wv.distances(c, lupinTokens)) for c in categories])

  # Plots the vector distances using a radar graph (copies passed, see above).
  Title = 'Vector Distances for Novel Trained Model with Window = ' + str(w)
  plotFeatures(categoryLabels, novelClasses, [row[:] for row in novelDistances], Title)

  # Per-category minimum over the four dimensions for this window size
  novelMinimums.append(np.array(novelDistances).min(axis=0).tolist())

print('Wiki Trained          ', pretrainedMinimums)
for win, minimums in zip(window, novelMinimums):
  print('Novel Trained Window=' + str(win) , minimums)
