In [1]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import word2vec
import gensim.downloader as api

In [None]:
word_vectors = api.load('glove-wiki-gigaword-50')

In [2]:
word_vectors.most_similar('abuse', topn = 25)

[('sexual', 0.8234277367591858),
 ('harassment', 0.8090097904205322),
 ('abuses', 0.7943168878555298),
 ('sex', 0.7941537499427795),
 ('criminal', 0.7893994450569153),
 ('rape', 0.7847622632980347),
 ('cases', 0.782743513584137),
 ('misconduct', 0.7807194590568542),
 ('torture', 0.7727799415588379),
 ('crime', 0.7662270069122314),
 ('neglect', 0.762278139591217),
 ('discrimination', 0.7608848810195923),
 ('alleged', 0.7530607581138611),
 ('allegations', 0.7488146424293518),
 ('serious', 0.744958221912384),
 ('charges', 0.7440094947814941),
 ('mistreatment', 0.741011381149292),
 ('dealing', 0.7386057376861572),
 ('victim', 0.7379998564720154),
 ('stemming', 0.7320647239685059),
 ('crimes', 0.7304348349571228),
 ('abused', 0.7293891906738281),
 ('complaints', 0.7288611531257629),
 ('child', 0.7285066843032837),
 ('corruption', 0.7261059284210205)]

In [3]:
word_vectors.most_similar('physical', topn = 25)

[('mental', 0.8731808066368103),
 ('psychological', 0.8420647978782654),
 ('experience', 0.8173168301582336),
 ('lack', 0.8085145354270935),
 ('stress', 0.7857223153114319),
 ('knowledge', 0.7729593515396118),
 ('certain', 0.7690230011940002),
 ('skill', 0.7685284614562988),
 ('skills', 0.760212779045105),
 ('abilities', 0.7577658295631409),
 ('learning', 0.7564065456390381),
 ('minimal', 0.7552646994590759),
 ('and/or', 0.7535192966461182),
 ('particular', 0.7467431426048279),
 ('emotional', 0.7463260293006897),
 ('serious', 0.7413700819015503),
 ('proper', 0.7393556833267212),
 ('ability', 0.7382082939147949),
 ('visual', 0.7366421222686768),
 ('trauma', 0.7357707023620605),
 ('basic', 0.7356463074684143),
 ('quality', 0.7337371706962585),
 ('treatment', 0.732905387878418),
 ('reasons', 0.7316986918449402),
 ('aspects', 0.7313452959060669)]

In [4]:
word_vectors.most_similar('depression', topn = 25)

[('onset', 0.7914988398551941),
 ('severe', 0.7839459776878357),
 ('illness', 0.7577464580535889),
 ('alcoholism', 0.7406427264213562),
 ('fever', 0.7339359521865845),
 ('debilitating', 0.7328546643257141),
 ('anxiety', 0.7284860014915466),
 ('symptoms', 0.7284563779830933),
 ('respiratory', 0.7220224738121033),
 ('experiencing', 0.7218852639198303),
 ('complications', 0.7164477705955505),
 ('suffering', 0.7126669883728027),
 ('caused', 0.7055109739303589),
 ('pneumonia', 0.7045726776123047),
 ('acute', 0.7023511528968811),
 ('chronic', 0.6988701820373535),
 ('causes', 0.6948482394218445),
 ('schizophrenia', 0.6914389133453369),
 ('suffer', 0.6903968453407288),
 ('worsened', 0.6852149963378906),
 ('dementia', 0.6822355389595032),
 ('postpartum', 0.6819018125534058),
 ('childhood', 0.6802726984024048),
 ('disorder', 0.677268922328949),
 ('illnesses', 0.6771678924560547)]

## Experimenting the "glove.6B.50d.txt"

1. Download the files from here: https://github.com/stanfordnlp/GloVe?tab=readme-ov-file#download-pre-trained-word-vectors
    - Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): [glove.42B.300d.zip](https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip) [[mirror](https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip)]
    - Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): [glove.840B.300d.zip](https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip) [[mirror](https://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip)]
    - Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download): [glove.6B.zip](https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip) [[mirror](https://nlp.stanford.edu/data/wordvecs/glove.6B.zip)]
    - Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB download): [glove.twitter.27B.zip](https://huggingface.co/stanfordnlp/glove/resolve/main/glove.twitter.27B.zip) [[mirror](https://nlp.stanford.edu/data/wordvecs/glove.twitter.27B.zip)]
1. If you choose to download the `Wikipedia 2014 + Gigaword 5` model, extract the files in folder named data.
1. IF you use `KeyedVectors.load_word2vec_format('glove.6B.50d.txt', binary=False)` it will give you error.
```
    ValueError: invalid literal for int() with base 10: 'the'
```
- To hack this, you need to add the `400000 50` to the very first **line** of the text file, since it has 400k vocab and 50 dimension vectors.

In [3]:
def insert_line_to_test(text_file, insert_line):
    with open(text_file, 'r+') as file:
        first_line = file.readline()
        if insert_line not in first_line:
            file.seek(0, 0) 
            file_data = file.read()
            file.seek(0, 0)  
            file.write(insert_line + '\n' + file_data) 

In [4]:
# for "glove.6B.50d" file:
num_vocab = 400000
dimension = 50
text_file = 'data/glove.6B.50d_modified.txt'
insert_line = f'{num_vocab} {dimension}'
insert_line_to_test(text_file, insert_line)

In [6]:
word_vectors = KeyedVectors.load_word2vec_format('data/glove.6B.50d_modified.txt', binary=False)

In [10]:
word_vectors.most_similar('abuse', topn = 25)

[('sexual', 0.8234277367591858),
 ('harassment', 0.8090097904205322),
 ('abuses', 0.7943168878555298),
 ('sex', 0.7941537499427795),
 ('criminal', 0.7893994450569153),
 ('rape', 0.7847622632980347),
 ('cases', 0.782743513584137),
 ('misconduct', 0.7807194590568542),
 ('torture', 0.7727799415588379),
 ('crime', 0.7662270069122314),
 ('neglect', 0.762278139591217),
 ('discrimination', 0.7608848810195923),
 ('alleged', 0.7530607581138611),
 ('allegations', 0.7488146424293518),
 ('serious', 0.744958221912384),
 ('charges', 0.7440094947814941),
 ('mistreatment', 0.741011381149292),
 ('dealing', 0.7386057376861572),
 ('victim', 0.7379998564720154),
 ('stemming', 0.7320647239685059),
 ('crimes', 0.7304348349571228),
 ('abused', 0.7293891906738281),
 ('complaints', 0.7288611531257629),
 ('child', 0.7285066843032837),
 ('corruption', 0.7261059284210205)]

In [15]:
word_vectors.most_similar('depression', topn = 25)

[('onset', 0.7914988398551941),
 ('severe', 0.7839459776878357),
 ('illness', 0.7577464580535889),
 ('alcoholism', 0.7406427264213562),
 ('fever', 0.7339359521865845),
 ('debilitating', 0.7328546643257141),
 ('anxiety', 0.7284860014915466),
 ('symptoms', 0.7284563779830933),
 ('respiratory', 0.7220224738121033),
 ('experiencing', 0.7218852639198303),
 ('complications', 0.7164477705955505),
 ('suffering', 0.7126669883728027),
 ('caused', 0.7055109739303589),
 ('pneumonia', 0.7045726776123047),
 ('acute', 0.7023511528968811),
 ('chronic', 0.6988701820373535),
 ('causes', 0.6948482394218445),
 ('schizophrenia', 0.6914389133453369),
 ('suffer', 0.6903968453407288),
 ('worsened', 0.6852149963378906),
 ('dementia', 0.6822355389595032),
 ('postpartum', 0.6819018125534058),
 ('childhood', 0.6802726984024048),
 ('disorder', 0.677268922328949),
 ('illnesses', 0.6771678924560547)]

In [5]:
# for "glove.6B.100d" file:
num_vocab = 400000
dimension = 100
text_file = 'data/glove.6B.100d_modified.txt'
insert_line = f'{num_vocab} {dimension}'
insert_line_to_test(text_file, insert_line)

In [10]:
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import word2vec

import numpy as np

def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File,'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

gloveModel = load_glove_model('/home/shahab/Documents/VU/NIT6003/NLP/word2vec/glove.6B.50d.txt')
print(gloveModel['hello'])
# gloveModel.most_similar('depression', topn = 25)


Loading Glove Model
400001 words loaded!
[-0.38497   0.80092   0.064106 -0.28355  -0.026759 -0.34532  -0.64253
 -0.11729  -0.33257   0.55243  -0.087813  0.9035    0.47102   0.56657
  0.6985   -0.35229  -0.86542   0.90573   0.03576  -0.071705 -0.12327
  0.54923   0.47005   0.35572   1.2611   -0.67581  -0.94983   0.68666
  0.3871   -1.3492    0.63512   0.46416  -0.48814   0.83827  -0.9246
 -0.33722   0.53741  -1.0616   -0.081403 -0.67111   0.30923  -0.3923
 -0.55002  -0.68827   0.58049  -0.11626   0.013139 -0.57654   0.048833
  0.67204 ]
