# Libraries

In [1]:
from collections import Counter
from gensim.models import Word2Vec, KeyedVectors
import pandas as pd
import nltk
import tqdm

# Downloads

In [2]:
# Fetch only the tokenizer resources used by nltk.sent_tokenize / nltk.word_tokenize.
# Calling nltk.download() with no arguments opens the interactive downloader,
# which blocks a non-interactive "Restart Kernel -> Run All".
# NOTE(review): "punkt" is looked up by older NLTK releases and "punkt_tab" by
# newer ones; an id missing from the index is reported but does not raise.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


# Data Manipulations

In [None]:
# load the entire corpus into memory as one UTF-8 decoded string
with open('../inputs/corpus_all.txt', mode='r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

In [None]:
# report how many characters the corpus contains
total_chars = len(data)
print(f"The length of the whole data: {total_chars}")

# the corpus is too large to process whole, so keep only its leading slice
section = data[0:10001]

# eyeball the selected slice
print(section)

In [None]:
# split the selected section into sentences ...
sentences = nltk.sent_tokenize(section)

# ... then split every sentence into its word tokens
tokenized_sentences = list(map(nltk.word_tokenize, sentences))

In [None]:
# tally how often each token occurs across all tokenized sentences
word_to_frequency = Counter(
    token
    for tokenized_sentence in tokenized_sentences
    for token in tokenized_sentence
)

In [None]:
# display every (word, count) pair from the counter, most frequent first
word_to_frequency.most_common()

[('.', 93),
 ('i', 68),
 ('və', 34),
 ('l', 32),
 ('in', 21),
 ('b', 16),
 ('bir', 15),
 ('amerika', 14),
 ('müsəlman', 14),
 ('ın', 14),
 ('bu', 13),
 ('ı', 13),
 ('o', 12),
 ('inə', 10),
 ('onlar', 9),
 ('onların', 9),
 ('isə', 9),
 ('həm', 9),
 ('də', 9),
 ('d', 9),
 ('k', 9),
 ('mənim', 9),
 ('yeni', 9),
 ('hər', 8),
 ('lər', 8),
 ('lar', 8),
 ('həyat', 7),
 ('a', 7),
 ('ir', 7),
 ('insanlar', 7),
 ('ruhum', 7),
 ('r', 6),
 ('yer', 6),
 ('lə', 6),
 ('böyük', 6),
 ('amerikalı', 6),
 ('ilə', 5),
 ('amerikada', 5),
 ('öz', 5),
 ('onun', 5),
 ('olan', 5),
 ('amerikanın', 5),
 ('ında', 5),
 ('f', 5),
 ('ikdə', 5),
 ('çox', 5),
 ('üçün', 5),
 ('asiya', 5),
 ('birləşmiş', 4),
 ('mən', 4),
 ('görə', 4),
 ('yüksəkl', 4),
 ('id', 4),
 ('şəhər', 4),
 ('əsr', 4),
 ('e', 4),
 ('s', 4),
 ('əsas', 4),
 ('əsrdə', 4),
 ('buraya', 4),
 ('bəziləri', 4),
 ('amerikalılar', 4),
 ('edir', 3),
 ('bütün', 3),
 ('digər', 3),
 ('of', 3),
 ('üzrə', 3),
 ('ruhlu', 3),
 ('müsəlmanlar', 3),
 ('ən', 3),
 ('ə', 3)

In [None]:
# store word frequencies in a txt file, one tab-separated "word<TAB>frequency" pair per line
with open('../outputs/word_frequencies.txt', mode='w', encoding='utf-8') as file:
    # word_to_frequency is the Counter populated from tokenized_sentences
    for word, frequency in word_to_frequency.items():
        file.write(f"{word}\t{frequency}\n")
        

In [None]:
# build and train a Word2Vec model directly from the tokenized sentences
model = Word2Vec(
    tokenized_sentences,
    vector_size=32,  # dimensionality of each word vector
    window=5,        # context window size around the target word
    min_count=1,     # keep every token, even ones seen only once
    workers=12,      # number of training worker threads
)

In [None]:
# persist the trained model to the outputs directory
model.save("../outputs/word2vec.model")

In [54]:
# show the 10 vocabulary entries most similar to the token 'o'
model.wv.most_similar('o', topn=10)

[('iqad', 0.4952297806739807),
 ('yeni', 0.4774841368198395),
 ('ishaq', 0.46629467606544495),
 ('ider', 0.460332989692688),
 ('müsəlmandır', 0.44022542238235474),
 ('the', 0.43287786841392517),
 ('gəz', 0.4305134415626526),
 ('edirəm', 0.4297322630882263),
 ('dearborn', 0.41629061102867126),
 ('əlcəzirə', 0.41326069831848145)]

In [13]:
# analogy-style query: 'ata' and 'qız' contribute positively, 'oğul' negatively; show the top 10 matches
model.wv.most_similar(positive=['ata', 'qız'], negative=['oğul'], topn=10)

[('çarpazlayib', 0.7706415057182312),
 ('cavanların', 0.7612033486366272),
 ('qızlar', 0.7561378479003906),
 ('oğlan', 0.7469136714935303),
 ('qadın', 0.741458535194397),
 ('arvadların', 0.7367134690284729),
 ('gözününqabağına', 0.7341235876083374),
 ('balaca', 0.7317864894866943),
 ('gəlinin', 0.7315199971199036),
 ('qızın', 0.729299783706665)]

In [19]:
# analogy-style query: 'kitablar' and 'ev' contribute positively, 'kitab' negatively; show the top 10 matches
model.wv.most_similar(positive=['kitablar', 'ev'], negative=['kitab'],topn=10)

[('mənzilləri', 0.8121401071548462),
 ('evlər', 0.7882879376411438),
 ('heyvanları', 0.781944751739502),
 ('mağaza', 0.7659145593643188),
 ('malqara', 0.7541760802268982),
 ('körpələr', 0.7480019927024841),
 ('eveşik', 0.7466618418693542),
 ('mənzillər', 0.7448297739028931),
 ('restoran', 0.7447935938835144),
 ('süfrə', 0.7425639629364014)]

In [76]:
# load a previously saved Word2Vec model from the current working directory
# NOTE(review): this path differs from the "../outputs/word2vec.model" file this
# notebook saves — confirm where word2vec_AZ_K.model comes from and what it was trained on
model1 = Word2Vec.load("word2vec_AZ_K.model")


In [77]:
# run the 'ata' + 'qız' - 'oğul' query against the loaded model1; show the top 10 matches
model1.wv.most_similar(positive=['ata', 'qız'], negative=['oğul'],topn=10)

[('oğlan', 0.8091729283332825),
 ('qadın', 0.7986456155776978),
 ('kişi', 0.7945387959480286),
 ('balaca', 0.7855933904647827),
 ('cavan', 0.7789908051490784),
 ('uşağı', 0.7773816585540771),
 ('mələk', 0.7757386565208435),
 ('qızlar', 0.7734727263450623),
 ('arvadı', 0.7717491984367371),
 ('oğlanın', 0.7637009620666504)]