In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (walked recursively), then print one path per line.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

In [1]:
# Read the whole corpus into memory. The context manager guarantees the file
# handle is closed even if reading fails, and the explicit encoding keeps the
# decoding of the Sinhala text independent of the platform's default locale.
# NOTE(review): assumes the corpus file is UTF-8 encoded — confirm.
with open(r'/kaggle/input/textdata/sinhala.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Split the text into sentences, then each sentence into word tokens.
# `sent` ends up as a list of token lists, which is the input the Word2Vec
# cells below pass as `sentences`.
# NOTE(review): sent_tokenize/word_tokenize are called with their defaults
# (NLTK's English models) — confirm the splits are acceptable for Sinhala.
sent = [word_tokenize(s) for s in sent_tokenize(text)]

# Sanity check: show the first ten tokenised sentences.
print(sent[:10])

## Skipgram model

In [1]:
# Skip-gram (sg=1) Word2Vec: 300-dimensional vectors, context window of 5.
# min_count=1 means no word is dropped from the vocabulary for being rare.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4) — confirm the installed gensim version before upgrading.
# NOTE(review): per the gensim docs, `seed` alone only gives fully
# reproducible runs together with workers=1 and a fixed PYTHONHASHSEED.
skipgramModel = Word2Vec(sg=1, size=300, window=5, min_count=1, seed=3)

# Stage 1: scan the tokenised sentences once to build the vocabulary.
skipgramModel.build_vocab(sentences=sent, progress_per=1000)

# Stage 2: train for 10 epochs over the same sentences; total_examples is
# the corpus_count that build_vocab recorded on the model.
skipgramModel.train(
    sentences=sent,
    total_examples=skipgramModel.corpus_count,
    epochs=10,
    report_delay=1,
)

In [1]:
# Report the number of vocabulary entries and the model's corpus_count.
# NOTE(review): `wv.vocab` is the gensim 3.x attribute (replaced by
# `wv.key_to_index` in gensim 4) — confirm the installed version.
print("Vocabulary size:", len(skipgramModel.wv.vocab), sep="")
print("Corpus size:", skipgramModel.corpus_count, sep="")

In [1]:
# Look up the embedding the skip-gram model learned for one sample word
# and print the raw vector.
sample_vector = skipgramModel.wv['මහතා']
print(sample_vector)

In [1]:
# Print the length of one word vector. The label previously read "Dimention"
# and ran straight into the number with no separator; both are fixed here.
print("Dimension of a word vector: " + str(len(skipgramModel.wv['මහතා'])))

### Try increasing `min_count` to 2 so that words occurring only once are ignored

In [1]:
# Same skip-gram configuration as the first model, except min_count=2, so
# words that occur fewer than two times are left out of the vocabulary.
# NOTE(review): `size` / `wv.vocab` are gensim 3.x names — confirm the
# installed gensim version before upgrading.
skipgramModel2 = Word2Vec(sg=1, size=300, window=5, min_count=2, seed=3)

# Stage 1: build the (smaller) vocabulary from the tokenised sentences.
skipgramModel2.build_vocab(sentences=sent, progress_per=1000)

# Stage 2: train for 10 epochs over the same sentences.
skipgramModel2.train(
    sentences=sent,
    total_examples=skipgramModel2.corpus_count,
    epochs=10,
    report_delay=1,
)

# Show the resulting vocabulary size for comparison with the min_count=1 model.
print("Vocabulary size:", len(skipgramModel2.wv.vocab), sep="")

### Since the vocabulary size dropped significantly with `min_count=2`, continue to use `min_count=1`

## CBOW model

In [1]:
# CBOW (sg=0) Word2Vec with the same hyper-parameters as the skip-gram
# model: 300-dimensional vectors, window of 5, min_count=1, seed=3.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size`
# in gensim 4) — confirm the installed gensim version before upgrading.
cbowModel = Word2Vec(sg=0, size=300, window=5, min_count=1, seed=3)

# Stage 1: scan the tokenised sentences once to build the vocabulary.
cbowModel.build_vocab(sentences=sent, progress_per=1000)

# Stage 2: train for 10 epochs; total_examples is the corpus_count that
# build_vocab recorded on the model.
cbowModel.train(
    sentences=sent,
    total_examples=cbowModel.corpus_count,
    epochs=10,
    report_delay=1,
)

In [1]:
# Query words used to compare the nearest neighbours produced by each model.
common_words = [
    'ජාතික',
    'ජනතා',
    'එම',
    'අංක',
    'මහතා',
    'කොපමණද',
    'යටතේ',
    'සඳහා',
    'ඊට',
    'ලදී',
]

## Similar words predicted by the skip-gram model

In [1]:
# For every query word, print the skip-gram model's most similar words and
# write the same listing to SkipgramOut.txt.
# The `with` block guarantees the file is flushed and closed when the cell
# finishes (the handle was previously left open), and the explicit encoding
# keeps the Sinhala output independent of the platform's default locale.
with open(r'SkipgramOut.txt', 'w', encoding='utf-8') as skipgramOutFile:
    for w in common_words:
        # The same header goes to the notebook output and to the file.
        header = '\r\n' + w + '------------>\r\n'
        print(header)
        skipgramOutFile.write(header)

        # similar_by_word is called with its default number of neighbours.
        lst = skipgramModel.wv.similar_by_word(word=w)

        # A plain loop replaces the list comprehension that was used only
        # for its print side effect.
        for neighbour in lst:
            print(neighbour)

        # One "x[0] x[1]" line per neighbour, with no trailing newline; the
        # next header starts with one.
        skipgramOutFile.write('\r\n'.join('{} {}'.format(x[0], x[1]) for x in lst))

## Similar words predicted by the CBOW model

In [1]:
# For every query word, print the CBOW model's most similar words and write
# the same listing to CBOWOut.txt.
# The `with` block guarantees the file is flushed and closed when the cell
# finishes (the handle was previously left open), and the explicit encoding
# keeps the Sinhala output independent of the platform's default locale.
with open(r'CBOWOut.txt', 'w', encoding='utf-8') as cbowOutFile:
    for w in common_words:
        # The same header goes to the notebook output and to the file.
        header = '\r\n' + w + '------------>\r\n'
        print(header)
        cbowOutFile.write(header)

        # similar_by_word is called with its default number of neighbours.
        lst = cbowModel.wv.similar_by_word(word=w)

        # A plain loop replaces the list comprehension that was used only
        # for its print side effect.
        for neighbour in lst:
            print(neighbour)

        # One "x[0] x[1]" line per neighbour, with no trailing newline; the
        # next header starts with one.
        cbowOutFile.write('\r\n'.join('{} {}'.format(x[0], x[1]) for x in lst))

### Use pretrained fastText word vectors from Facebook to find words similar to the same set of common words used above
[https://fasttext.cc/docs/en/crawl-vectors.html](https://fasttext.cc/docs/en/crawl-vectors.html)

In [1]:
import fasttext.util
# Fetch the pretrained fastText vectors for language id 'si' (Sinhala).
# NOTE(review): if_exists='ignore' presumably reuses an already-downloaded
# file instead of fetching it again — confirm against the fastText docs.
fasttext.util.download_model(lang_id='si', if_exists='ignore')

In [1]:
# Load the pretrained Sinhala fastText model from the working directory and
# print its nearest neighbours for each of the query words.
ft = fasttext.load_model('cc.si.300.bin')
for query in common_words:
    print("\r\n" + query + "------------>\r\n")
    neighbours = ft.get_nearest_neighbors(word=query)
    for neighbour in neighbours:
        print(neighbour)