In [None]:
# Mount Google Drive at /content/drive so the files used by this notebook
# are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import drive
import os
# Make the project folder on the mounted Drive the working directory, so the
# relative paths used by the rest of the notebook (data/, vecmap/, *.emb, ...)
# resolve against it; then list its contents as a sanity check.
os.chdir("/content/drive/My Drive/Colab Notebooks/IR")
os.listdir()

['hi-en.dict',
 'vecmap',
 'data',
 'wiki.hi.align.vec',
 'wiki.en.align.vec',
 'spa-en',
 'hi-en.full.dict',
 'hi-en-small-emb',
 'hi_mapped.emb',
 'fasttext_experiments.ipynb',
 'en_mapped.emb']

In [None]:
# References:
#   vecmap (cross-lingual word embedding mappings): https://github.com/artetxem/vecmap
#   fastText pre-trained word vectors: https://fasttext.cc/docs/en/crawl-vectors.html

# SPANISH-ENGLISH MODEL

In [None]:
import gensim
from gensim.models import Word2Vec, KeyedVectors
import spacy

In [None]:
# Download the small English spaCy pipeline that spacy.load('en_core_web_sm')
# expects to find in this runtime.
!python -m spacy download en_core_web_sm

In [None]:
# Shared spaCy pipeline used by loadfile() for tokenisation. loadfile() only
# reads token.text and token.is_punct, so the NER, tagger and parser
# components are disabled.
nlp = spacy.load('en_core_web_sm', disable = ['ner', 'tagger', 'parser'])

In [None]:

def loadfile(filename, tokenizer=None, encoding='utf-8'):
  """Read a one-sentence-per-line text file into lists of lower-cased tokens.

  Args:
    filename: Path of the text file to read.
    tokenizer: Callable that takes a string and returns an iterable of
      token objects exposing ``text``, ``is_punct`` and ``is_space``.
      Defaults to the notebook-level spaCy pipeline ``nlp``.
    encoding: Text encoding used to open the file. Defaults to UTF-8 so
      that accented characters are decoded the same way on every machine.

  Returns:
    A list with one entry per input line, in file order. Each entry is the
    list of token strings of that line with punctuation and whitespace
    tokens removed; a blank line yields an empty list.
  """
  if tokenizer is None:
    # Resolved at call time so the function still uses the shared pipeline
    # when no explicit tokenizer is given.
    tokenizer = nlp
  collection = []
  with open(filename, 'r', encoding=encoding) as infile:
    for line in infile:
      # Drop the trailing newline before tokenising: otherwise the tokenizer
      # emits it as a whitespace token, which is not punctuation and would
      # end up in the training vocabulary as a "word".
      text = line.strip().lower()
      collection.append([
          token.text
          for token in tokenizer(text)
          if not (token.is_punct or token.is_space)
      ])
  return collection

In [None]:
# Skip-gram (sg=1) hyperparameters shared by the English and Spanish models.
# They are kept in one place so the two embedding spaces are always trained
# with identical settings and cannot drift apart when one call is edited.
# NOTE: `size` is the gensim 3.x keyword; gensim 4 renamed it to `vector_size`.
W2V_PARAMS = dict(size=100, window=5, min_count=2, sg=1, negative=8)

# Tokenise both monolingual corpora (one sentence per line).
ensents = loadfile('data/eng.txt')
essents = loadfile('data/spa.txt')

# Train each model and export its vectors in word2vec text format, which is
# the input format passed to vecmap's map_embeddings.py later on.
enmodel = Word2Vec(sentences=ensents, **W2V_PARAMS)
enmodel.wv.save_word2vec_format('eng.w2v.model')
esmodel = Word2Vec(sentences=essents, **W2V_PARAMS)
esmodel.wv.save_word2vec_format('spa.w2v.model')

In [None]:
# Unpack the zipped word2vec-format models; each archive is extracted into a
# directory given by -d.
# NOTE(review): those directory names ('spa.w2v.model', 'eng.w2v.model') are
# the same paths that save_word2vec_format() writes as plain files, and the
# same paths map_embeddings.py is later given as input files. Confirm which of
# the two (file or directory) is meant to exist on a fresh top-to-bottom run.
!unzip spa.w2v.model.zip -d spa.w2v.model
!unzip eng.w2v.model.zip -d eng.w2v.model

Archive:  spa.w2v.model.zip
  inflating: spa.w2v.model/spa.w2v.model  
   creating: spa.w2v.model/__MACOSX/
  inflating: spa.w2v.model/__MACOSX/._spa.w2v.model  
Archive:  eng.w2v.model.zip
  inflating: eng.w2v.model/eng.w2v.model  
   creating: eng.w2v.model/__MACOSX/
  inflating: eng.w2v.model/__MACOSX/._eng.w2v.model  


In [None]:
# Learn a supervised vecmap mapping using data/es-en.train.txt as the seed
# dictionary. Positional arguments: source (Spanish) embeddings, target
# (English) embeddings, then the two output files for the mapped embeddings.
!python3 vecmap/map_embeddings.py --supervised data/es-en.train.txt spa.w2v.model eng.w2v.model spa_mapped.emb eng_mapped.emb

<_io.TextIOWrapper name='spa.w2v.model' mode='r' encoding='utf-8'>
<_io.TextIOWrapper name='eng.w2v.model' mode='r' encoding='utf-8'>


In [None]:
# Load the two mapped embedding files (word2vec text format).
# spa2eng holds the mapped Spanish vectors, eng2spa the mapped English vectors.
spa2eng = gensim.models.KeyedVectors.load_word2vec_format('spa_mapped.emb')
eng2spa = gensim.models.KeyedVectors.load_word2vec_format('eng_mapped.emb')

# Spot-check the mapping: for each Spanish probe word, look up its mapped
# vector and print the nearest English word with its cosine similarity.
for word in ['playa', 'villa', 'perros', 'naufragio', 'islas', 'cantar', 'calles', 'naranjas', 'bomberos', 'escalera', 'nadó','frontera', 'pasaporte', 'fábrica']:
    if word not in spa2eng:
        # A probe word missing from the vocabulary would raise KeyError and
        # abort the whole loop; report it and keep going instead.
        print ("%s <not in vocabulary>" % word)
        continue
    # Only the best match is displayed, so request a single neighbour.
    print ("%s %s" % (word, eng2spa.similar_by_vector(spa2eng[word], topn=1)[0]))

playa ('beach', 0.8319563865661621)
villa ('village', 0.6976901292800903)
perros ('dogs', 0.861842930316925)
naufragio ('shipwreck', 0.7925400733947754)
islas ('islands', 0.8338174819946289)
cantar ('sing', 0.8648631572723389)
calles ('streets', 0.8817466497421265)
naranjas ('onion', 0.7858734130859375)
bomberos ('firefighters', 0.7856603860855103)
escalera ('roof', 0.7830263376235962)
nadó ('talgar', 0.7015026807785034)
frontera ('border', 0.9006616473197937)
pasaporte ('passport', 0.8105301856994629)
fábrica ('factory', 0.8499798774719238)


In [None]:
# Evaluate dictionary induction with vecmap's eval_translation.py using CSLS
# retrieval on the GPU (--cuda).
# NOTE(review): the dictionary passed with -d is data/es-en.train.txt, the same
# file given to map_embeddings.py as the --supervised seed dictionary, so the
# accuracy below is measured on training pairs. Use a held-out dictionary for
# an unbiased estimate.
# NOTE(review): the embeddings are read from spa-en/, while the mapping cell
# writes spa_mapped.emb / eng_mapped.emb to the working directory. Confirm
# these are the same files.
!python3 vecmap/eval_translation.py spa-en/spa_mapped.emb spa-en/eng_mapped.emb -d data/es-en.train.txt --retrieval csls --cuda

Coverage: 99.89%  Accuracy: 58.77%


# HINDI-ENGLISH MODEL

In [None]:
# Extract Archive.zip into embmodels/.
# NOTE(review): the map_embeddings.py call for Hindi-English reads
# wiki.hi.align.vec and wiki.en.align.vec from the working directory, not from
# embmodels/. Confirm which copies are intended.
!unzip Archive.zip -d embmodels

Archive:  Archive.zip
  inflating: embmodels/wiki.en.align.vec  
   creating: embmodels/__MACOSX/
  inflating: embmodels/__MACOSX/._wiki.en.align.vec  
  inflating: embmodels/wiki.hi.align.vec  
  inflating: embmodels/__MACOSX/._wiki.hi.align.vec  


In [None]:
# Learn a semi-supervised vecmap mapping seeded with hi-en.full.dict, running
# on the GPU (--cuda). Positional arguments: source (Hindi) embeddings, target
# (English) embeddings, then the two output files for the mapped embeddings.
!python3 vecmap/map_embeddings.py --semi_supervised hi-en.full.dict wiki.hi.align.vec wiki.en.align.vec hi_mapped.emb en_mapped.emb --cuda

<_io.TextIOWrapper name='wiki.hi.align.vec' mode='r' encoding='utf-8'>
<_io.TextIOWrapper name='wiki.en.align.vec' mode='r' encoding='utf-8'>
tcmalloc: large alloc 3023249408 bytes == 0x563ad553c000 @  0x7f67a00e81e7 0x7f679db2846e 0x7f679db78c7b 0x7f679db7935f 0x7f679dc1b103 0x563ac7077544 0x563ac7077240 0x563ac70eb627 0x563ac70e59ee 0x563ac7078bda 0x563ac70e7737 0x563ac70e5ced 0x563ac7078bda 0x563ac70e6915 0x563ac70e59ee 0x563ac70e56f3 0x563ac71af4c2 0x563ac71af83d 0x563ac71af6e6 0x563ac7187163 0x563ac7186e0c 0x7f679eed2bf7 0x563ac7186cea
tcmalloc: large alloc 3023249408 bytes == 0x563b8f362000 @  0x7f67a00e81e7 0x7f679db2846e 0x7f679db78c7b 0x7f679db7935f 0x7f679dc1b103 0x563ac6fb82eb 0x7f678f0a6a1c 0x7f678f0f231c 0x7f678f0aa16e 0x563ac7077544 0x563ac7077240 0x563ac70eb627 0x563ac70e59ee 0x563ac7078bda 0x563ac70ead00 0x563ac7078afa 0x563ac70e6915 0x563ac7078afa 0x563ac70ead00 0x563ac70e5ced 0x563ac7078bda 0x563ac70e6915 0x563ac70e59ee 0x563ac70e56f3 0x563ac71af4c2 0x563ac71af83d 0x5

In [None]:
from gensim.models import KeyedVectors
from gensim import models
import gensim

# Load the two mapped embedding files (word2vec text format).
# hin2eng holds the mapped Hindi vectors, eng2hin the mapped English vectors.
hin2eng = gensim.models.KeyedVectors.load_word2vec_format('hi_mapped.emb')
eng2hin = gensim.models.KeyedVectors.load_word2vec_format('en_mapped.emb')

# Spot-check the mapping: for each Hindi probe word, look up its mapped
# vector and print the nearest English word with its cosine similarity.
for word in ['लाल', 'गैरी', 'मकान', 'मंदिर', 'सागरतट', 'गाना', 'सेब']:
    if word not in hin2eng:
        # A probe word missing from the vocabulary would raise KeyError and
        # abort the whole loop; report it and keep going instead.
        print ("%s <not in vocabulary>" % word)
        continue
    # Only the best match is displayed, so request a single neighbour.
    print ("%s %s" % (word, eng2hin.similar_by_vector(hin2eng[word], topn=1)[0]))

लाल ('red', 0.6013443470001221)
गैरी ('mcnaney', 0.5269891619682312)
मकान ('cottage', 0.5818758010864258)
मंदिर ('temple', 0.6957459449768066)
सागरतट ('coastline', 0.5641415119171143)
गाना ('song', 0.7094157934188843)
सेब ('cherries', 0.4921698570251465)
