# 1. Word2Vec Model

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so the
# project files stored on Drive can be reached with ordinary paths.
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [3]:
cd /content/drive/MyDrive/FrenchWE/Final_code

/content/drive/MyDrive/FrenchWE/Final_code


In [None]:
import os
import gensim
#/content/drive/MyDrive/FrenchWE/Final_code/utils.py
from utils import *

import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

## 1.1. Train Word2Vec Model

In [None]:
!wget https://objectstore.e2enetworks.net/ai4b-public-nlu-nlg/v1-indiccorp/ta.txt

--2023-11-29 17:00:07--  https://objectstore.e2enetworks.net/ai4b-public-nlu-nlg/v1-indiccorp/ta.txt
Resolving objectstore.e2enetworks.net (objectstore.e2enetworks.net)... 101.53.152.30, 101.53.136.19, 164.52.206.155, ...
Connecting to objectstore.e2enetworks.net (objectstore.e2enetworks.net)|101.53.152.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11877039783 (11G) [text/plain]
Saving to: ‘ta.txt.1’

ta.txt.1              2%[                    ] 229.20M  21.0MB/s    eta 9m 46s ^C


### Tamil

In [None]:
# Build the Tamil training corpus as a MySentences object pointed at ./Tamil/.
# MySentences is not defined in this notebook; it presumably comes from the
# star-import of utils and streams sentences from disk rather than loading
# everything into memory - confirm in utils.py.
text_w2t = MySentences(dirname='./Tamil/')

In [None]:
# Train the Tamil Word2Vec model: 300-dimensional vectors, CBOW (sg=0),
# hierarchical softmax enabled (hs=1); every other setting is gensim's default.
model_w2t = gensim.models.Word2Vec(sentences=text_w2t, vector_size=300, sg=0, hs=1)

# makedirs(exist_ok=True) creates the output folder only when it is missing,
# replacing the separate exists()/mkdir() pair, and uses the same path
# spelling as the save call below.
os.makedirs('WE_models', exist_ok=True)
model_w2t.save('WE_models/w2t_cbow_300D')

### Punjabi

In [None]:
# Build the Punjabi training corpus as a MySentences object pointed at ./Punjabi/.
# MySentences is not defined in this notebook; it presumably comes from the
# star-import of utils and streams sentences from disk rather than loading
# everything into memory - confirm in utils.py.
text_w2p = MySentences(dirname='./Punjabi/')

In [None]:
# Train the Punjabi Word2Vec model: 300-dimensional vectors, skip-gram (sg=1),
# hierarchical softmax enabled (hs=1); every other setting is gensim's default.
model_w2p = gensim.models.Word2Vec(sentences=text_w2p, vector_size=300, sg=1, hs=1)

# makedirs(exist_ok=True) creates the output folder only when it is missing,
# replacing the separate exists()/mkdir() pair, and uses the same path
# spelling as the save call below.
os.makedirs('WE_models', exist_ok=True)
model_w2p.save('WE_models/w2p_sg_300D')



## 1.2. Load Word2Vec Model

### - Tamil Monolingual

In [None]:
# Reload the Tamil CBOW model from the path the Tamil training cell saves to.
# NOTE: the name model_w2v is reassigned by the Punjabi loading cell further
# down, so the similarity queries depend on which load cell ran last.
model_w2v = gensim.models.Word2Vec.load('WE_models/w2t_cbow_300D')

- #### Similar Words

In [None]:
# Nearest neighbours of the Tamil word 'மனைவி' ("wife") in the embedding space.
model_w2v.wv.most_similar('மனைவி')

### - Punjabi Monolingual

In [None]:
# Load the Punjabi model. The Punjabi training cell saves a skip-gram model
# as 'WE_models/w2p_sg_300D'; the previous 'w2p_cbow_300D' path is never
# written by this notebook, so loading it failed on a fresh run.
model_w2v = gensim.models.Word2Vec.load('WE_models/w2p_sg_300D')

# Export the vectors in word2vec text format under the Punjabi model's own
# name; the previous target 'w2t_cbow_300D.txt' labelled them as the Tamil
# CBOW vectors.
# NOTE(review): if another script expects the old file name, update it too.
model_w2v.wv.save_word2vec_format('w2p_sg_300D.txt', binary=False)

- #### Similar Words

In [None]:
# most_similar needs a key that exists in the vocabulary; the empty string
# used before is not a word, so the lookup could not succeed. 'ਨਦੀ' is the
# Punjabi query word that appears in the recorded evaluation output of this
# notebook - swap in any other in-vocabulary Punjabi word as needed.
model_w2v.wv.most_similar('ਨਦੀ')

### Bilingual Model Training & Evaluation

In [None]:
# Run bilingual_code/map_embeddings.py with train_p2t.txt given to
# --acl2017_seed, two input embedding paths (Punjabi, then Tamil), the two
# output names SRC_MAPPED.emb / TRG_MAPPED.emb, and --cuda.
# NOTE(review): the inputs are read from .../Final_code/model/, whereas the
# training cells in this notebook save to WE_models/, and the Tamil model is
# saved there as w2t_cbow_300D (CBOW), not w2t_sg_300D - confirm these files
# exist and are in the format the script expects.
!python3 bilingual_code/map_embeddings.py --acl2017_seed train_p2t.txt /content/drive/MyDrive/FrenchWE/Final_code/model/w2p_sg_300D /content/drive/MyDrive/FrenchWE/Final_code/model/w2t_sg_300D SRC_MAPPED.emb TRG_MAPPED.emb --cuda

In [9]:
# Run the evaluation script as a plain Python program. The recorded output
# shows one Punjabi query word followed by five ranked Tamil candidates.
!python3 bilingual_code/eval_translation.py

ਨਦੀ
1.ஆற்றில்
2.கரையில்
3.கால்வாய்
4.கால்வாயில்
5.அணைக்கட்டிலிருந்து


In [10]:
!pip install streamlit

Installing collected packages: watchdog, validators, smmap, pydeck, gitdb, gitpython, streamlit
Successfully installed gitdb-4.0.11 gitpython-3.1.40 pydeck-0.8.1b0 smmap-5.0.1 streamlit-1.28.2 validators-0.22.0 watchdog-3.0.0


In [12]:
# Serve the evaluation script as a Streamlit app. In the recorded run the
# cell was stopped by hand (the output ends with "Stopping..." and ^C), so
# expect it to keep running until interrupted.
!streamlit run bilingual_code/eval_translation.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.224.103.160:8501[0m
[0m
[34m  Stopping...[0m
^C


In [13]:
import numpy as np
from scipy.stats import t

def simple_t_test(data, mu0, alpha, method = "two_sided"):
    """Run a one-sample Student's t-test of H0: mean == mu0 via a critical value.

    The test statistic is ``(mean - mu0) / (s / sqrt(n))`` where ``s`` is the
    sample standard deviation (``ddof=1``), compared against a quantile of the
    t distribution with ``n - 1`` degrees of freedom. A short summary is
    printed and the decision is returned.

    Parameters
    ----------
    data : array-like
        Observations. Any shape is accepted; all elements are used.
    mu0 : float
        Mean under the null hypothesis.
    alpha : float
        Significance level of the test.
    method : str, optional
        ``"left"`` for H1: mean < mu0, ``"right"`` for H1: mean > mu0.
        Any other value (default ``"two_sided"``) runs the two-sided test.

    Returns
    -------
    bool
        True when H0 is rejected at level ``alpha``.

    Raises
    ------
    ValueError
        If fewer than two observations are supplied (the sample standard
        deviation and the degrees of freedom are undefined).
    """
    # Accept lists/tuples as well as arrays; .size is only defined on arrays.
    values = np.asarray(data, dtype=float)
    n = values.size
    if n < 2:
        raise ValueError("simple_t_test needs at least two observations")

    sample_mean = np.mean(values)
    # ddof=1: the t statistic is defined with the sample (n - 1) standard
    # deviation; NumPy's default ddof=0 would overstate the statistic.
    sample_std = np.std(values, ddof=1)

    empirical_t = (sample_mean - mu0) / (sample_std / np.sqrt(n))

    df = n - 1

    if method == "left":
        # One-sided test: the whole of alpha sits in the lower tail.
        critical_value = t.ppf(alpha, df = df)
        reject = bool(empirical_t < critical_value)
    elif method == "right":
        # One-sided test: the whole of alpha sits in the upper tail.
        critical_value = t.ppf(1 - alpha, df = df)
        reject = bool(empirical_t > critical_value)
    else:
        # Two-sided test: alpha is split across both tails. The lower
        # quantile is kept as the reported critical value and compared in
        # absolute terms.
        critical_value = t.ppf(alpha / 2, df = df)
        reject = bool(np.abs(empirical_t) > np.abs(critical_value))

    print("Significance level:", alpha)
    print("Degrees of freedom:", df)
    print("Test statistic:", round(empirical_t, 4))
    print("Critical value:", round(critical_value, 4))
    print("Reject H0:", reject)

    return reject

In [None]:
# Two-sided test (the default method) on sample_weights against mu_0 at level alpha.
# NOTE(review): sample_weights, mu_0 and alpha are not defined in any visible
# cell of this notebook - define them before this call or it will fail on a
# fresh kernel.
simple_t_test(sample_weights, mu_0, alpha)