In [1]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/share/pkg.7/python3/3.7.7/install/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [25]:
import torch
from marian_mt import MarianMTModel
from transformers import MarianTokenizer, MarianConfig

In [26]:
romance_model_name   = 'Helsinki-NLP/opus-mt-afa-en'
rmt = MarianMTModel.from_pretrained(romance_model_name, output_attentions=True)

In [27]:
romance_tokenizer = MarianTokenizer.from_pretrained(romance_model_name)

In [28]:
#based on the coverage penalty equation found here https://arxiv.org/pdf/1807.11243.pdf
def coverage_penalty(n_heads, n_layers, bs, slen, dec_attention, tlength):
    # dec_attention: n_layers x (bs,n_heads,slen,slen)

    tlen = tlength[:, :, :, 0]  # view of the tlength tensor of sentence lengths, of shape: (n_layers, bs, n_heads)
    t_ones = torch.ones([n_layers, bs, n_heads, slen], dtype=torch.float32)
    t_sum = torch.zeros([n_layers, bs, n_heads, slen], dtype=torch.float32)
    for i in range(n_layers):
        t_sum[i] = torch.sum(dec_attention[i], dim=2)  #sum of the target weights over one src sentence token from all tgt sentence tokens
    t_min = torch.min(t_sum, t_ones)  #min (sum,1)
    t_min = torch.add(t_min, 1e-8) #adding small epsilon to avoid log nans
    t_log = torch.log(t_min)    #calculate the logarithm
    for i in range(bs):   #setting to 0 any positions > original sentence length   (log0 would penalize these o/w)
        lsent = int(tlen[0,i,0])
        t_log[:,i,:,lsent:]=0
    t_sum_log = torch.sum(t_log, dim=3) #sum over all source tokens of src sentence
    covp = - torch.div(t_sum_log, tlen)  # divide by |x|= sentence length
    return covp

In [46]:
import pandas as pd
# import glob

# csv = glob.glob('drive/My Drive/Colab Notebooks/extracted.csv')
# print(csv)
df = pd.read_csv('train.csv')

In [47]:
# df = df[pd.notnull(df['source'])]
# df = df.drop([df.index[4200],df.index[4201]])
# print(df.head)
print(df.columns)
# df = df.truncate(after=99999)
source_sent = df['source'].values
target_sent = df['target'].values
print(source_sent[0])

Index(['source', 'target'], dtype='object')
Kooxda ayaa ka dooday Maajo 2, 2018, EP koodii ugu horreeyay oo cinwaan looga dhigay "I Am" iyo heesta""Latata""



In [48]:
print(len(source_sent))

15376


In [49]:

# print(source_sent[4201]) dropping 4200 4201
print(source_sent[4201])

Dhaqanka Soomaalida (; ) waa hab nololeedka iyo nidaamka dhaqan ee dadka Soomaalida ah; taasi ooy ka mid tahay luuqada, cuntada, suugaanta, fanka, farshaxanta, habaynta iyo soo bandhiga heesaha Soomaalida, ruwaayadaha, majaajilooyinka u dhaqanka ah dadka Soomaalida ah meelkasta ooy joogaan, gaar ahaan inta ku dhaqan Bariga Afrika.



In [50]:
print(target_sent[4201])

The culture of Somalia is an amalgamation of traditions in that were developed independently since the proto-Somali era through interaction with neighboring and far away civilizations, including other parts of Africa, the Arabian Peninsula, and Indian subcontinent.



In [51]:
# source_sent = source_sent[:-1]
# target_sent = target_sent[:-1]
print(len(source_sent),len(target_sent))
# source_sent_u = set(source_sent)
# target_sent_u = set(target_sent)
# print(len(source_sent_u),len(target_sent_u))

15376 15376


In [52]:
from torch.utils.data import Dataset, DataLoader
marian_loader = DataLoader(source_sent,batch_size=24, shuffle=False)

In [53]:
for data in marian_loader:
    for text in data:
        print(text)
    print(len(data))
    break
    

Kooxda ayaa ka dooday Maajo 2, 2018, EP koodii ugu horreeyay oo cinwaan looga dhigay "I Am" iyo heesta""Latata""

Aftida Dastuurka Waxay ka dhacday Soomaaliya 20 Juun 1961 si loogu codeeyo dastuurka cusub ee dalka waxaa soo abuubulay Dhulka Biritishka ee Soomaaliya iyo Dhulka Talyaaniga ee Soomaaliya.

Doorashada Baarlamaanka waxay ka dhacday Soomaaliya 26 Maarso 1969.

Aftida Dastuurka waxay ka dhacday Soomaaliya 25 Agoosto 1979.

Doorashada Baarlamaanka waxay ka dhacday Soomaaliya 31 Diseembar 1984.

2005) File:Real Madrid v Real Sociedad.jpg|Ronaldo (2005) File:Ronaldo.jpeg|Ronaldo (2005) Doorasho madaxtinimo Doorashooyinka Madaxtinimada ee somaliland ayaa laqabtaa Shantii sanaba mar markii ugu danbaysay ee laqabtaa waxa ay ahayd 26 june 2010.waxaa ka qayb galay sadex xisbi qaran iyo lix musharax oo kala ah madaxweyne iyo kuxigeeno waxaana ku guulaystaya axmed maxamed maxamud silaanyo iyo abdtrixmaan cabdilahi saylici oo madaxweyne ku xigeen noqday waxaanay kasoo baxeen xisbiga Kulm

In [54]:
bsz = 24
scores = []
n_heads = 8
n_layers = 6
counter=0
for data in marian_loader:
#     print(data)
#     src_texts = [text for text in data]
#     print(src_texts)
    batch = romance_tokenizer.prepare_seq2seq_batch(data)
    # print(batch)
    outputs = rmt(**batch, return_dict=False)
    dec_attn=[]
    for i in range(n_layers):
        dec_attn += (outputs[i][:,:,:,:].detach().clone(),)
    
    #print(dec_attn[1].shape) 
    len2=batch['input_ids'].shape[1] 
    #print(len2)
#     sentences_list = romance_tokenizer(src_texts,padding=False)['input_ids']
    sentences_list = romance_tokenizer(data,padding=False)['input_ids']
    len1 = [len(item) -1 for item in sentences_list]  
    #print(len1)
    tlength = torch.zeros(6, bsz, 8, len2)
    for i in range(bsz):
        tlength[:, i, :, :] = len1[i]
    #print(tlength)
    cp = coverage_penalty(n_heads, n_layers, bsz, len2, dec_attn, tlength)
    cp = torch.mean(cp.float(), dim=2)
    cpd = cp.numpy()
    for i in range(bsz):
        scores.append(cpd[:,i])
    counter = counter +1
    if(counter%5000==0):
        print(counter)
#     if(counter>520):
#         print(counter)
    #print(cp.shape)  


Token indices sequence length is longer than the specified maximum sequence length for this model (625 > 512). Running this sequence through the model will result in indexing errors


IndexError: list index out of range

In [55]:
import numpy as np
scores = np.asarray(scores)
print(len(scores))
print(scores[0])

15360
[0.4320268  1.0666841  1.3853736  1.1238257  0.49261624 0.47892174]


In [56]:
np.savetxt("somali_scores.csv", scores, delimiter=",")

In [None]:
# d = {'source':source_sent[:97400],'target':target_sent[:97400]}
# df2 = pd.DataFrame(data=d)
# df2.to_csv('smaple.')