In [14]:
# Install the notebook's third-party dependencies.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel, so the packages are importable from the cells below.
%pip install numpy pandas matplotlib nltk



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Load the Russian MWSA file: tab-separated, read with no header row.
mwsa_ru_df = pd.read_csv('data/ru_MWSA.tsv', sep='\t', header=None)
# Column names are assigned by hand; 'unkn' is a placeholder name for the
# third column (its meaning is not established in this notebook).
mwsa_ru_df.columns = ['word', 'pos', 'unkn', 'def_1', 'def_2', 'label']

# Display the last 10 rows.
mwsa_ru_df.tail(10)

Unnamed: 0,word,pos,unkn,def_1,def_2,label
3849,явление,noun,,"|| то, что происходит, случается, имеет место ...","событие, случай.",related
3850,явление,noun,,"событие, случай.","событие, факт.",exact
3851,явление,noun,,"событие, факт.","событие, случай.",exact
3852,явление,noun,,"в пьесе: часть акта, в к-рой состав действующи...",действие по знач. глаг. явиться.,none
3853,явление,noun,,"в пьесе: часть акта, в к-рой состав действующи...","часть акта, действия (в пьесе), в которой сост...",exact
3854,явление,noun,,"часть акта, действия (в пьесе), в которой сост...","в пьесе: часть акта, в к-рой состав действующи...",exact
3855,явление,noun,,"в пьесе: часть акта, в к-рой состав действующи...","внешнее выражение сущности предметов, процессо...",none
3856,явление,noun,,"в пьесе: часть акта, в к-рой состав действующи...","всякое проявление чего-л., каких-л. сил, проце...",none
3857,явление,noun,,"в пьесе: часть акта, в к-рой состав действующи...","|| то, что происходит, случается, имеет место ...",none
3858,явление,noun,,"в пьесе: часть акта, в к-рой состав действующи...","событие, факт.",none


In [4]:
# Load the English MWSA file: tab-separated, read with no header row.
mwsa_en_df = pd.read_csv('data/en_MWSA.tsv', sep='\t', header=None)
# Reuse the column names of the Russian frame, so the cell that builds
# mwsa_ru_df must have been run first.
mwsa_en_df.columns = mwsa_ru_df.columns

# Display the last 10 rows.
mwsa_en_df.tail(10)

Unnamed: 0,word,pos,unkn,def_1,def_2,label
10941,offer,verb,,ask (someone) to marry you,"to bid, as a price, reward, or wages;",none
10942,offer,verb,,ask (someone) to marry you,to put in opposition to; to manifest in an off...,none
10943,offer,verb,,mount or put up,to present itself; to be at hand.,none
10944,offer,verb,,mount or put up,"to present, as an act of worship; to immolate;...",none
10945,offer,verb,,mount or put up,to bring to or before; to hold out to; to pres...,none
10946,offer,verb,,mount or put up,to make an attempt; to make an essay or a tria...,none
10947,offer,verb,,mount or put up,to present in words; to proffer; to make a pro...,none
10948,offer,verb,,mount or put up,to attempt; to undertake.,none
10949,offer,verb,,mount or put up,"to bid, as a price, reward, or wages;",none
10950,offer,verb,,mount or put up,to put in opposition to; to manifest in an off...,none


In [5]:
# Distinct values of the 'word' column in each MWSA frame, kept as sets so
# later cells can do fast membership checks.
ru_words = set(mwsa_ru_df['word'].unique())
en_words = set(mwsa_en_df['word'].unique())

print('Number of russian words:', len(ru_words))
# Fixed label: this line reports the English vocabulary size.
print('Number of english words:', len(en_words))

Number of russian words: 208
Number of russian words: 680


In [6]:
# Display the distinct values of the 'label' column in the Russian MWSA frame.
mwsa_ru_df['label'].unique()

array(['none', 'narrower', 'broader', 'exact', 'related'], dtype=object)

In [7]:
# Load the MCL-WiC ru-ru test file (JSON) into a DataFrame.
test_ru_df = pd.read_json('data/MCL-WiC/test/multilingual/test.ru-ru.data')

# Display the last rows.
test_ru_df.tail()

Unnamed: 0,id,lemma,pos,sentence1,sentence2,start1,end1,start2,end2
995,test.ru-ru.995,заставлять,VERB,"Лидерство – феномен власти, способность одного...",Обычно событие заставляет человека задуматься ...,52,62,15,25
996,test.ru-ru.996,нависать,VERB,"Над Корсаком нависает угроза ареста, и он, пер...",Над дном в виде шатра нависает крыша четвёртог...,13,21,22,30
997,test.ru-ru.997,нависать,VERB,"Над Корсаком нависает угроза ареста, и он, пер...",Над залом нависает гигантский купол с деревянн...,13,21,10,18
998,test.ru-ru.998,давно,VERB,"Кроме того, если язык отделился от праязыка до...",Человеку давно были известны простейшие кузнеч...,55,60,9,14
999,test.ru-ru.999,давно,VERB,"Кроме того, если язык отделился от праязыка до...",Не так давно этот маленький город приносил око...,55,60,7,12


In [8]:
# Load the MCL-WiC en-en test file (JSON) into a DataFrame.
test_en_df = pd.read_json('data/MCL-WiC/test/multilingual/test.en-en.data')

# Display the last rows.
test_en_df.tail()

Unnamed: 0,id,lemma,pos,sentence1,sentence2,start1,end1,start2,end2
995,test.en-en.995,shade,VERB,The young man is holding the parasol in order ...,The name apparently stuck when other members o...,49,54,124,129
996,test.en-en.996,mildness,NOUN,Various explanations have been suggested for t...,"According to G.W.B Huntingford, Iyasu ""owed hi...",102,110,73,81
997,test.en-en.997,mildness,NOUN,Various explanations have been suggested for t...,"The scarcity of reports, due in part to misdia...",102,110,66,74
998,test.en-en.998,superficial,ADJ,"She confronts Toby, finally seeing him for the...",He had difficulty learning Latin and thus got ...,59,70,53,64
999,test.en-en.999,superficial,ADJ,"She confronts Toby, finally seeing him for the...",The tree received some superficial damage on i...,59,70,23,34


In [9]:
# One flag per distinct test lemma: True when the lemma also occurs in the
# Russian MWSA word set.
is_ru_mwsa = [candidate in ru_words for candidate in test_ru_df['lemma'].unique()]

# Fraction of distinct test lemmas that are present in MWSA.
print('For russian:', is_ru_mwsa.count(True) / len(is_ru_mwsa))

For russian: 0.012


In [10]:
# One flag per distinct test lemma: True when the lemma also occurs in the
# English MWSA word set.
is_en_mwsa = [candidate in en_words for candidate in test_en_df['lemma'].unique()]

# Fraction of distinct test lemmas that are present in MWSA.
print('For english:', is_en_mwsa.count(True) / len(is_en_mwsa))

For english: 0.049079754601226995


Convert the dataset to SemEval format

In [15]:
import json
from nltk.tokenize import word_tokenize

In [42]:
# Load the en-en trial file as parsed JSON.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('data/trial/multilingual/trial.en-en.data', 'r', encoding='utf-8') as f:
    data_json = json.load(f)

# Quick look: number of records and the first record. Printed after the
# file is closed, since only the parsed data is needed here.
print(len(data_json))
print(data_json[0])

8
{'id': 'trial.en-en.0', 'lemma': 'recall', 'pos': 'VERB', 'sentence1': 'Members recalled in that connection that the Committee on earlier occasions had remarked that the absence of such cases might stem from a lack of information.', 'sentence2': 'He recalled that his country was a young country.', 'start1': 8, 'end1': 16, 'start2': 3, 'end2': 11}


In [45]:
# Display the second record of the loaded trial data.
data_json[1]

{'id': 'trial.en-en.1',
 'lemma': 'recall',
 'pos': 'VERB',
 'sentence1': 'Members recalled in that connection that the Committee on earlier occasions had remarked that the absence of such cases might stem from a lack of information.',
 'sentence2': 'It recalls that the Court of Roskilde heard the complaint on 19 November 1991 and delivered its reasoned judgement on 5 May 1992.',
 'start1': 8,
 'end1': 16,
 'start2': 3,
 'end2': 10}

In [46]:
def construct_sample(left_context, word, right_context, sample_id, target_lemma, pos):
    """Build the token sequence for one sentence with a single marked target.

    Both contexts are split with ``word_tokenize``; every context token becomes
    the tuple ``(token, '', '', '', False)``. The target is placed between the
    two contexts as ``(word, target_lemma, pos, sample_id, True)`` and is not
    tokenized itself.

    Returns:
        list of 5-tuples: left-context tokens, the target, right-context tokens.
    """
    before = [(piece, '', '', '', False) for piece in word_tokenize(left_context)]
    after = [(piece, '', '', '', False) for piece in word_tokenize(right_context)]

    # Only the target entry carries the lemma, POS, id and the True flag.
    marked_target = (word, target_lemma, pos, sample_id, True)

    return [*before, marked_target, *after]

In [47]:
def parse_wic_sample(wic_sample):
    """Split one WiC record into two single-target samples.

    For each of the record's two sentences, the target span is taken from
    ``start<k>`` / ``end<k>`` (converted with ``int``), the sentence is cut
    into left context, target word and right context, and the pieces are
    passed to ``construct_sample``. The sample id is the record id with
    ``'#1'`` or ``'#2'`` appended.

    Returns:
        A two-element list: the sample for sentence 1, then for sentence 2.
    """
    pair = []
    for which in ('1', '2'):
        begin = int(wic_sample['start' + which])
        stop = int(wic_sample['end' + which])
        tagged_id = wic_sample['id'] + ('#' + which)
        pair.append(
            construct_sample(
                left_context=wic_sample['sentence' + which][:begin],
                word=wic_sample['sentence' + which][begin:stop],
                right_context=wic_sample['sentence' + which][stop:],
                sample_id=tagged_id,
                target_lemma=wic_sample['lemma'],
                pos=wic_sample['pos'],
            )
        )
    return pair

In [48]:
def transform_wic_data(wic_data):
    """Convert an iterable of WiC records into one flat list of samples.

    Each record is passed to ``parse_wic_sample`` and the samples it returns
    are appended in order, so the samples of a record stay adjacent.
    """
    return [
        wsd_sample
        for wic_sample in wic_data
        for wsd_sample in parse_wic_sample(wic_sample)
    ]

In [49]:
# Convert the loaded trial records. Each record yields two samples, so the
# printed length is twice the number of records.
wsd_data = transform_wic_data(data_json)
print(len(wsd_data))

16


In [54]:
# Display one converted sample (index 8) to check the token tuples.
wsd_data[8]

[('In', '', '', '', False),
 ('the', '', '', '', False),
 ('private', '', '', '', False),
 ('sector', 'sector', 'NOUN', 'trial.en-en.4#1', True),
 (',', '', '', '', False),
 ('activities', '', '', '', False),
 ('are', '', '', '', False),
 ('guided', '', '', '', False),
 ('by', '', '', '', False),
 ('the', '', '', '', False),
 ('motive', '', '', '', False),
 ('to', '', '', '', False),
 ('earn', '', '', '', False),
 ('money', '', '', '', False),
 ('.', '', '', '', False)]