In [1]:
import pandas as pd

In [2]:
# List the working directory to see which SNLI files are available next to the notebook.
%ls

Data Analysis.ipynb   snli_1.0_dev.txt      snli_1.0_train.jsonl
description.pdf       snli_1.0_test.jsonl   snli_1.0_train.txt
snli_1.0_dev.jsonl    snli_1.0_test.txt


Загрузим данные. Известно, что выборки содержат следующее количество объектов:
* Обучающая (train) - 550 152
* Валидационная (dev) - 10 000
* Тестовая (test) - 10 000

In [3]:
# Every split is a JSON Lines file (one JSON record per line), hence lines=True.
# The files are read in train -> dev -> test order.
df_train, df_dev, df_test = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in (
        'snli_1.0_train.jsonl',
        'snli_1.0_dev.jsonl',
        'snli_1.0_test.jsonl',
    )
)

# Анализ данных

Целевая метка — **gold_label**. Она принимает одно из трех значений:
1. entailment (из параграфа 1 следует параграф 2)
2. contradiction (параграф 1 противоречит параграфу 2)
3. neutral (параграф 2 не следует из параграфа 1, но и не противоречит ему)

Значение «-» в gold_label означает, что эксперты не пришли к согласию; такие объекты не относятся ни к одному из трех классов.

Датасет был размечен вручную: каждая пара параграфов оценивалась независимыми экспертами (от одного до пяти человек на пару). Оценки отдельных экспертов хранятся списком в столбце annotator_labels, а столбец gold_label содержит обобщенную оценку (majority voting). Помимо этого, в датасете можно найти синтаксический разбор каждого параграфа (столбцы *_parse и *_binary_parse), что может оказаться очень полезным при подготовке признаков.

Взглянем на признаки исследуемых объектов. Как видим, все три выборки содержат один и тот же набор признаков:

In [4]:
# Column names of the training split.
df_train.columns

Index(['annotator_labels', 'captionID', 'gold_label', 'pairID', 'sentence1',
       'sentence1_binary_parse', 'sentence1_parse', 'sentence2',
       'sentence2_binary_parse', 'sentence2_parse'],
      dtype='object')

In [5]:
# Column names of the validation (dev) split.
df_dev.columns

Index(['annotator_labels', 'captionID', 'gold_label', 'pairID', 'sentence1',
       'sentence1_binary_parse', 'sentence1_parse', 'sentence2',
       'sentence2_binary_parse', 'sentence2_parse'],
      dtype='object')

In [6]:
# Column names of the test split.
df_test.columns

Index(['annotator_labels', 'captionID', 'gold_label', 'pairID', 'sentence1',
       'sentence1_binary_parse', 'sentence1_parse', 'sentence2',
       'sentence2_binary_parse', 'sentence2_parse'],
      dtype='object')

Посмотрим на распределение классов во всех трех выборках. Как видим, классы почти сбалансированы, и их соотношение практически одинаково во всех трех выборках:

In [7]:
# Number of training pairs per gold label.
df_train['gold_label'].value_counts()

entailment       183416
contradiction    183187
neutral          182764
-                   785
Name: gold_label, dtype: int64

In [8]:
# Number of validation (dev) pairs per gold label.
df_dev['gold_label'].value_counts()

entailment       3329
contradiction    3278
neutral          3235
-                 158
Name: gold_label, dtype: int64

In [9]:
# Number of test pairs per gold label.
df_test['gold_label'].value_counts()

entailment       3368
contradiction    3237
neutral          3219
-                 176
Name: gold_label, dtype: int64

In [10]:
# Preview the training frame; the bare expression is rendered as the cell's output table.
df_train

Unnamed: 0,annotator_labels,captionID,gold_label,pairID,sentence1,sentence1_binary_parse,sentence1_parse,sentence2,sentence2_binary_parse,sentence2_parse
0,[neutral],3416050480.jpg#4,neutral,3416050480.jpg#4r1n,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,A person is training his horse for a competition.,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
1,[contradiction],3416050480.jpg#4,contradiction,3416050480.jpg#4r1c,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is at a diner, ordering an omelette.",( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
2,[entailment],3416050480.jpg#4,entailment,3416050480.jpg#4r1e,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is outdoors, on a horse.","( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
3,[neutral],2267923837.jpg#2,neutral,2267923837.jpg#2r1n,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,They are smiling at their parents,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...
4,[entailment],2267923837.jpg#2,entailment,2267923837.jpg#2r1e,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,There are children present,( There ( ( are children ) present ) ),(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...
...,...,...,...,...,...,...,...,...,...,...
550147,[contradiction],2267923837.jpg#3,contradiction,2267923837.jpg#3r1c,Four dirty and barefooted children.,( ( ( ( Four dirty ) and ) ( barefooted childr...,(ROOT (NP (NP (CD Four) (NNS dirty)) (CC and) ...,four kids won awards for 'cleanest feet',( ( four kids ) ( ( won awards ) ( ( ( for ` )...,(ROOT (S (NP (CD four) (NNS kids)) (VP (VBD wo...
550148,[neutral],2267923837.jpg#3,neutral,2267923837.jpg#3r1n,Four dirty and barefooted children.,( ( ( ( Four dirty ) and ) ( barefooted childr...,(ROOT (NP (NP (CD Four) (NNS dirty)) (CC and) ...,"four homeless children had their shoes stolen,...",( ( ( ( ( ( four ( homeless children ) ) ( had...,(ROOT (S (S (NP (CD four) (JJ homeless) (NNS c...
550149,[neutral],7979219683.jpg#2,neutral,7979219683.jpg#2r1n,A man is surfing in a bodysuit in beautiful bl...,( ( A man ) ( ( is ( surfing ( in ( ( a bodysu...,(ROOT (S (NP (DT A) (NN man)) (VP (VBZ is) (VP...,A man in a bodysuit is competing in a surfing ...,( ( ( A man ) ( in ( a bodysuit ) ) ) ( ( is (...,(ROOT (S (NP (NP (DT A) (NN man)) (PP (IN in) ...
550150,[contradiction],7979219683.jpg#2,contradiction,7979219683.jpg#2r1c,A man is surfing in a bodysuit in beautiful bl...,( ( A man ) ( ( is ( surfing ( in ( ( a bodysu...,(ROOT (S (NP (DT A) (NN man)) (VP (VBZ is) (VP...,A man in a business suit is heading to a board...,( ( ( A man ) ( in ( a ( business suit ) ) ) )...,(ROOT (S (NP (NP (DT A) (NN man)) (PP (IN in) ...


## Распределение количества оценок экспертов

In [38]:
from collections import defaultdict

def get_labels_distribution(df):
    """Summarise how many annotator labels each row has and how often they agree with gold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``annotator_labels`` column (a list of labels per row)
        and a ``gold_label`` column.

    Returns
    -------
    tuple[dict, int]
        ``(length_counts, number_of_same_labels)`` where ``length_counts`` maps
        "number of annotator labels in a row" to "number of such rows" (keys
        appear in order of first occurrence), and ``number_of_same_labels`` is
        the number of rows whose most frequent annotator label equals
        ``gold_label``. On a tie, the label that appears first in the list is
        taken as the most frequent one.
    """
    len_labels = defaultdict(int)
    number_of_same_labels = 0
    # zip() walks both columns by position, so the function also works on frames
    # whose index is not 0..n-1 (e.g. after filtering rows); label-based
    # ``series[i]`` lookups would raise KeyError there.
    for label_list, gold_label in zip(df.annotator_labels, df.gold_label):
        len_labels[len(label_list)] += 1
        if not label_list:
            # No annotations at all: counted under length 0, cannot match gold.
            continue
        most_common_label = max(label_list, key=label_list.count)
        if most_common_label == gold_label:
            number_of_same_labels += 1
    return (dict(len_labels), number_of_same_labels)

In [41]:
# Per split: the histogram of annotator-label counts and the number of rows
# whose most frequent annotator label equals gold_label.
train_distribution, a1 = get_labels_distribution(df_train)
dev_distribution, a2 = get_labels_distribution(df_dev)
test_distribution, a3 = get_labels_distribution(df_test)

for split_title, split_distribution, split_matches in (
    ('train:', train_distribution, a1),
    ('dev:', dev_distribution, a2),
    ('test:', test_distribution, a3),
):
    print(split_title, split_distribution, '\t', split_matches)

train: {1: 510711, 5: 36975, 4: 2466} 	 549367
dev: {5: 9986, 4: 14} 	 9842
test: {5: 9990, 4: 10} 	 9824


In [43]:
def print_paragraph(df, i):
    """Print the first sentence of row ``i`` followed by its two parse strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``sentence1``, ``sentence1_parse`` and
        ``sentence1_binary_parse`` columns.
    i : index label
        Label of the row to show (looked up with ``df.loc``).
    """
    # Read from the frame that was passed in, not from the global df_train,
    # so the helper works for the dev and test splits as well.
    for column in ('sentence1', 'sentence1_parse', 'sentence1_binary_parse'):
        print(df.loc[i, column])

In [44]:
# Show sentence1 of training row 0 together with its two parse strings.
print_paragraph(df_train, 0)

A person on a horse jumps over a broken down airplane.
(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))
( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )
