# Chapter 4: Encoding and Annotation Schemes
## Tabular Datasets
Loading tabular datasets

Programs from the book: [_Python for Natural Language Processing_](https://link.springer.com/book/9783031575488)

__Author__: Pierre Nugues

In [1]:
import csv
from io import StringIO

import pandas as pd
import requests

## Quora Question Pairs

In [2]:
# Location of the Quora Question Pairs file (tab-separated values)
qqp_url = (
    'http://qim.fs.quoracdn.net/'
    'quora_duplicate_questions.tsv'
)

In [3]:
# Column names of the QQP file, in file order
col_names = 'id qid1 qid2 question1 question2 is_duplicate'.split()

## csv

In [4]:
# Download the TSV and fail loudly on an HTTP error instead of parsing an
# error page as data.
qqp_response = requests.get(qqp_url, timeout=60)
qqp_response.raise_for_status()

# Give the csv module a file-like object created with newline='' (as the csv
# documentation requires) so that line breaks inside quoted fields survive;
# str.splitlines() would drop them.
# No fieldnames are passed, so the first row of the file is used as header.
qqp_reader = csv.DictReader(
    StringIO(qqp_response.text, newline=''),
    delimiter='\t')

In [5]:
# Materialize the reader: one dict per data row, keyed by the header names
qqp_dataset = list(qqp_reader)

In [6]:
# Inspect one record; the csv module returns every value as a string
qqp_dataset[447]

{'id': '447',
 'qid1': '892',
 'qid2': '893',
 'question1': 'What are natural numbers?',
 'question2': 'What is a least natural number?',
 'is_duplicate': '0'}

In [7]:
# A second record, this one labelled as a duplicate pair
qqp_dataset[3273]

{'id': '3273',
 'qid1': '6488',
 'qid2': '6489',
 'question1': 'How do you start a bakery?',
 'question2': 'How can one start a bakery business?',
 'is_duplicate': '1'}

Directory in which the dataset files are saved (it must already exist):

In [8]:
# Output directory, relative to the notebook; file names are appended to it
# by plain string concatenation, so the trailing slash is required.
PATH = '../datasets/'

In [9]:
# Save the QQP rows as a real tab-separated file: without delimiter='\t'
# DictWriter would write commas into a file named .tsv.
# newline='' lets the csv module control line endings (see the csv docs);
# an explicit encoding avoids depending on the platform default.
with open(PATH + 'qqp.tsv', 'w', newline='', encoding='utf-8') as qqp_tsv:
    writer = csv.DictWriter(qqp_tsv, fieldnames=col_names, delimiter='\t')
    writer.writeheader()
    writer.writerows(qqp_dataset)

## Pandas

In [10]:
from io import StringIO

In [11]:
# Download the TSV and stop on an HTTP error rather than handing an error
# page to the parser.
qqp_response = requests.get(qqp_url, timeout=60)
qqp_response.raise_for_status()

# Parse the downloaded text; the header row supplies the column names
qqp_pandas = pd.read_csv(
    StringIO(qqp_response.text),
    sep='\t')

In [12]:
# Display the DataFrame; pandas abbreviates the rendering of long frames
qqp_pandas

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [13]:
# Row at integer position 447, returned as a Series
qqp_pandas.iloc[447]

id                                          447
qid1                                        892
qid2                                        893
question1             What are natural numbers?
question2       What is a least natural number?
is_duplicate                                  0
Name: 447, dtype: object

In [14]:
# Same row as a plain dict; unlike the csv version, the numeric columns
# come back as integers rather than strings
qqp_pandas.to_dict('records')[447]

{'id': 447,
 'qid1': 892,
 'qid2': 893,
 'question1': 'What are natural numbers?',
 'question2': 'What is a least natural number?',
 'is_duplicate': 0}

In [15]:
# to_csv defaults to a comma separator; pass sep='\t' so that the content
# matches the .tsv extension of the output file.
qqp_pandas.to_csv(PATH + 'qqp_pd.tsv', sep='\t')

In [16]:
import json

# Missing cells are NaN in the DataFrame, and json.dump would write them as
# the bare token NaN, which is not valid JSON. Replace them with None so
# that they are serialized as null. The cast to object is needed so that
# None can be stored in every column.
qqp_records = (
    qqp_pandas.astype(object)
    .where(qqp_pandas.notna(), None)
    .to_dict('records'))

with open(PATH + 'qqp.json', 'w') as f:
    # allow_nan=False raises instead of silently writing NaN/Infinity
    json.dump(qqp_records, f, allow_nan=False)

## CoNLL

In [17]:
# One sentence in CoNLL-U layout: one token per line, ten tab-separated
# fields per token. Tabs are written as explicit \t escapes.
conll_excerpt = [
    '1\tFrom\tfrom\tADP\tIN\t_\t3\tcase\t3:case\t_',
    '2\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t3\tdet\t3:det\t_',
    '3\tAP\tAP\tPROPN\tNNP\tNumber=Sing\t4\tobl\t4:obl:from\t_',
    '4\tcomes\tcome\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t0\troot\t0:root\t_',
    '5\tthis\tthis\tDET\tDT\tNumber=Sing|PronType=Dem\t6\tdet\t6:det\t_',
    '6\tstory\tstory\tNOUN\tNN\tNumber=Sing\t4\tnsubj\t4:nsubj\t_',
    '7\t:\t:\tPUNCT\t:\t_\t4\tpunct\t4:punct\t_',
]
conll_excerpt

['1\tFrom\tfrom\tADP\tIN\t_\t3\tcase\t3:case\t_',
 '2\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t3\tdet\t3:det\t_',
 '3\tAP\tAP\tPROPN\tNNP\tNumber=Sing\t4\tobl\t4:obl:from\t_',
 '4\tcomes\tcome\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t0\troot\t0:root\t_',
 '5\tthis\tthis\tDET\tDT\tNumber=Sing|PronType=Dem\t6\tdet\t6:det\t_',
 '6\tstory\tstory\tNOUN\tNN\tNumber=Sing\t4\tnsubj\t4:nsubj\t_',
 '7\t:\t:\tPUNCT\t:\t_\t4\tpunct\t4:punct\t_']

In [18]:
# The ten CoNLL-U field names, in column order.
# NOTE: this rebinds `col_names`, which earlier held the QQP column names;
# re-running the cell that writes qqp.tsv after this one would use the
# wrong field names.
col_names = 'ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC'.split()

In [19]:
# Parse the excerpt into one dict per token, keyed by col_names.
# CoNLL-U has no quoting convention, so quoting is switched off: with the
# default settings a field that starts with '"' would be read as the start
# of a quoted field and the columns would be merged or shifted.
list(csv.DictReader(
    conll_excerpt,
    fieldnames=col_names,
    delimiter='\t',
    quoting=csv.QUOTE_NONE))

[{'ID': '1',
  'FORM': 'From',
  'LEMMA': 'from',
  'UPOS': 'ADP',
  'XPOS': 'IN',
  'FEATS': '_',
  'HEAD': '3',
  'DEPREL': 'case',
  'DEPS': '3:case',
  'MISC': '_'},
 {'ID': '2',
  'FORM': 'the',
  'LEMMA': 'the',
  'UPOS': 'DET',
  'XPOS': 'DT',
  'FEATS': 'Definite=Def|PronType=Art',
  'HEAD': '3',
  'DEPREL': 'det',
  'DEPS': '3:det',
  'MISC': '_'},
 {'ID': '3',
  'FORM': 'AP',
  'LEMMA': 'AP',
  'UPOS': 'PROPN',
  'XPOS': 'NNP',
  'FEATS': 'Number=Sing',
  'HEAD': '4',
  'DEPREL': 'obl',
  'DEPS': '4:obl:from',
  'MISC': '_'},
 {'ID': '4',
  'FORM': 'comes',
  'LEMMA': 'come',
  'UPOS': 'VERB',
  'XPOS': 'VBZ',
  'FEATS': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
  'HEAD': '0',
  'DEPREL': 'root',
  'DEPS': '0:root',
  'MISC': '_'},
 {'ID': '5',
  'FORM': 'this',
  'LEMMA': 'this',
  'UPOS': 'DET',
  'XPOS': 'DT',
  'FEATS': 'Number=Sing|PronType=Dem',
  'HEAD': '6',
  'DEPREL': 'det',
  'DEPS': '6:det',
  'MISC': '_'},
 {'ID': '6',
  'FORM': 'story',
  'LEMMA':

Reading from GitHub

In [20]:
# Raw-file root of the UD English-EWT repository (master branch)
base_url = 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/'
# The three splits differ only by the train/dev/test part of the file name
ewt_train_url, ewt_dev_url, ewt_test_url = (
    f'{base_url}en_ewt-ud-{split}.conllu'
    for split in ('train', 'dev', 'test'))

The `csv` module

In [21]:
# Fetch the dev split and stop on an HTTP error rather than parsing an
# error page as CoNLL-U.
ewt_dev_response = requests.get(ewt_dev_url, timeout=60)
ewt_dev_response.raise_for_status()

# CoNLL-U has no quoting convention and a token can be a bare double quote.
# QUOTE_NONE stops the csv module from treating '"' as a quote character,
# which would otherwise merge or shift columns on such lines.
# Lines starting with '#' are not filtered: they come out as rows whose
# text is in 'ID' and whose other fields are None.
conll_reader = csv.DictReader(
    ewt_dev_response.text.splitlines(),
    fieldnames=col_names,
    delimiter='\t',
    quoting=csv.QUOTE_NONE)

In [22]:
# Materialize the reader: one dict per input line, keyed by col_names
conll_dataset = list(conll_reader)

In [23]:
# Tokens of the first sentence; the slice starts at 4 because the
# preceding rows are presumably the sentence's '#' metadata lines
# (as in the pandas view of the same file) -- TODO confirm
conll_dataset[4:11]

[{'ID': '1',
  'FORM': 'From',
  'LEMMA': 'from',
  'UPOS': 'ADP',
  'XPOS': 'IN',
  'FEATS': '_',
  'HEAD': '3',
  'DEPREL': 'case',
  'DEPS': '3:case',
  'MISC': '_'},
 {'ID': '2',
  'FORM': 'the',
  'LEMMA': 'the',
  'UPOS': 'DET',
  'XPOS': 'DT',
  'FEATS': 'Definite=Def|PronType=Art',
  'HEAD': '3',
  'DEPREL': 'det',
  'DEPS': '3:det',
  'MISC': '_'},
 {'ID': '3',
  'FORM': 'AP',
  'LEMMA': 'AP',
  'UPOS': 'PROPN',
  'XPOS': 'NNP',
  'FEATS': 'Number=Sing',
  'HEAD': '4',
  'DEPREL': 'obl',
  'DEPS': '4:obl:from',
  'MISC': '_'},
 {'ID': '4',
  'FORM': 'comes',
  'LEMMA': 'come',
  'UPOS': 'VERB',
  'XPOS': 'VBZ',
  'FEATS': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin',
  'HEAD': '0',
  'DEPREL': 'root',
  'DEPS': '0:root',
  'MISC': '_'},
 {'ID': '5',
  'FORM': 'this',
  'LEMMA': 'this',
  'UPOS': 'DET',
  'XPOS': 'DT',
  'FEATS': 'Number=Sing|PronType=Dem',
  'HEAD': '6',
  'DEPREL': 'det',
  'DEPS': '6:det',
  'MISC': '_'},
 {'ID': '6',
  'FORM': 'story',
  'LEMMA':

## Pandas

In [24]:
# Read the first six CoNLL-U columns straight from the URL.
# quoting=csv.QUOTE_NONE is required because CoNLL-U has no quoting
# convention: with the default setting, a token that is a bare '"' opens a
# quoted field and the parser merges columns (or even lines) until the
# next '"'.
conll_pandas = pd.read_csv(
    ewt_dev_url,
    sep='\t',
    names=col_names,
    usecols=[0, 1, 2, 3, 4, 5],
    quoting=csv.QUOTE_NONE)

In [25]:
# Display the DataFrame; the '#' metadata lines of the file appear as rows
# whose text is in the ID column
conll_pandas

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS
0,# newdoc id = weblog-blogspot.com_nominations_...,,,,,
1,# sent_id = weblog-blogspot.com_nominations_20...,,,,,
2,# newpar id = weblog-blogspot.com_nominations_...,,,,,
3,# text = From the AP comes this story :,,,,,
4,1,From,from,ADP,IN,_
...,...,...,...,...,...,...
30523,8,and,and,CCONJ,CC,_
30524,9,a,a,DET,DT,Definite=Ind|PronType=Art
30525,10,very,very,ADV,RB,_
30526,11,knowledgeable,knowledgeable,ADJ,JJ,Degree=Pos


In [26]:
# Rows 4 to 10 by position: the seven tokens of the first sentence
conll_pandas.iloc[4:11]

Unnamed: 0,ID,FORM,LEMMA,UPOS,XPOS,FEATS
4,1,From,from,ADP,IN,_
5,2,the,the,DET,DT,Definite=Def|PronType=Art
6,3,AP,AP,PROPN,NNP,Number=Sing
7,4,comes,come,VERB,VBZ,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...
8,5,this,this,DET,DT,Number=Sing|PronType=Dem
9,6,story,story,NOUN,NN,Number=Sing
10,7,:,:,PUNCT,:,_


In [27]:
# The same seven tokens as a list of dicts, one per row
conll_pandas.to_dict('records')[4:11]

[{'ID': '1',
  'FORM': 'From',
  'LEMMA': 'from',
  'UPOS': 'ADP',
  'XPOS': 'IN',
  'FEATS': '_'},
 {'ID': '2',
  'FORM': 'the',
  'LEMMA': 'the',
  'UPOS': 'DET',
  'XPOS': 'DT',
  'FEATS': 'Definite=Def|PronType=Art'},
 {'ID': '3',
  'FORM': 'AP',
  'LEMMA': 'AP',
  'UPOS': 'PROPN',
  'XPOS': 'NNP',
  'FEATS': 'Number=Sing'},
 {'ID': '4',
  'FORM': 'comes',
  'LEMMA': 'come',
  'UPOS': 'VERB',
  'XPOS': 'VBZ',
  'FEATS': 'Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin'},
 {'ID': '5',
  'FORM': 'this',
  'LEMMA': 'this',
  'UPOS': 'DET',
  'XPOS': 'DT',
  'FEATS': 'Number=Sing|PronType=Dem'},
 {'ID': '6',
  'FORM': 'story',
  'LEMMA': 'story',
  'UPOS': 'NOUN',
  'XPOS': 'NN',
  'FEATS': 'Number=Sing'},
 {'ID': '7',
  'FORM': ':',
  'LEMMA': ':',
  'UPOS': 'PUNCT',
  'XPOS': ':',
  'FEATS': '_'}]