In [1]:
import sys
# Put the parent directory first on the import search path. The `src` and
# `analyses` imports in the following cell presumably resolve from there;
# this assumes the kernel's working directory is the notebook's own folder
# (TODO confirm).
sys.path.insert(0, "..")

In [2]:
from src.scrapper import parse_conllu_file
from src.visualization import plot_frequency_of_
from analyses.utils import (
    get_stats,
    build_counts,
    build_dataframes,
    print_top_words_given_tag,
    display_side_by_side,
)

# English dataset

* Name: ESLSpok
* Link to repository [here](https://github.com/UniversalDependencies/UD_English-ESLSpok/tree/master)
* Training data [here](https://github.com/UniversalDependencies/UD_English-ESLSpok/blob/master/en_eslspok-ud-train.conllu)
* Test data [here](https://github.com/UniversalDependencies/UD_English-ESLSpok/blob/master/en_eslspok-ud-test.conllu)

In [3]:
# Parse the train and test CoNLL-U files of the ESLSpok treebank.
train_info, test_info = (
    parse_conllu_file("../datasets/en_eslspok-ud-train.conllu"),
    parse_conllu_file("../datasets/en_eslspok-ud-test.conllu"),
)

In [4]:
# Print sentence statistics for the training split: total sentence count,
# average / minimum / maximum sentence length and the 25th, 50th and 75th
# length percentiles.
# NOTE(review): the printed label is misspelled "lenght"; the string comes
# from analyses.utils.get_stats, so the fix belongs in that helper.
get_stats(train_info)

Total sentences: 1856
Average sentence length: 9
Minimum sentence length: 2
Maximum sentence length: 48
Percentile 25, lenght: 5.0
Percentile 50, lenght: 7.0
Percentile 75, lenght: 11.0


In [5]:
# Print the same sentence statistics (count, average / min / max length and
# the 25th, 50th and 75th length percentiles) for the test split.
# NOTE(review): the printed label is misspelled "lenght"; the string comes
# from analyses.utils.get_stats, so the fix belongs in that helper.
get_stats(test_info)

Total sentences: 232
Average sentence length: 10
Minimum sentence length: 2
Maximum sentence length: 60
Percentile 25, lenght: 5.0
Percentile 50, lenght: 8.0
Percentile 75, lenght: 12.0


In [6]:
# build_counts yields three values per split, unpacked here as the word,
# tag and pair counts that feed the frequency plots below.
(
    train_word_counts,
    train_tag_counts,
    train_pair_counts,
) = build_counts(train_info)
(
    test_word_counts,
    test_tag_counts,
    test_pair_counts,
) = build_counts(test_info)

In [7]:
# Word counts, train vs. test. Presumably renders a frequency plot labelled
# by the first argument — no output is captured in this export, so confirm
# against src.visualization.plot_frequency_of_.
plot_frequency_of_("words", train_word_counts, test_word_counts)

In [8]:
# Tag counts, train vs. test. Presumably renders a frequency plot labelled
# by the first argument — no output is captured in this export, so confirm
# against src.visualization.plot_frequency_of_.
plot_frequency_of_("tags", train_tag_counts, test_tag_counts)

In [9]:
# Pair counts, train vs. test. Presumably renders a frequency plot labelled
# by the first argument — no output is captured in this export.
# NOTE(review): the label "word-pair tag" looks like it was meant to be
# "word-tag pair"; check how plot_frequency_of_ uses the string before
# changing it.
plot_frequency_of_("word-pair tag", train_pair_counts, test_pair_counts)

In [10]:
# One frame per split; the tag inventory is read from the training frame's
# "tags" column and reused for both splits in the cells below.
train_df, test_df = build_dataframes(train_info), build_dataframes(test_info)

tags = train_df["tags"].unique()

In [11]:
# Build one table per tag from the training frame and show them together.
# Judging by the captured output, each table holds up to five rows of
# (tags, words, count), ordered by descending count.
train_dfs = [print_top_words_given_tag(train_df, tag) for tag in tags]
display_side_by_side(*train_dfs)

Unnamed: 0,tags,words,count
0,pron,i,840
1,pron,you,231
2,pron,it,226
3,pron,my,216
4,pron,we,92

Unnamed: 0,tags,words,count
0,aux,is,226
1,aux,'s,129
2,aux,was,112
3,aux,'m,84
4,aux,do,82

Unnamed: 0,tags,words,count
0,noun,time,44
1,noun,people,32
2,noun,car,31
3,noun,school,31
4,noun,train,31

Unnamed: 0,tags,words,count
0,punct,.,1622
1,punct,",",778
2,punct,?,144
3,punct,"""",48
4,punct,-,18

Unnamed: 0,tags,words,count
0,cconj,and,576
1,cconj,but,178
2,cconj,or,93
3,cconj,so,45
4,cconj,either,1

Unnamed: 0,tags,words,count
0,adv,so,218
1,adv,very,126
2,adv,just,58
3,adv,much,42
4,adv,now,41

Unnamed: 0,tags,words,count
0,verb,have,117
1,verb,go,92
2,verb,like,76
3,verb,know,75
4,verb,thank,73

Unnamed: 0,tags,words,count
0,det,the,508
1,det,a,294
2,det,this,49
3,det,some,35
4,det,that,27

Unnamed: 0,tags,words,count
0,intj,no,41
1,intj,yeah,35
2,intj,yes,34
3,intj,please,17
4,intj,like,14

Unnamed: 0,tags,words,count
0,adj,good,39
1,adj,last,34
2,adj,nice,29
3,adj,many,27
4,adj,other,22

Unnamed: 0,tags,words,count
0,part,to,463
1,part,n't,126
2,part,not,71
3,part,'s,22
4,part,',6

Unnamed: 0,tags,words,count
0,sconj,because,65
1,sconj,when,31
2,sconj,if,30
3,sconj,that,24
4,sconj,after,9

Unnamed: 0,tags,words,count
0,adp,in,200
1,adp,of,140
2,adp,for,97
3,adp,on,70
4,adp,with,68

Unnamed: 0,tags,words,count
0,num,one,58
1,num,two,32
2,num,four,10
3,num,three,9
4,num,five,8

Unnamed: 0,tags,words,count
0,propn,charlie,31
1,propn,tokyo,15
2,propn,hokkaido,14
3,propn,japan,14
4,propn,hiroshima,13

Unnamed: 0,tags,words,count
0,x,-,4
1,x,ku,3
2,x,nante,3
3,x,bye,2
4,x,nabe,2


In [12]:
# Same per-tag tables for the test frame, in the same tag order as the
# training tables so the two outputs can be compared side by side.
# NOTE(review): `tags` was derived from train_df only, so a tag that occurs
# solely in the test split would not be shown here — confirm this is intended.
test_dfs = [print_top_words_given_tag(test_df, tag) for tag in tags]
display_side_by_side(*test_dfs)

Unnamed: 0,tags,words,count
0,pron,i,109
1,pron,you,29
2,pron,it,27
3,pron,my,25
4,pron,she,13

Unnamed: 0,tags,words,count
0,aux,is,23
1,aux,'s,20
2,aux,do,17
3,aux,was,17
4,aux,'m,12

Unnamed: 0,tags,words,count
0,noun,time,8
1,noun,people,7
2,noun,house,5
3,noun,party,5
4,noun,work,5

Unnamed: 0,tags,words,count
0,punct,.,203
1,punct,",",116
2,punct,?,15
3,punct,"""",8
4,punct,...,3

Unnamed: 0,tags,words,count
0,cconj,and,83
1,cconj,but,27
2,cconj,so,8
3,cconj,or,4

Unnamed: 0,tags,words,count
0,adv,so,30
1,adv,very,21
2,adv,much,11
3,adv,just,9
4,adv,really,6

Unnamed: 0,tags,words,count
0,verb,like,13
1,verb,have,12
2,verb,know,12
3,verb,go,9
4,verb,thank,9

Unnamed: 0,tags,words,count
0,det,the,70
1,det,a,41
2,det,some,10
3,det,this,8
4,det,that,7

Unnamed: 0,tags,words,count
0,intj,yeah,8
1,intj,yes,7
2,intj,like,5
3,intj,no,3
4,intj,ok,2

Unnamed: 0,tags,words,count
0,adj,good,6
1,adj,last,6
2,adj,little,5
3,adj,interesting,4
4,adj,many,4

Unnamed: 0,tags,words,count
0,part,to,62
1,part,n't,25
2,part,not,13
3,part,'s,3
4,part,na,1

Unnamed: 0,tags,words,count
0,sconj,because,7
1,sconj,when,5
2,sconj,that,4
3,sconj,if,3
4,sconj,as,1

Unnamed: 0,tags,words,count
0,adp,in,22
1,adp,for,19
2,adp,of,19
3,adp,at,13
4,adp,from,5

Unnamed: 0,tags,words,count
0,num,one,8
1,num,two,8
2,num,three,3
3,num,eleven,1
4,num,five,1

Unnamed: 0,tags,words,count
0,propn,charlie,3
1,propn,hokkaido,2
2,propn,japan,2
3,propn,osaka,2
4,propn,saitama,2

Unnamed: 0,tags,words,count
0,x,jukai,1
1,x,m,1
2,x,m.,1
3,x,nawatobi,1
