Let's look at how the labelled data is distributed, so we can decide what additional data to produce to improve performance.

In [37]:
import numpy as np
import pandas as pd
from scipy.interpolate.dfitpack import types

# Load the labelled question dataset from its CSV file.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index; consider index_col=0 once it is confirmed that no later cell
# relies on that column.
labelled_csv_path = './labelling/data_cleaned_manual.csv'
ds = pd.read_csv(labelled_csv_path)

# Keep the question texts as a plain Python list.
questions = ds['Question'].tolist()

# Cell output: preview of the first rows.
ds.head()

Unnamed: 0,Question,Global Subject,Question Intent
0,Hi,start,greet
1,Hello,start,greet
2,Describe the automaton,automaton,description
3,Is there a transition between q2 and q0?,transition,existence_between
4,Is there a transition between q5 and q7,transition,existence_between


In [38]:
# Summary figures for the labelled dataset. The three counts below are only
# stored in variables; the cell output is the per-subject table at the end.
question_col = ds['Question']
subject_col = ds['Global Subject']

# total number of (non-null) questions
questions_count = question_col.count()
# number of distinct question texts
unique_questions_count = question_col.nunique()
# number of distinct 'Global Subject' labels
unique_global_subject_count = subject_col.nunique()

# Cell output: every 'Global Subject' label together with how often it occurs
subject_col.value_counts()

Global Subject
transition    76
automaton     49
state         49
grammar       33
theory        15
off_topic      6
start          2
Name: count, dtype: int64

In [39]:
# Cell output: every 'Question Intent' label together with how often it occurs
intent_counts = ds['Question Intent'].value_counts()
intent_counts

Question Intent
count                 29
existence_from        18
list                  17
description           16
accepted              14
representation        13
existence_between     12
transitions           12
description_brief     10
pattern               10
existence_directed     9
start                  8
final                  8
symbols                7
cycles                 4
label                  4
example_input          4
final_list             3
states                 3
generic                3
regex                  2
variation              2
definition             2
greet                  2
final_count            2
validity               2
simulation             2
input                  1
change                 1
existence_into         1
directionality         1
details                1
self_loop              1
Name: count, dtype: int64

In [40]:
# Build one "subject:intent" label per row, then count how many rows share each
# combined label (the counts are the cell output).
subject_labels = ds['Global Subject']
intent_labels = ds['Question Intent']
comb = subject_labels + ':' + intent_labels
comb.value_counts()

state:count                      19
transition:existence_from        18
transition:list                  15
automaton:description            14
grammar:accepted                 14
automaton:representation         13
transition:existence_between     12
transition:count                 10
automaton:description_brief      10
transition:existence_directed     9
automaton:pattern                 9
state:start                       8
state:final                       7
grammar:symbols                   7
state:transitions                 7
theory:transitions                5
grammar:example_input             4
transition:cycles                 4
transition:label                  4
state:final_list                  3
theory:generic                    3
theory:states                     3
grammar:variation                 2
theory:definition                 2
grammar:regex                     2
state:final_count                 2
start:greet                       2
grammar:validity            

Now let's also load the additional questions I wrote, stored as one CSV file per global subject.

In [41]:
# Additional question sets, one CSV file per subject. The mapping keeps the
# display label and the file path side by side so that loading and reporting
# are each written once instead of being copy-pasted six times.
new_question_files = {
    'Automaton': './new_questions/automaton_questions.csv',
    'State': './new_questions/state_questions.csv',
    'Transition': './new_questions/transition_questions.csv',
    'Grammar': './new_questions/grammar_questions.csv',
    'Start': './new_questions/start_questions.csv',
    'Theory': './new_questions/theory_questions.csv',
}

# Read every file first (dicts preserve insertion order, so the files are read
# and reported in the order listed above).
new_question_frames = {
    label: pd.read_csv(path) for label, path in new_question_files.items()
}

# Keep the per-subject names that the following cells use.
ds_automaton = new_question_frames['Automaton']
ds_state = new_question_frames['State']
ds_transition = new_question_frames['Transition']
ds_grammar = new_question_frames['Grammar']
ds_start = new_question_frames['Start']
ds_theory = new_question_frames['Theory']

# Report the number of (non-null) questions in each additional set.
for label, frame in new_question_frames.items():
    print(f"{label} questions count: ", frame['Question'].count())

Automaton questions count:  92
State questions count:  56
Transition questions count:  148
Grammar questions count:  111
Start questions count:  17
Theory questions count:  100


Finally, let's concatenate all the data we have and look at the overall distribution.

In [44]:
# Stack the original labelled data and the six additional question sets into a
# single frame. ignore_index=True gives the result a fresh 0..n-1 index; without
# it each source frame keeps its own row labels (all starting at 0), so the same
# label would occur several times and label-based lookups such as
# combined.loc[0] would return multiple rows.
combined = pd.concat(
    [ds, ds_automaton, ds_state, ds_transition, ds_grammar, ds_start, ds_theory],
    ignore_index=True,
)

# Total vs. distinct question texts: the gap is the number of repeated questions.
print("Combined questions count: ", combined['Question'].count())
print("Combined unique questions count: ", combined['Question'].nunique())

# Cell output: number of questions per 'Global Subject' across all sources
combined['Global Subject'].value_counts()

Combined questions count:  754
Combined unique questions count:  715


Global Subject
transition    224
grammar       144
automaton     141
theory        115
state         105
start          19
off_topic       6
Name: count, dtype: int64

In [45]:
# Cell output: number of questions per 'Question Intent' across all sources
combined_intent_counts = combined['Question Intent'].value_counts()
combined_intent_counts

Question Intent
description           74
accepted              57
existence_from        42
count                 40
generic               39
list                  38
label                 36
transitions           34
pattern               27
existence_between     25
existence_directed    21
final                 21
simulation            20
variation             19
greet                 19
representation        19
states                18
existence_into        17
description_brief     16
definition            16
start                 15
validity              14
symbols               14
input                 12
cycles                12
details               12
self_loop             11
example_input         11
regex                  9
final_count            8
final_list             6
optimization           6
deterministic          5
reachability           5
start_final            3
dead                   3
image                  2
change                 1
directionality         1
Name: cou