# Automatically annotating questions for type and complexity scores

English, Finnish, Korean

In [16]:
from typing import Dict, List, Optional, Tuple
from collections import defaultdict
import glob
from conllu import parse_incr
import networkx as nx
import pandas as pd
from pathlib import Path
from udapi.core.document import Document
from udapi.block.read.conllu import Conllu
import os
import re


## language configuration


In [None]:
LANG_INFO = {
  'en': {'qwords': ['what', 'who', 'where', 'when', 'why', 'how'], 'canonical_order': 'SVO'},
  'ko': {'qwords': [], 'canonical_order': 'SOV'},
  'fi': {'qwords': [], 'canonical_order': 'SVO'}
}

## Dataset skeleton

importing files


In [3]:
import pandas as pd

pd.set_option('display.max_columns', None)  
pd.set_option('display.width', None)        
pd.set_option('display.max_rows', None)     
pd.set_option('display.expand_frame_repr', False)


columns = [
    'text',                    # original question text
    'language',                # language code (en, fi, ko, ja, id, ru)
    'question_type',           # polar or content
    'clause_count',            # number of clauses
    'dep_depth',               # dependency depth
    'core_args_count',         # count of core arguments
    'dep_distances',           # dependency distances
    'question_word_movement',  # distance of question word movement
    'nonproj_deps_count'       # count of non-projective dependencies
]


df = pd.DataFrame(columns=columns)

print("Full DataFrame:")
print(df)
print("\n")


sample_row = {
    'text': 'What did you eat?',
    'language': 'en',
    'question_type': 'content',
    'clause_count': 1,
    'dep_depth': 3,
    'core_args_count': 2,
    'dep_distances': '2,1,3',
    'question_word_movement': 3,
    'nonproj_deps_count': 0
}

df.loc[len(df)] = sample_row

print("DataFrame with sample row:")
print(df)

Full DataFrame:
Empty DataFrame
Columns: [text, language, question_type, clause_count, dep_depth, core_args_count, dep_distances, question_word_movement, nonproj_deps_count]
Index: []


DataFrame with sample row:
                text language question_type  clause_count  dep_depth  core_args_count dep_distances  question_word_movement  nonproj_deps_count
0  What did you eat?       en       content             1          3                2         2,1,3                       3                   0


### Question Type Annotations

The World Atlas of Language Structures (WALS) documents linguistic features of languages. These features are structural properties of language that describe aspects of cross-linguistic diversity. Most features come with their own chapters detailing their values and expected roles. For example, feature 81A represents the canonical word order syntax. The corresponding chapter catalogs the values, provides examples and oftentimes interesting discussions of relevance and distribution. 

We will be looking at word order and features related to question formation strategies:
- 92A: Position of Polar Question Particles
- 93A: Position of Interrogative Phrases in Content Questions
- 116: Polar Question



In [None]:
import re
from collections import defaultdict

class QuestionClassifier:
    def __init__(self):
        
        self.en_wh_words = r'\b(what*|who*|where*|when*|why*|how*|which*)\b'
        self.en_polar_starters = r'^(is|are|do|does|did|have|has|can|could|will|would|should|may|might)'
        self.embedded_verbs = r'\b(know|tell|confirm|explain|understand|think|show|mean|see)\b'

        
        self.fi_wh_words = r'\b(mikä|mitä|missä|mistä|mihin|milloin|miksi|kuka|ketkä|kumpi|kuinka|miten)\b'
        
        self.fi_special_content = r'\bmontako\b'
        
        self.ko_wh_words = r'(무엇|뭐|어디|언제|누구|왜|어떻게|무슨|어느|몇)'
        
    def classify_question(self, text, language='english'):
        """
        Classify a question as either polar or content based on its structure and language.
        Returns 'polar' or 'content' as classification.
        """
        
        text = text.strip().lower()
        
        if not text:
            return None
            
        if language == 'english':
            return self._classify_english(text)
        elif language == 'finnish':
            return self._classify_finnish(text)
        elif language == 'korean':
            return self._classify_korean(text)
        else:
            raise ValueError(f"Unsupported language: {language}")
            
    def _classify_english(self, text):
        
        if re.match(f'^{self.en_wh_words[2:]}', text, re.I):
            return 'content'
            
        if re.match(f'{self.en_polar_starters}.*{self.embedded_verbs}.*{self.en_wh_words}', text, re.I):
            return 'polar'
            
        if re.match(self.en_polar_starters, text, re.I):
            return 'polar'

        return None
        
    
    def _classify_finnish(self, text):
        words = text.replace(',', ' ').replace('?', ' ').split()
        for word in words:
            if word.endswith('ko') or word.endswith('kö'):
                return 'polar'
        if 'vai' in words or 'montako' in words:
            return 'content'
        return 'content'
        
      
        
        
    def _classify_korean(self, text):
        
        if re.search(self.ko_wh_words, text):
            return 'content'
        return 'polar'



def process_file(input_filename, language):
    classifier = QuestionClassifier()
    questions = {'polar': [], 'content': []}
    
    with open(input_filename, 'r', encoding='utf-8') as file:
        for line in file:
            sentence_text = line.strip()
            if sentence_text:
                question_type = classifier.classify_question(sentence_text, language)
                if question_type:
                    questions[question_type].append(sentence_text)
                    
    return questions


def open_folder(directory, language):
    all_questions = {'polar': [], 'content': []}
    path = Path(directory)

    files = list(path.glob('*.txt'))  # Now looking for text files
    if not files:
        print(f'No files found in {path}')
        return all_questions
    
    for file in files:
        print(f'Processing {file.name}...')
        questions = process_file(file, language)
        all_questions['polar'].extend(questions['polar'])
        all_questions['content'].extend(questions['content'])
    
    return all_questions


def write_output_files(questions, language):
    with open(f"{language}_polar_questions.txt", 'w', encoding='utf-8') as f:
        for sentence in questions['polar']:
            f.write(f"{sentence}\n")
    
    with open(f"{language}_content_questions.txt", 'w', encoding='utf-8') as f:
        for sentence in questions['content']:
            f.write(f"{sentence}\n")


def main():
  
    path = ""

    language = ""
    
    try:
        questions = open_folder(path, language)
        
        write_output_files(questions, language)
        
        print(f"Processing complete!")
        print(f"Found {len(questions['polar'])} polar questions")
        print(f"Found {len(questions['content'])} content questions")
        print(f"Results written to 'polar_questions.txt' and 'content_questions.txt'")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()

Processing questions_en_ewt-ud-train.txt...
Processing questions_en_ewt-ud-dev.txt...
Processing questions_en_ewt-ud-test.txt...
Processing complete!
Found 323 polar questions
Found 240 content questions
Results written to 'polar_questions.txt' and 'content_questions.txt'


Finnish:

\bMikä |Millainen |Ketä |Mitähän |Mikähän |Mitä |Minä |Missä |Mistä |Mihin |Milloin |Miksi |Kuka |Ketkä |Kumpi |Kuinka |Miten |Minkä |Mitkä |Montako | |Millä |Keitä |Kenen |Minkälainen |Koska |Millaista \b

### Complexity Scoring



In [29]:
import udapi
from udapi.core.document import Document
from udapi.block.read.conllu import Conllu
from udapi.block.write.html import Html

finnish_content = '/home/robin/Research/qtype-eval/src/UD-finnish-questions/content_questions_finnish_UD.conllu'
finnish_polar = '/home/robin/Research/qtype-eval/src/UD-finnish-questions/polar_questions_finnish_UD.conllu'

doc = udapi.Document(finnish_content)

target_tree = next(tree for tree in doc.trees if tree.sent_id == "b605.17")

target_tree.draw(layout="align", attributes="ord,form,upos,deprel,feats,misc")

# sent_id = b605.17
# text = Miksi aina silloin, kun oikeasti tarvitsee jotain, ei löydä mitään?
─┮                                                                                                    
 │ ╭─╼       [32m1[0m  [33mMiksi[0m     [31mADV[0m   [34madvmod[0m _[0m                                                               _[0m
 │ │     ╭─╼ [32m2[0m  [33maina[0m      [31mADV[0m   [34madvmod[0m _[0m                                                               _[0m
 │ │   ╭─┶   [32m3[0m  [33msilloin[0m   [31mADV[0m   [34madvmod[0m _[0m                                                               SpaceAfter=No[0m
 │ │   ┢─╼   [32m4[0m  [33m,[0m         [31mPUNCT[0m [34mpunct[0m  _[0m                                                               _[0m
 │ │ ╭─┶     [32m5[0m  [33mkun[0m       [31mSCONJ[0m [34mmark[0m   _[0m                                                               _[0m
 │ │ ┢─╼     [32m6[0m  [33moikeasti[0m 