In [1]:
import os
from pathlib import Path

def count_lines_in_files(folder_path):
    """Count the lines of every ``*.txt`` file directly inside a folder.

    Only the top level of ``folder_path`` is scanned (no recursion). Files
    are read as UTF-8 text.

    Args:
        folder_path: Directory to scan, as a string or ``Path``.

    Returns:
        dict mapping each file's name (not its full path) to its line count.
        Entries are inserted in sorted path order so that printing the
        mapping gives the same report on every run and every machine.

    Raises:
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    file_line_counts = {}

    # glob() yields entries in filesystem order, which is not stable;
    # sorting makes the resulting report reproducible.
    for txt_file in sorted(Path(folder_path).glob("*.txt")):
        with txt_file.open('r', encoding='utf-8') as f:
            # Stream the file instead of loading it: only the count matters.
            file_line_counts[txt_file.name] = sum(1 for _ in f)

    return file_line_counts

# Directory holding the TyDi question files to be counted.
# NOTE(review): absolute, machine-specific path — the cell only runs where this folder exists.
folder_path = '/home/robin/Research/qtype-eval/data/TyDi-questions'
line_counts = count_lines_in_files(folder_path)

# Print one "<file name>: <n> lines" row per counted file.
for file_name, line_count in line_counts.items():
    print(f"{file_name}: {line_count} lines")

tidy-questions-polar-train-russian.txt: 994 lines
tidy-questions-polar-train-english.txt: 444 lines
tidy-questions-polar-valid-arabic.txt: 98 lines
tidy-questions-polar-valid-russian.txt: 195 lines
tidy-questions-wh-valid-arabic.txt: 1282 lines
tidy-questions-wh-valid-korean.txt: 1633 lines
tidy-questions-wh-train-korean.txt: 10685 lines
tidy-questions-polar-valid-korean.txt: 65 lines
tidy-questions-wh-valid-english.txt: 954 lines
tidy-questions-wh-train-russian.txt: 11809 lines
tidy-questions-wh-train-english.txt: 8767 lines
tidy-questions-wh-valid-russian.txt: 1430 lines
tidy-questions-wh-valid-japanese.txt: 1567 lines
tidy-questions-polar-valid-finnish.txt: 94 lines
tidy-questions-wh-valid-indonesian.txt: 1741 lines
tidy-questions-polar-train-finnish.txt: 950 lines
tidy-questions-wh-train-indonesian.txt: 14624 lines
tidy-questions-wh-train-finnish.txt: 14335 lines
tidy-questions-polar-valid-japanese.txt: 142 lines
tidy-questions-wh-train-arabic.txt: 21913 lines
tidy-questions-polar-

In [None]:
from pathlib import Path
from conllu import parse
from statistics import mean


In [6]:
def analyze_file(filepath):
    """Summarise sentence lengths for one CoNLL-U file.

    The file is read as UTF-8 and handed to ``parse``; the length of a
    sentence is ``len()`` of the parsed sentence object.

    Args:
        filepath: Path of the CoNLL-U file to read.

    Returns:
        dict with the keys ``total_sentences``, ``avg_length`` (0 when the
        file contains no sentences) and ``over_5`` / ``over_10`` /
        ``over_15`` / ``over_20``, each the number of sentences strictly
        longer than that many items.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    parsed = list(parse(raw_text))
    sizes = list(map(len, parsed))

    def longer_than(threshold):
        # Number of sentences whose length strictly exceeds the threshold.
        return len([size for size in sizes if size > threshold])

    # mean() raises on an empty sequence, so fall back to 0 explicitly.
    if sizes:
        average = mean(sizes)
    else:
        average = 0

    return {
        'total_sentences': len(parsed),
        'avg_length': average,
        'over_5': longer_than(5),
        'over_10': longer_than(10),
        'over_15': longer_than(15),
        'over_20': longer_than(20),
    }

def main(data_dir='/home/robin/Research/qtype-eval/UD-questions'):
    """Print a sentence-length report for every ``*.conllu`` file in a folder.

    Args:
        data_dir: Directory to scan (top level only). Defaults to the
            original hard-coded location, so calling ``main()`` with no
            arguments behaves as before.

    Files are visited in sorted order so the report is reproducible. A file
    that fails to load or parse is reported and skipped rather than aborting
    the whole run.
    """
    print("\nSentence Length Statistics for CoNLL-U Files")
    print("="*50)

    # glob() order depends on the filesystem; sort for a stable report.
    for file in sorted(Path(data_dir).glob('*.conllu')):
        try:
            stats = analyze_file(file)

            print(f"\nFile: {file.name}")
            print("-" * 30)
            print(f"Total sentences:     {stats['total_sentences']}")
            print(f"Average length:      {stats['avg_length']:.1f}")
            print("\nSentences longer than:")
            print(f"  5  tokens:         {stats['over_5']}")
            print(f"  10 tokens:         {stats['over_10']}")
            print(f"  15 tokens:         {stats['over_15']}")
            print(f"  20 tokens:         {stats['over_20']}")

        except Exception as e:
            # Deliberately broad: one unreadable file must not stop the report.
            print(f"\nError processing {file.name}: {e}")

# Produce the sentence-length report when this cell is executed.
main()



Sentence Length Statistics for CoNLL-U Files

File: questions_ko_kaist-ud-dev.conllu
------------------------------
Total sentences:     29
Average length:      9.9

Sentences longer than:
  5  tokens:         22
  10 tokens:         12
  15 tokens:         5
  20 tokens:         1

File: questions_ar_padt-ud-test.conllu
------------------------------
Total sentences:     7
Average length:      101.4

Sentences longer than:
  5  tokens:         7
  10 tokens:         6
  15 tokens:         6
  20 tokens:         6

File: questions_ru_taiga-ud-dev.conllu
------------------------------
Total sentences:     53
Average length:      8.8

Sentences longer than:
  5  tokens:         35
  10 tokens:         12
  15 tokens:         4
  20 tokens:         2

File: questions_id_gsd-ud-train.conllu
------------------------------
Total sentences:     478
Average length:      13.8

Sentences longer than:
  5  tokens:         424
  10 tokens:         262
  15 tokens:         140
  20 tokens:        

## Testing out UDon2 for projectivity visuals

In [None]:
# Scratch version of the score-and-export loop: score every sentence of the
# input file with each function in `functions`, then write one CSV per score.
# NOTE(review): relies on `self`, `functions`, `results`, `parse_incr` and `pd`,
# none of which are defined in the visible cells — presumably lifted from a
# method body; confirm they exist before running this cell.
with open(self.input_file, 'r', encoding='utf-8') as f:
  for sentence in parse_incr(f):
    sent_id = sentence.metadata.get('sent_id', '')
    text = sentence.metadata.get('text', '')

    for file, function in functions.items():
      score = function(sentence)
      results[file].append({'sentence_id': sent_id, 'text': text, 'score': score})

# Export runs once, after the input file has been fully read and closed.
# (The original indentation of this loop matched no enclosing block and
# raised IndentationError.)
for file, result in results.items():
  df = pd.DataFrame(result)
  out_path = self.output_dir / file
  df.to_csv(out_path, index=False)
  print(f'save {out_path} ok')

In [None]:
class ComplexityScore:
  """Compute per-sentence syntactic complexity scores for a CoNLL-U file.

  Scoring methods take a sentence as a list of token dicts (keys read here:
  'id', 'head', 'deprel', 'upos') and return a number, or None when the
  score cannot be computed for that sentence. `run` writes one CSV per
  enabled score into `output_dir`.
  """

  # Relations that neither add tree depth nor contribute dependency distance.
  _FLAT_RELATIONS = ('flat:name', 'fixed', 'goeswith')

  def __init__(self, input_file, output_dir):
    """Store the input path and create `output_dir` if it does not exist."""
    self.input_file = input_file
    self.output_dir = Path(output_dir)
    self.output_dir.mkdir(parents=True, exist_ok=True)

  ############################################
  def graph(self, sentence):
    """Placeholder: not implemented yet (returns None)."""
    pass

  ############################################
  def clauses(self, sentence):
    """Return the clause count: 1 plus the number of clausal dependents.

    A token counts as a clausal dependent when its 'deprel' is one of
    acl:relcl, advcl, ccomp, xcomp, csubj or parataxis.
    """
    clausal_dependencies = ['acl:relcl', 'advcl', 'ccomp', 'xcomp', 'csubj', 'parataxis']
    clausal_tokens = sum(
      1 for token in sentence if token['deprel'] in clausal_dependencies
    )
    return 1 + clausal_tokens

  ############################################
  @staticmethod
  def arguments(sentence):
    """Placeholder: not implemented yet (returns None).

    Declared static so it can be called both on the class, as before, and
    on an instance like the other scoring methods.
    """
    pass

  ############################################
  def distance(self, sentence):
    """Return the summed linear head-dependent distance of a sentence.

    Punctuation tokens and tokens whose 'id' is not an int are ignored.
    The root token and tokens attached with flat:name, fixed or goeswith
    contribute nothing.

    Returns:
      The total of |id - head| over the remaining tokens, or None when the
      sentence does not have exactly one 'root' token.
    """
    words = [
      token for token in sentence
      if token['upos'] != 'PUNCT' and isinstance(token['id'], int)
    ]

    root_count = sum(1 for token in words if token['deprel'] == 'root')
    if root_count != 1:
      return None

    total = 0
    for token in words:
      if token['deprel'] == 'root' or token['deprel'] in self._FLAT_RELATIONS:
        continue
      total += abs(int(token['id']) - int(token['head']))

    return total

  ############################################
  def depth(self, sentence):
    """Return the maximum depth of the dependency tree.

    The artificial root (id 0) has depth 0. A token is one level below its
    head, except tokens attached with flat:name, fixed or goeswith, which
    share their head's depth. Tokens whose 'id' is not an int are ignored.

    Returns:
      The maximum depth (0 for an empty sentence), or None when some
      token's head can never be resolved (cycle, missing or None head);
      such input previously made this method loop forever.
    """
    depths = {0: 0}  # Artificial root has depth 0

    pending = [token for token in sentence if isinstance(token['id'], int)]
    while pending:
      unresolved = []
      for token in pending:
        head_id = token['head']
        if head_id not in depths:
          # Head not placed yet; retry on the next pass.
          unresolved.append(token)
          continue
        if token['deprel'] in self._FLAT_RELATIONS:
          depths[token['id']] = depths[head_id]
        else:
          depths[token['id']] = depths[head_id] + 1

      if len(unresolved) == len(pending):
        # No token was placed in a full pass: the rest can never be resolved.
        return None
      pending = unresolved

    return max(depths.values())

  ############################################
  @staticmethod
  def projectivity(sentence):
    """Placeholder: not implemented yet (returns None).

    Declared static so it can be called both on the class, as before, and
    on an instance like the other scoring methods.
    """
    pass

  def run(self):
    """Score every tree of the input file and write one CSV per score.

    Each CSV has the columns sentence_id, text and score, and is saved in
    `output_dir` under the file name used as key in `functions`.

    NOTE(review): `Document`, `Conllu` and `pd` are not imported in the
    visible cells — confirm they are in scope before calling.
    """
    # Other available scores, currently disabled:
    # 'clause_count.csv': self.clauses
    # 'dep_distance.csv': self.distance
    functions = {'tree_depth.csv': self.depth}
    results = {name: [] for name in functions.keys()}

    doc = Document()
    reader = Conllu(files=[self.input_file])
    reader.process_document(doc)

    for tree in doc.trees:
      # Convert each node to the token-dict layout the scoring methods read.
      sentence = []
      for node in tree.descendants:
        token = {
          'id': node.ord,
          'form': node.form,
          'lemma': node.lemma,
          'upos': node.upos,
          'xpos': node.xpos,
          'feats': node.feats,
          'head': node.parent.ord if node.parent else 0,
          'deprel': node.deprel,
          'deps': node.deps,
          'misc': node.misc
        }
        sentence.append(token)

      sent_id = tree.sent_id
      text = tree.text

      for file, function in functions.items():
        score = function(sentence)
        results[file].append({
          'sentence_id': sent_id,
          'text': text,
          'score': score
        })

    for file, result in results.items():
      df = pd.DataFrame(result)
      out_path = self.output_dir / file
      df.to_csv(out_path, index=False)
      print(f'save {out_path} ok')


def main(
  input_file='/home/robin/Research/qtype-eval/src/UD-finnish-questions/content_questions_finnish_UD.conllu',
  output_dir='/home/robin/Research/qtype-eval/src/UD-finnish-questions/scores_content',
):
  """Score one CoNLL-U file with ComplexityScore and write the result CSVs.

  Args:
    input_file: CoNLL-U file to score. Defaults to the original hard-coded
      Finnish content-question file, so `main()` behaves as before.
    output_dir: Directory that receives the score CSVs.

  NOTE(review): this reuses the name `main` already defined by the
  sentence-length report earlier in the notebook; once this cell runs,
  `main` refers to this function.
  """
  scorer = ComplexityScore(input_file, output_dir)
  scorer.run()

# Entry-point guard: runs the scorer when executed as a script. In a notebook
# kernel `__name__` is also "__main__", so executing this cell runs it too.
if __name__ == "__main__":
  main()
