## Data Analysis

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
import seaborn as sns
import re
from collections import defaultdict
from IPython.display import display, HTML


  from .autonotebook import tqdm as notebook_tqdm


### dataset table

In [3]:
dataset = load_dataset("rokokot/question-type-and-complexity", name="base", split="train")
lang_map = {'ar': 'Arabic', 'en': 'English', 'fi': 'Finnish','id': 'Indonesian', 'ja': 'Japanese','ko': 'Korean', 'ru': 'Russian'}

splits = ['train', 'validation', 'test']
all_results = []

for split in splits:
  dataset = load_dataset('rokokot/question-type-and-complexity', name='base', split=split)
  
  for lang in list(lang_map.keys()):
    lang_data = dataset.filter(lambda x: x['language'] == lang)

    n_questions = len(lang_data)

    question_types = lang_data['question_type']

    polar_count = sum(1 for qt in question_types if qt == 1)
    content_count = sum(1 for qt in question_types if qt == 0)

    polar_pct = round((polar_count / n_questions) * 100, 1)
    content_pct = round((content_count / n_questions) * 100, 1)

    avg_complexity = round(np.mean(lang_data['complexity_score']), 2)

    all_results.append({'Language': lang_map[lang],'Questions': n_questions,'Polar (%)': polar_pct,'Content (%)': content_pct,'Avg. Complexity': avg_complexity})
stats_df = pd.DataFrame(all_results)
print(stats_df.to_string(index=False))

  Language  Questions  Polar (%)  Content (%)  Avg. Complexity
    Arabic        995       49.9         50.1             1.50
   English       1192       50.0         50.0             1.60
   Finnish       1195       50.0         50.0             1.37
Indonesian        954       47.9         52.1             1.86
  Japanese       1191       50.0         50.0             1.60
    Korean        739       46.1         53.9             1.97
   Russian       1194       50.0         50.0             1.76
    Arabic         44       45.5         54.5             1.73
   English         72       50.0         50.0             1.74
   Finnish         63       47.6         52.4             1.64
Indonesian         72       50.0         50.0             2.01
  Japanese         46       52.2         47.8             1.71
    Korean         72       50.0         50.0             2.05
   Russian         72       50.0         50.0             1.83
    Arabic         77       28.6         71.4          

In [4]:
def analyze_averages():
    splits = ["train", "validation", "test"]
    lang_map = {
        'ar': 'Arabic', 'en': 'English', 'fi': 'Finnish',
        'id': 'Indonesian', 'ja': 'Japanese',
        'ko': 'Korean', 'ru': 'Russian'
    }
    
    combined_stats = {lang: {'Questions': 0, 'Polar': 0, 'Content': 0, 'Complexity': []} 
                     for lang in lang_map.values()}
    
    total_questions = 0

    for split in splits:
        try:
            dataset = load_dataset("rokokot/question-type-and-complexity", name="base", split=split)
            
            for lang_code, lang_name in lang_map.items():
                lang_data = dataset.filter(lambda x: x['language'] == lang_code)
                
                if len(lang_data) == 0:
                    print(f"No data for {lang_name} in {split} split")
                    continue
                
                combined_stats[lang_name]['Questions'] += len(lang_data)
                total_questions += len(lang_data)
                
                question_types = lang_data['question_type']
                polar_count = sum(1 for qt in question_types if qt == 1)
                combined_stats[lang_name]['Polar'] += polar_count
                combined_stats[lang_name]['Content'] += (len(lang_data) - polar_count)
                
                combined_stats[lang_name]['Complexity'].extend(lang_data['complexity_score'])
                
        except Exception as e:
            print(f"Error processing {split} split: {e}")
    
    results = []
    for lang_name, stats in combined_stats.items():
        if stats['Questions'] > 0:
            polar_pct = round((stats['Polar'] / stats['Questions']) * 100, 1)
            content_pct = round((stats['Content'] / stats['Questions']) * 100, 1)
            avg_complexity = round(np.mean(stats['Complexity']), 2) if stats['Complexity'] else 0
            dataset_pct = round((stats['Questions'] / total_questions) * 100, 1)

            results.append({'Language': lang_name,'Dataset %': dataset_pct,'Polar %': polar_pct,'Content %': content_pct,'Avg. Complexity': avg_complexity
            })
    
    stats_df = pd.DataFrame(results)
    
    return stats_df

stats_df = analyze_averages()
print(stats_df.to_string(index=False))


Filter: 100%|██████████| 7460/7460 [00:00<00:00, 14698.16 examples/s]
Filter: 100%|██████████| 7460/7460 [00:00<00:00, 17265.90 examples/s]
Filter: 100%|██████████| 7460/7460 [00:00<00:00, 16705.33 examples/s]
Filter: 100%|██████████| 7460/7460 [00:00<00:00, 16333.39 examples/s]
Filter: 100%|██████████| 7460/7460 [00:00<00:00, 16168.38 examples/s]
Filter: 100%|██████████| 7460/7460 [00:00<00:00, 14513.94 examples/s]
Filter: 100%|██████████| 7460/7460 [00:00<00:00, 10511.51 examples/s]
Filter: 100%|██████████| 441/441 [00:00<00:00, 8500.91 examples/s]
Filter: 100%|██████████| 441/441 [00:00<00:00, 5911.36 examples/s]
Filter: 100%|██████████| 441/441 [00:00<00:00, 6433.41 examples/s]
Filter: 100%|██████████| 441/441 [00:00<00:00, 7682.32 examples/s]
Filter: 100%|██████████| 441/441 [00:00<00:00, 8276.27 examples/s]
Filter: 100%|██████████| 441/441 [00:00<00:00, 8150.38 examples/s]
Filter: 100%|██████████| 441/441 [00:00<00:00, 5701.61 examples/s]
Filter: 100%|██████████| 719/719 [00:00<0

  Language  Dataset %  Polar %  Content %  Avg. Complexity
    Arabic       12.9     48.3       51.7             1.55
   English       15.9     50.0       50.0             1.61
   Finnish       15.9     49.9       50.1             1.40
Indonesian       13.2     48.2       51.8             1.88
  Japanese       15.4     50.8       49.2             1.66
    Korean       10.7     46.9       53.1             1.98
   Russian       16.0     50.0       50.0             1.76
