In [1]:
# Location of the source JSON dataset; it is opened and parsed with json.load
# by the sampling cell further down.
datapath = "./data/mintaka_test.json"

In [3]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Share of the dataset to keep in the sample (0.2 -> 20% of the rows).
sample_fraction = 0.2

# Read the raw records from disk and hold them in a DataFrame.
with open(datapath, mode="r", encoding="utf-8") as f:
    data = json.load(f)
df = pd.DataFrame(data)

# Split with stratification on the ('complexityType', 'category') pair.
# The first returned part (the "train" side) is the sample; the complementary
# part, sized 1 - sample_fraction, is discarded.
stratified_sample, _ = train_test_split(
    df,
    stratify=df[["complexityType", "category"]],
    test_size=1 - sample_fraction,
    random_state=42,
)

# Turn the sampled rows back into a list of per-row dicts and write them out
# as indented JSON without escaping non-ASCII characters.
sampled_data = stratified_sample.to_dict(orient="records")
with open("sampled_dataset.json", mode="w", encoding="utf-8") as f:
    json.dump(sampled_data, f, ensure_ascii=False, indent=4)

print(f"Stratified sample of {len(sampled_data)} items saved to 'sampled_dataset.json'")


Stratified sample of 800 items saved to 'sampled_dataset.json'


In [14]:
# Fraction of all rows that fall into each category.
df['category'].value_counts().div(len(df))

category
history       0.125
movies        0.125
music         0.125
videogames    0.125
sports        0.125
books         0.125
geography     0.125
politics      0.125
Name: count, dtype: float64

In [15]:
# Fraction of all rows that fall into each complexity type.
df['complexityType'].value_counts().div(len(df))

complexityType
generic         0.2
intersection    0.1
count           0.1
comparative     0.1
yesno           0.1
ordinal         0.1
multihop        0.1
difference      0.1
superlative     0.1
Name: count, dtype: float64

In [16]:
# Fraction of sampled rows per (category, complexityType) combination.
stratified_sample[['category', 'complexityType']].value_counts().div(len(stratified_sample))

category    complexityType
books       generic           0.0250
politics    generic           0.0250
videogames  generic           0.0250
sports      generic           0.0250
movies      generic           0.0250
                               ...  
videogames  intersection      0.0125
            multihop          0.0125
            ordinal           0.0125
            superlative       0.0125
            yesno             0.0125
Name: count, Length: 72, dtype: float64

In [12]:
# Fraction of sampled rows that fall into each complexity type.
stratified_sample['complexityType'].value_counts().div(len(stratified_sample))

complexityType
generic         0.2
yesno           0.1
multihop        0.1
superlative     0.1
ordinal         0.1
comparative     0.1
intersection    0.1
count           0.1
difference      0.1
Name: count, dtype: float64

In [32]:
def distribution(df, column: str, do_print=True):
    """
    Compute the relative frequency of each distinct value in one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the single column to summarise. Anything that is not a
        ``str`` is rejected.
    do_print : bool, default True
        When true, also print the result as a Markdown table using
        ``DataFrame.to_markdown``.

    Returns
    -------
    pandas.DataFrame
        ``df[column].value_counts(normalize=True)`` with its index reset, so
        the distinct values and their shares are ordinary columns.
    """
    # Only a single column name is supported; fail early on anything else.
    assert isinstance(column, str), "column must be a single column name (str)"

    # reset_index() on the value_counts Series already yields a DataFrame,
    # so no further wrapping is needed.
    dist_df = df[column].value_counts(normalize=True).reset_index()

    if do_print:
        # NOTE: to_markdown depends on the optional `tabulate` package.
        print(dist_df.to_markdown())
    return dist_df

In [34]:
# Print and return the per-category proportions of the full dataset.
distribution(df, column='category')

|    | category   |   proportion |
|---:|:-----------|-------------:|
|  0 | history    |        0.125 |
|  1 | movies     |        0.125 |
|  2 | music      |        0.125 |
|  3 | videogames |        0.125 |
|  4 | sports     |        0.125 |
|  5 | books      |        0.125 |
|  6 | geography  |        0.125 |
|  7 | politics   |        0.125 |


Unnamed: 0,category,proportion
0,history,0.125
1,movies,0.125
2,music,0.125
3,videogames,0.125
4,sports,0.125
5,books,0.125
6,geography,0.125
7,politics,0.125
