<a href="https://colab.research.google.com/github/sketcher03/learning_app_v1/blob/main/Word_Difficulty.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk scikit-learn pandas syllapy textblob

Collecting syllapy
  Downloading syllapy-0.7.2-py3-none-any.whl.metadata (854 bytes)
Downloading syllapy-0.7.2-py3-none-any.whl (24 kB)
Installing collected packages: syllapy
Successfully installed syllapy-0.7.2


In [None]:
import nltk
import pandas as pd
from nltk.corpus import cmudict

In [None]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('cmudict')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


True

In [None]:
# Initialize CMU Pronouncing Dictionary for syllable count
# NOTE(review): `d` is not referenced again in the visible notebook; the syllable
# counts computed later come from syllapy. Confirm whether this load is still needed.
d = cmudict.dict()

In [None]:
# Load dataset
file_path = '/content/lemmas_60k.xlsx'
df = pd.read_excel(file_path)

## Preprocessing

In [None]:
df

Unnamed: 0,rank,lemma,PoS,freq,perMil,%caps,%allC,range,disp,blog,...,news,acad,blogPM,webPM,TVMPM,spokPM,ficPM,magPM,newsPM,acadPM
0,5,of,i,23159162,23321.89,0.01,0.00,477933,0.97,2897295,...,2867922,4500485,23317.58,26588.36,10918.77,20157.43,19270.82,25796.39,23557.38,37569.65
1,15,do,v,8186412,8244.70,0.10,0.01,415641,0.98,1052753,...,557889,321166,8472.61,6888.18,18344.18,11672.50,8055.41,4716.30,4582.55,2681.07
2,25,they,p,4503650,4535.29,0.23,0.00,382938,0.98,626530,...,493994,345418,5042.35,4390.60,4129.95,7414.70,4606.30,3683.25,4057.71,2883.52
3,35,she,p,3188078,3210.48,0.32,0.00,206627,0.93,181410,...,295580,87513,1460.00,1693.13,3771.82,2812.08,10844.81,2263.90,2427.92,730.55
4,45,about,i,2427703,2444.76,0.02,0.01,350741,0.98,360952,...,231251,158515,2904.96,2328.58,3194.97,3844.24,2097.08,1865.94,1899.52,1323.27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6095,60955,interatomic,j,42,0.04,0.00,0.00,23,0.35,1,...,0,37,0.01,0.01,0.00,0.00,0.00,0.02,0.00,0.31
6096,60965,horizontality,n,42,0.04,0.00,0.00,28,0.63,6,...,4,23,0.05,0.02,0.00,0.00,0.01,0.04,0.03,0.19
6097,60975,white-noise,j,42,0.04,0.00,0.00,40,0.73,0,...,6,6,0.00,0.02,0.00,0.02,0.09,0.11,0.05,0.05
6098,60985,vending-machine,j,42,0.04,0.07,0.00,39,0.72,1,...,13,2,0.01,0.00,0.02,0.02,0.05,0.13,0.11,0.02


In [None]:
# Remove columns that are not needed for the difficulty features
columns_to_remove = [
    '%caps', '%allC', 'range', 'disp',
    'blog', 'web', 'TVM', 'spok', 'fic', 'mag', 'news', 'acad',
    'blogPM', 'webPM', 'TVMPM', 'spokPM', 'ficPM', 'magPM', 'newsPM', 'acadPM',
    'rank',
]
# Rebind rather than mutate in place; the frame bound to `df` afterwards is the same.
df = df.drop(columns=columns_to_remove)

In [None]:
df

Unnamed: 0,lemma,PoS,freq,perMil
0,of,i,23159162,23321.89
1,do,v,8186412,8244.70
2,they,p,4503650,4535.29
3,she,p,3188078,3210.48
4,about,i,2427703,2444.76
...,...,...,...,...
6095,interatomic,j,42,0.04
6096,horizontality,n,42,0.04
6097,white-noise,j,42,0.04
6098,vending-machine,j,42,0.04


In [None]:
# Remove very frequent words that are too easy for spelling tests (e.g., articles, prepositions, etc.)
too_easy_words = ['the', 'a', 'an', 'in', 'of', 'to', 'for', 'and', 'is', 'are', 'was', 'were', 'be', 'he', 'she', 'it', 'do', 'its', 'off', 'no', 'as', 'tv']
# .copy() makes the filtered result an independent frame, so later column
# assignments on `df` do not raise SettingWithCopyWarning.
df = df[~df['lemma'].isin(too_easy_words)].copy()

In [None]:
# Step 1: Split hyphenated lemmas into their parts; a lemma without a hyphen
# becomes a one-element list. assign() builds a new frame, so no value is set on
# a slice of the previously filtered DataFrame (avoids SettingWithCopyWarning).
df = df.assign(lemma=df['lemma'].str.split('-'))

# Step 2: Explode the 'lemma' column so that each word in the hyphenated phrase gets its own row
df = df.explode('lemma', ignore_index=True)

# Step 3: Remove duplicate lemmas (the first occurrence of each lemma is kept)
df = df.drop_duplicates(subset=['lemma'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemma'] = df['lemma'].apply(lambda x: x.split('-') if '-' in x else [x])


In [None]:
df

Unnamed: 0,lemma,PoS,freq,perMil
0,they,p,4503650,4535.29
1,about,i,2427703,2444.76
2,there,e,1980173,1994.09
3,him,p,1717209,1729.27
4,could,v,1529795,1540.54
...,...,...,...,...
7210,interatomic,j,42,0.04
7211,horizontality,n,42,0.04
7214,vending,j,42,0.04
7215,machine,j,42,0.04


## Feature Extraction

In [None]:
import syllapy

# Syllable counting is delegated to syllapy
def syllable_count(word):
    """Return the number of syllables syllapy reports for `word`."""
    n_syllables = syllapy.count(word)
    return n_syllables

# Define function to calculate word length and syllable count
def word_length(word):
    """Return the number of characters in `word`."""
    n_chars = len(word)
    return n_chars

def word_syllable(word):
    """Return the syllable count of `word` (thin wrapper around syllable_count)."""
    n_syllables = syllable_count(word)
    return n_syllables

# Define phonological complexity feature
def phonological_complexity(word):
    """Count how many of a fixed set of letter clusters occur in `word`.

    The word is lower-cased before matching. Each cluster adds at most 1 to the
    score regardless of how often it appears, and overlapping clusters are
    counted separately (a word containing 'ough' also matches 'ou' and 'gh').
    """
    # Clusters such as 'thr', 'str', 'ph' that the score is based on
    clusters = ('str', 'thr', 'ph', 'ch', 'sh', 'kn', 'wr', 'wh', 'gh', 'tch',
                'ou', 'ough', 'nt', 'nd', 'ed', 'ing', 'ie')
    lowered = word.lower()
    return sum(1 for cluster in clusters if cluster in lowered)

In [None]:
# Apply feature extraction to the dataset: one new column per extractor,
# each computed element-wise from the lemma text
feature_extractors = {
    'length': word_length,
    'syllable_count': word_syllable,
    'phonological_complexity': phonological_complexity,
}
for feature_name, extractor in feature_extractors.items():
    df[feature_name] = df['lemma'].apply(extractor)


In [None]:
df

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity
0,they,p,4503650,4535.29,4,1,0
1,about,i,2427703,2444.76,5,2,1
2,there,e,1980173,1994.09,5,1,0
3,him,p,1717209,1729.27,3,1,0
4,could,v,1529795,1540.54,5,1,1
...,...,...,...,...,...,...,...
7210,interatomic,j,42,0.04,11,5,1
7211,horizontality,n,42,0.04,13,6,1
7214,vending,j,42,0.04,7,2,2
7215,machine,j,42,0.04,7,2,1


In [None]:
# Drop rows with any NaN values (rebinding `df` rather than mutating it in place)
df = df.dropna()

In [None]:
df

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity
0,they,p,4503650,4535.29,4,1,0
1,about,i,2427703,2444.76,5,2,1
2,there,e,1980173,1994.09,5,1,0
3,him,p,1717209,1729.27,3,1,0
4,could,v,1529795,1540.54,5,1,1
...,...,...,...,...,...,...,...
7210,interatomic,j,42,0.04,11,5,1
7211,horizontality,n,42,0.04,13,6,1
7214,vending,j,42,0.04,7,2,2
7215,machine,j,42,0.04,7,2,1


In [None]:
import re

# Drop rows where 'lemma' contains any digits (numbers).
# .copy() detaches the filtered result from the original frame so that adding
# columns to `df` later does not raise SettingWithCopyWarning.
df = df[~df['lemma'].str.contains(r'\d', regex=True)].copy()

In [None]:
df

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity
0,they,p,4503650,4535.29,4,1,0
1,about,i,2427703,2444.76,5,2,1
2,there,e,1980173,1994.09,5,1,0
3,him,p,1717209,1729.27,3,1,0
4,could,v,1529795,1540.54,5,1,1
...,...,...,...,...,...,...,...
7210,interatomic,j,42,0.04,11,5,1
7211,horizontality,n,42,0.04,13,6,1
7214,vending,j,42,0.04,7,2,2
7215,machine,j,42,0.04,7,2,1


## Age Group Assignment

In [None]:
# Define a function to assign age group first
def assign_age_group(row):
    """Map a word's length and syllable count to a target age band.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'length' and 'syllable_count'. No other keys are read.

    Returns:
        One of '5-6', '7-8', '9-10' or '11-12'.
    """
    # Named `length` so the local does not shadow the word_length() helper.
    length = row['length']
    syllables = row['syllable_count']

    # Bands are checked from youngest to oldest; the first band whose length
    # AND syllable limits are both satisfied wins.
    if length <= 6 and syllables <= 2:
        return '5-6'
    if length <= 9 and syllables <= 3:
        return '7-8'
    if length <= 12 and syllables <= 4:
        return '9-10'
    return '11-12'

In [None]:
# Apply age group assignment to the dataframe
df['age_group'] = df.apply(assign_age_group, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['age_group'] = df.apply(assign_age_group, axis=1)


In [None]:
df = df[df['length'] > 2]

In [None]:
df['age_group'].value_counts()

Unnamed: 0_level_0,count
age_group,Unnamed: 1_level_1
7-8,2295
5-6,2043
9-10,1171
11-12,511


In [None]:
# Create separate dataframes for each age group.
# Each subset is copied so that columns added to it later are written to an
# independent frame rather than a slice of `df` (avoids SettingWithCopyWarning).
df1 = df[df['age_group'] == '5-6'].copy()  # Age group 5-6
df2 = df[df['age_group'] == '7-8'].copy()  # Age group 7-8
df3 = df[df['age_group'] == '9-10'].copy()  # Age group 9-10
df4 = df[df['age_group'] == '11-12'].copy()  # Age group 11-12

## 5-6 years

In [None]:
df1

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity,age_group
0,they,p,4503650,4535.29,4,1,0,5-6
1,about,i,2427703,2444.76,5,2,1,5-6
2,there,e,1980173,1994.09,5,1,0,5-6
3,him,p,1717209,1729.27,3,1,0,5-6
4,could,v,1529795,1540.54,5,1,1,5-6
...,...,...,...,...,...,...,...,...
7195,skald,n,42,0.04,5,1,0,5-6
7202,carry,j,42,0.04,5,2,0,5-6
7206,janus,j,42,0.04,5,2,0,5-6
7208,idiot,j,42,0.04,5,2,0,5-6


In [None]:
# Bucket the raw frequency as low, medium, or high (thresholds for ages 5-6)
def categorize_frequency56(freq):
    """Return 'low' below 200, 'medium' from 200 up to (not including) 3000, else 'high'."""
    if freq < 200:
        return 'low'
    if 200 <= freq < 3000:
        return 'medium'
    return 'high'

df1['frequency_category'] = df1['freq'].apply(categorize_frequency56)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['frequency_category'] = df1['freq'].apply(categorize_frequency56)


In [None]:
df1['length'].value_counts()

Unnamed: 0_level_0,count
length,Unnamed: 1_level_1
6,736
5,617
4,528
3,162


In [None]:
# Difficulty labelling for the 5-6 age group
def assign_difficulty_for_age_group56(row):
    """Label a word as 'easy', 'moderate' or 'hard' for ages 5-6.

    Reads only the row's 'length' and 'frequency_category' values:
    length <= 4 with 'high' frequency is 'easy', length <= 5 with 'medium'
    frequency is 'moderate', and everything else is 'hard'.
    """
    n_letters = row['length']
    freq_band = row['frequency_category']

    if n_letters <= 4 and freq_band == 'high':
        return 'easy'
    if n_letters <= 5 and freq_band == 'medium':
        return 'moderate'
    # Falls through for every low-frequency word and for longer words.
    return 'hard'


In [None]:
# Apply difficulty assignment to each of the age group dataframes
df1['difficulty'] = df1.apply(assign_difficulty_for_age_group56, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['difficulty'] = df1.apply(assign_difficulty_for_age_group56, axis=1)


In [None]:
df1

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity,age_group,frequency_category,difficulty
0,they,p,4503650,4535.29,4,1,0,5-6,high,easy
1,about,i,2427703,2444.76,5,2,1,5-6,high,hard
2,there,e,1980173,1994.09,5,1,0,5-6,high,hard
3,him,p,1717209,1729.27,3,1,0,5-6,high,easy
4,could,v,1529795,1540.54,5,1,1,5-6,high,hard
...,...,...,...,...,...,...,...,...,...,...
7195,skald,n,42,0.04,5,1,0,5-6,low,hard
7202,carry,j,42,0.04,5,2,0,5-6,low,hard
7206,janus,j,42,0.04,5,2,0,5-6,low,hard
7208,idiot,j,42,0.04,5,2,0,5-6,low,hard


In [None]:
df1['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
hard,1293
moderate,560
easy,190


In [None]:
df1.to_csv('Age 5-6 years.csv', index=False)

## 7-8 years

In [None]:
df2

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity,age_group
16,anything,p,384508,387.21,8,3,1,7-8
18,percent,n,357515,360.03,7,2,1,7-8
21,service,n,332313,334.65,7,2,0,7-8
22,however,r,326015,328.35,7,3,0,7-8
31,consider,v,244644,246.36,8,3,0,7-8
...,...,...,...,...,...,...,...,...
7199,amassed,j,42,0.04,7,3,1,7-8
7200,assisted,j,42,0.04,8,3,1,7-8
7201,suicide,j,42,0.04,7,3,0,7-8
7214,vending,j,42,0.04,7,2,2,7-8


In [None]:
# Bucket the raw frequency as low, medium, or high (thresholds for ages 7-8)
def categorize_frequency78(freq):
    """Return 'low' below 200, 'medium' from 200 up to (not including) 1000, else 'high'."""
    if freq < 200:
        return 'low'
    if 200 <= freq < 1000:
        return 'medium'
    return 'high'

# Use the 7-8 thresholds (categorize_frequency78); categorize_frequency56 would
# apply the 5-6 cut-offs to this age group instead.
df2['frequency_category'] = df2['freq'].apply(categorize_frequency78)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['frequency_category'] = df2['freq'].apply(categorize_frequency56)


In [None]:
df2['frequency_category'].value_counts()

Unnamed: 0_level_0,count
frequency_category,Unnamed: 1_level_1
medium,996
low,893
high,406


In [None]:
df2['length'].value_counts()

Unnamed: 0_level_0,count
length,Unnamed: 1_level_1
7,867
8,760
9,603
6,55
5,10


In [None]:
# Difficulty labelling for the 7-8 age group
def assign_difficulty_for_age_group78(row):
    """Label a word as 'easy', 'moderate' or 'hard' for ages 7-8.

    Reads only the row's 'length' and 'frequency_category' values:
    length exactly 9 is 'hard', length <= 8 with 'medium' frequency is
    'moderate', and everything else is 'easy'.
    """
    n_letters = row['length']
    freq_band = row['frequency_category']

    if n_letters == 9:
        return 'hard'
    if n_letters <= 8 and freq_band == 'medium':
        return 'moderate'
    # NOTE(review): low-frequency words of length <= 8 end up here and are
    # labelled 'easy' — confirm this is intended.
    return 'easy'


In [None]:
# Apply difficulty assignment to each of the age group dataframes
df2['difficulty'] = df2.apply(assign_difficulty_for_age_group78, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['difficulty'] = df2.apply(assign_difficulty_for_age_group78, axis=1)


In [None]:
df2

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity,age_group,frequency_category,difficulty
16,anything,p,384508,387.21,8,3,1,7-8,high,easy
18,percent,n,357515,360.03,7,2,1,7-8,high,easy
21,service,n,332313,334.65,7,2,0,7-8,high,easy
22,however,r,326015,328.35,7,3,0,7-8,high,easy
31,consider,v,244644,246.36,8,3,0,7-8,high,easy
...,...,...,...,...,...,...,...,...,...,...
7199,amassed,j,42,0.04,7,3,1,7-8,low,easy
7200,assisted,j,42,0.04,8,3,1,7-8,low,easy
7201,suicide,j,42,0.04,7,3,0,7-8,low,easy
7214,vending,j,42,0.04,7,2,2,7-8,low,easy


In [None]:
df2['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
easy,963
moderate,729
hard,603


In [None]:
df2.to_csv('Age 7-8 years.csv', index=False)

## 9-10 years

In [None]:
df3

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity,age_group
39,security,n,201542,202.96,8,4,0,9-10
68,performance,n,127318,128.21,11,3,0,9-10
72,identify,v,120690,121.54,8,4,1,9-10
95,particular,j,92775,93.43,10,4,0,9-10
109,participant,n,81371,81.94,11,4,1,9-10
...,...,...,...,...,...,...,...,...
7177,entablature,n,43,0.04,11,4,1,9-10
7179,daydreamer,n,43,0.04,10,3,0,9-10
7180,scrutinized,j,43,0.04,11,4,1,9-10
7184,consolingly,r,42,0.04,11,4,1,9-10


In [None]:
# Bucket the raw frequency as low, medium, or high (thresholds for ages 9-10)
def categorize_frequency910(freq):
    """Return 'low' below 200, 'medium' from 200 up to (not including) 700, else 'high'."""
    if freq < 200:
        return 'low'
    if 200 <= freq < 700:
        return 'medium'
    return 'high'

# Use the 9-10 thresholds (categorize_frequency910); categorize_frequency56 would
# apply the 5-6 cut-offs to this age group instead.
df3['frequency_category'] = df3['freq'].apply(categorize_frequency910)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['frequency_category'] = df3['freq'].apply(categorize_frequency56)


In [None]:
df3['frequency_category'].value_counts()

Unnamed: 0_level_0,count
frequency_category,Unnamed: 1_level_1
medium,532
low,487
high,152


In [None]:
df3['length'].value_counts()

Unnamed: 0_level_0,count
length,Unnamed: 1_level_1
10,526
11,307
12,138
9,132
8,62
7,6


In [None]:
# Difficulty labelling for the 9-10 age group
def assign_difficulty_for_age_group910(row):
    """Label a word as 'easy', 'moderate' or 'hard' for ages 9-10.

    Reads only the row's 'length' and 'frequency_category' values:
    length >= 11 is 'hard', shorter words with 'medium' frequency are
    'moderate', and everything else is 'easy'.
    """
    n_letters = row['length']
    freq_band = row['frequency_category']

    if n_letters >= 11:
        return 'hard'
    # The length check is kept from the original rule; for integer lengths it
    # is always true once the >= 11 case has returned.
    if n_letters <= 11 and freq_band == 'medium':
        return 'moderate'
    return 'easy'


In [None]:
# Apply difficulty assignment to each of the age group dataframes
df3['difficulty'] = df3.apply(assign_difficulty_for_age_group910, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['difficulty'] = df3.apply(assign_difficulty_for_age_group910, axis=1)


In [None]:
df3

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity,age_group,frequency_category,difficulty
39,security,n,201542,202.96,8,4,0,9-10,high,easy
68,performance,n,127318,128.21,11,3,0,9-10,high,hard
72,identify,v,120690,121.54,8,4,1,9-10,high,easy
95,particular,j,92775,93.43,10,4,0,9-10,high,easy
109,participant,n,81371,81.94,11,4,1,9-10,high,hard
...,...,...,...,...,...,...,...,...,...,...
7177,entablature,n,43,0.04,11,4,1,9-10,low,hard
7179,daydreamer,n,43,0.04,10,3,0,9-10,low,easy
7180,scrutinized,j,43,0.04,11,4,1,9-10,low,hard
7184,consolingly,r,42,0.04,11,4,1,9-10,low,hard


In [None]:
df3['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
hard,445
easy,395
moderate,331


In [None]:
df3.to_csv('Age 9-10 years.csv', index=False)

## 11-12 years

In [None]:
df4

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity,age_group
83,particularly,r,104018,104.75,12,5,0,11-12
122,understanding,n,72651,73.16,13,4,2,11-12
128,possibility,n,69316,69.80,11,5,0,11-12
274,constitutional,j,28065,28.26,14,5,0,11-12
287,biological,j,26435,26.62,10,5,0,11-12
...,...,...,...,...,...,...,...,...
7165,oxygenating,j,43,0.04,11,5,1,11-12
7174,neuroanatomy,n,43,0.04,12,5,0,11-12
7193,osteonecrosis,n,42,0.04,13,5,0,11-12
7210,interatomic,j,42,0.04,11,5,1,11-12


In [None]:
# Bucket the raw frequency as low, medium, or high (thresholds for ages 11-12)
def categorize_frequency112(freq):
    """Return 'low' below 140, 'medium' from 140 up to (not including) 400, else 'high'."""
    if freq < 140:
        return 'low'
    if 140 <= freq < 400:
        return 'medium'
    return 'high'

df4['frequency_category'] = df4['freq'].apply(categorize_frequency112)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['frequency_category'] = df4['freq'].apply(categorize_frequency112)


In [None]:
df4['frequency_category'].value_counts()

Unnamed: 0_level_0,count
frequency_category,Unnamed: 1_level_1
low,210
high,162
medium,139


In [None]:
df4['length'].value_counts()

Unnamed: 0_level_0,count
length,Unnamed: 1_level_1
13,153
12,99
14,86
11,63
15,46
16,27
10,21
17,9
18,3
20,2


In [None]:
# Difficulty labelling for the 11-12 age group
def assign_difficulty_for_age_group112(row):
    """Label a word as 'easy', 'moderate' or 'hard' for ages 11-12.

    Reads only the row's 'length' and 'frequency_category' values:
    length >= 15 or 'low' frequency is 'hard', remaining words with 'medium'
    frequency are 'moderate', and everything else is 'easy'.
    """
    n_letters = row['length']
    freq_band = row['frequency_category']

    if n_letters >= 15 or freq_band == 'low':
        return 'hard'
    # The length check is kept from the original rule; for integer lengths it
    # is always true once the >= 15 case has returned.
    if n_letters <= 14 and freq_band == 'medium':
        return 'moderate'
    return 'easy'


In [None]:
# Apply difficulty assignment to each of the age group dataframes
df4['difficulty'] = df4.apply(assign_difficulty_for_age_group112, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['difficulty'] = df4.apply(assign_difficulty_for_age_group112, axis=1)


In [None]:
df4

Unnamed: 0,lemma,PoS,freq,perMil,length,syllable_count,phonological_complexity,age_group,frequency_category,difficulty
83,particularly,r,104018,104.75,12,5,0,11-12,high,easy
122,understanding,n,72651,73.16,13,4,2,11-12,high,easy
128,possibility,n,69316,69.80,11,5,0,11-12,high,easy
274,constitutional,j,28065,28.26,14,5,0,11-12,high,easy
287,biological,j,26435,26.62,10,5,0,11-12,high,easy
...,...,...,...,...,...,...,...,...,...,...
7165,oxygenating,j,43,0.04,11,5,1,11-12,low,hard
7174,neuroanatomy,n,43,0.04,12,5,0,11-12,low,hard
7193,osteonecrosis,n,42,0.04,13,5,0,11-12,low,hard
7210,interatomic,j,42,0.04,11,5,1,11-12,low,hard


In [None]:
df4['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
hard,257
easy,140
moderate,114


In [None]:
df4.to_csv('Age 11-12 years.csv', index=False)

## Counts

In [None]:
print(f'5-6 years: {len(df1)}')
print(f'7-8 years: {len(df2)}')
print(f'9-10 years: {len(df3)}')
print(f'11-12 years: {len(df4)}')

5-6 years: 2043
7-8 years: 2295
9-10 years: 1171
11-12 years: 511
