In [3]:
from tqdm import tqdm
from bs4.element import Tag
from bs4 import BeautifulSoup as bs
from os.path import splitext
from time import time
import datetime
import pandas as pd
import os

In [None]:
# Path of the XML file that parse_xml_to_df() reads below. It is a relative
# path, so it is resolved against the notebook's current working directory.
xml_file = 'EFCAMDAT_Database.xml' 

## Extract the XML file to CSV

In [None]:
from tqdm import tqdm
from bs4.element import Tag
from bs4 import BeautifulSoup as bs
from os.path import splitext
from time import time
import datetime


def parse_xml_to_df(xml_path):
    """Turn every ``<writing>`` element of an XML file into one DataFrame row.

    A column is only filled for a row when the matching attribute or child
    element is present on that ``<writing>`` element:

    * ``id``, ``level``, ``unit``: attributes of ``<writing>`` itself
    * ``learner_id``, ``learner_nationality``: the ``id`` / ``nationality``
      attributes of the ``<learner>`` child
    * ``grade``, ``date``: text content of the ``<grade>`` / ``<date>`` children
    * ``topic_id``: the ``id`` attribute of the ``<topic>`` child
    * ``text``: text of the ``<text>`` child, with each piece stripped and the
      pieces joined by single spaces

    Args:
        xml_path: Path of the XML file. The file is read fully into memory and
            decoded as UTF-8.

    Returns:
        pandas.DataFrame built from the collected row dictionaries, one row
        per ``<writing>`` element, in document order.
    """
    with open(xml_path, "r", encoding="utf-8") as handle:
        markup = handle.read()

    # NOTE(review): "lxml" selects BeautifulSoup's lxml-based HTML parser, not
    # its XML parser ("xml") -- confirm this is intentional for this corpus.
    soup = bs(markup, "lxml")

    writing_attrs = ('id', 'level', 'unit')
    learner_attrs = (('id', 'learner_id'), ('nationality', 'learner_nationality'))
    text_children = ('grade', 'date')

    records = []
    for entry in tqdm(soup.find_all("writing")):
        record = {}

        # Attributes carried directly on the <writing> element.
        for attr in writing_attrs:
            if entry.has_attr(attr):
                record[attr] = entry[attr]

        # Attributes of the nested <learner> element, stored under prefixed keys.
        learner = entry.find('learner')
        if learner:
            for attr, column in learner_attrs:
                if learner.has_attr(attr):
                    record[column] = learner[attr]

        # Children whose raw text content is stored as-is.
        for child_name in text_children:
            child = entry.find(child_name)
            if child:
                record[child_name] = child.text

        topic = entry.find('topic')
        if topic and topic.has_attr('id'):
            record['topic_id'] = topic['id']

        # The learner's text: whitespace-stripped fragments joined by spaces.
        body = entry.find('text')
        if body:
            record['text'] = body.get_text(separator=' ', strip=True)

        records.append(record)

    return pd.DataFrame(records)

# Parse the XML file into a DataFrame and print its first rows as a quick
# sanity check of the extracted columns.
df = parse_xml_to_df(xml_file)
print(df.head())

## Use CEFR numeric rubric for classifications

In [None]:
def cefr_score(x):
    """Map a level number to a CEFR code between 1 and 6.

    ``x`` is first converted with ``int()``. Levels 1-15 are bucketed in runs
    of three (1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4, 13-15 -> 5). Every
    other integer, including values below 1, maps to 6.

    Args:
        x: Level value; anything ``int()`` accepts (int, numeric string, float).

    Returns:
        int: The CEFR code, 1 through 6.
    """
    level = int(x)
    # Anything outside the five three-level buckets falls into the top code.
    if level < 1 or level > 15:
        return 6
    return (level - 1) // 3 + 1

In [None]:
# Derive the 1-6 CEFR code of every row from its 'level' value.
df['cefr_numeric'] = df['level'].apply(cefr_score)

In [None]:
# Number of rows per CEFR code, most frequent first.
df['cefr_numeric'].value_counts()

In [None]:
def group_cefr(x):
    """Collapse a CEFR code into one of three coarse groups.

    Args:
        x: Numeric CEFR code.

    Returns:
        int: 1 when ``x`` is below 3, 2 when ``x`` is at least 3 and below 5,
        and 3 for everything else.
    """
    if x < 3:
        return 1
    if x < 5:
        return 2
    return 3

In [None]:
# Collapse the six CEFR codes into the three coarse groups.
df['cefr_grouped'] = df['cefr_numeric'].apply(group_cefr)

## Save to CSV

In [6]:
# index=False keeps the DataFrame's positional index out of the file. Without
# it, loading the CSV back with pd.read_csv yields a spurious "Unnamed: 0"
# column that then propagates into everything derived from the reloaded frame.
df.to_csv("efcamdat_full.csv", index=False)

## Brief EDA

In [14]:
# Reload the full dataset from the CSV file written by the save step above.
df = pd.read_csv("efcamdat_full.csv")

In [8]:
# Row counts per CEFR code in the reloaded data, most frequent first.
df['cefr_numeric'].value_counts()

1    625985
2    307996
3    168361
4     61329
5     14698
6      1940
Name: cefr_numeric, dtype: int64

In [11]:
# Share of rows per CEFR code. The counts are computed once and reused.
# The printed values are fractions of len(df) (between 0 and 1); they are not
# multiplied by 100 despite the "%" in the label.
cefr_counts = df['cefr_numeric'].value_counts()
for cefr_code, message in (
    (1, "The % of A1s is: "),
    (2, "The % of A2s is: "),
    (3, "The % of B1s is: "),
    (4, "The % of B2s is: "),
    (5, "The % of C1s is: "),
    (6, "The % of C2s is: "),
):
    print(message, cefr_counts[cefr_code] / len(df))

The % of A1s is:  0.5303568811218079
The % of A2s is:  0.26094522705494916
The % of B1s is:  0.14264146083779755
The % of B2s is:  0.051960122306955216
The % of C1s is:  0.012452671292009127
The % of C2s is:  0.0016436373864809977


In [12]:
# Share of rows per coarse CEFR group. The printed values are fractions of
# len(df) (between 0 and 1), not multiplied by 100.
grouped_counts = df['cefr_grouped'].value_counts()
for group_code, message in (
    (1, "The % of As is: "),
    (2, "The % of Bs is: "),
    (3, "The % of Cs is: "),
):
    print(message, grouped_counts[group_code] / len(df))

The % of As is:  0.7913021081767571
The % of Bs is:  0.19460158314475276
The % of Cs is:  0.014096308678490124


## Create Subset to Use

Because there is such a large imbalance in the data, we will create a more balanced subset, which will be used for training and testing.

In [18]:
# Row counts per CEFR code, shown again as the baseline for the subsetting below.
df['cefr_numeric'].value_counts()

1    625985
2    307996
3    168361
4     61329
5     14698
6      1940
Name: cefr_numeric, dtype: int64

In [33]:
# Keep every row of the three least frequent codes (4, 5 and 6) unchanged.
# .copy() detaches the selection from df.
small = df.loc[df['cefr_numeric'].isin([4, 5, 6])].copy()

In [34]:
# Downsample code 1 to 100,000 rows and stack it on top of the rare-code rows.
# A fixed random_state makes the draw repeatable across kernel restarts;
# without it every run produces a different subset.
sub = pd.concat([df.loc[(df['cefr_numeric'] == 1)].sample(100000, random_state=42), small])

In [35]:
# Downsample code 2 to 100,000 rows and prepend it to the subset built so far.
# A fixed random_state makes the draw repeatable across kernel restarts;
# without it every run produces a different subset.
sub = pd.concat([df.loc[(df['cefr_numeric'] == 2)].sample(100000, random_state=42), sub])

In [36]:
# Downsample code 3 to 100,000 rows and prepend it to the subset built so far.
# A fixed random_state makes the draw repeatable across kernel restarts;
# without it every run produces a different subset.
sub = pd.concat([df.loc[(df['cefr_numeric'] == 3)].sample(100000, random_state=42), sub])

In [37]:
# Row counts per CEFR code in the subset, to confirm the downsampling.
sub['cefr_numeric'].value_counts()

3    100000
2    100000
1    100000
4     61329
5     14698
6      1940
Name: cefr_numeric, dtype: int64

In [39]:
# Share of rows per CEFR code within the subset. The printed values are
# fractions of len(sub) (between 0 and 1), not multiplied by 100.
sub_counts = sub['cefr_numeric'].value_counts()
for cefr_code, message in (
    (1, "The % of A1s is: "),
    (2, "The % of A2s is: "),
    (3, "The % of B1s is: "),
    (4, "The % of B2s is: "),
    (5, "The % of C1s is: "),
    (6, "The % of C2s is: "),
):
    print(message, sub_counts[cefr_code] / len(sub))
print(f"There are {len(sub)} rows")

The % of A1s is:  0.2645733622247445
The % of A2s is:  0.2645733622247445
The % of B1s is:  0.2645733622247445
The % of B2s is:  0.16226019731881355
The % of C1s is:  0.03888699277979295
The % of C2s is:  0.005132723227160043
There are 377967 rows


In [40]:
# Write the subset to CSV; index=False leaves the row index out of the file.
sub.to_csv('efcamdat_sub.csv', index=False)