In [9]:
#Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import re
import joblib

In [3]:
#Load Dataset
data = pd.read_csv("ESL_vocabulary.csv")

In [4]:
#Feature Engineering
#Word Length
data['word_length'] = data['word'].apply(len)

#Count Vowels
def count_vowels(word):
    vowels = 'aeiou'
    return sum(1 for letter in word.lower() if letter in vowels)

data['vowel_count'] = data['word'].apply(count_vowels)


#Count Syllables
def count_syllables(word):
  word = word.lower()
  syllables = re.findall(r'[aeiou]+', word)
  return len(syllables)

data['syllable_count'] = data['word'].apply(count_syllables)

In [7]:
#Train/Test Split

x = data[['word_length','vowel_count','syllable_count']]
y = data['cefr_level']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [10]:
#Train Model

model = RandomForestClassifier()
model.fit(x_train, y_train)

joblib.dump(model, "esl_vocab_model.pkl")

['esl_vocab_model.pkl']

In [11]:
#Evaluate Model

predictions = model.predict(x_test)
print(classification_report(y_test, predictions, zero_division=0))

              precision    recall  f1-score   support

          A1       0.80      1.00      0.89         4
          A2       0.33      0.33      0.33         3
          B1       0.40      0.33      0.36         6
          B2       0.00      0.00      0.00         2
          C1       0.50      0.20      0.29         5
          C2       0.67      0.50      0.57         4

    accuracy                           0.42        24
   macro avg       0.45      0.39      0.41        24
weighted avg       0.49      0.42      0.44        24



In [16]:
#Example Prediction

new_word ="heat"

length = len(new_word)
vowels = count_vowels(new_word)
syllables = count_syllables(new_word)

new_data = pd.DataFrame({
    'word_length' : [length],
    'vowel_count' : [vowels],
    'syllable_count' : [syllables]
})

prediction = model.predict(new_data)
print("predicted level:",prediction[0])

predicted level: A1
