In [None]:
import pandas as pd
import csv

In [None]:
!pip install nltk spacy textstat scikit-learn
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
import pandas as pd
import nltk
import spacy
import textstat
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm import tqdm

tqdm.pandas()

# Download NLTK data
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Load Spacy model
nlp = spacy.load('en_core_web_sm')

# Define feature extraction functions
def word_count(text):
    return len(nltk.word_tokenize(text))

def syllable_count(text):
    return textstat.syllable_count(text)

def character_count(text):
    return len(text)

def complex_word_count(text):
    return textstat.lexicon_count(text, removepunct=True) - textstat.difficult_words(text)

def vocab_size(text):
    return len(set(nltk.word_tokenize(text)))

def lexical_diversity(text):
    words = nltk.word_tokenize(text)
    return len(set(words)) / len(words)

def noun_chunks(text):
    doc = nlp(text)
    return len(list(doc.noun_chunks))

def flesch_kincaid_score(text):
    return textstat.flesch_kincaid_grade(text)

def dale_chall_score(text):
    return textstat.dale_chall_readability_score(text)

def gunning_fog_index(text):
    return textstat.gunning_fog(text)

def coleman_liau_index(text):
    return textstat.coleman_liau_index(text)

def automated_readability_index(text):
    return textstat.automated_readability_index(text)

# create df with features
def extract_features(df, text_column):
    features = pd.DataFrame()
    features['Word Count'] = df[text_column].progress_apply(word_count)
    features['Syllable Count'] = df[text_column].progress_apply(syllable_count)
    features['Character Count'] = df[text_column].progress_apply(character_count)
    features['Complex Word Count'] = df[text_column].progress_apply(complex_word_count)
    features['Vocab Size'] = df[text_column].progress_apply(vocab_size)
    features['Lexical Diversity'] = df[text_column].progress_apply(lexical_diversity)
    features['Noun Chunks'] = df[text_column].progress_apply(noun_chunks)
    features['Flesch Kincaid Score'] = df[text_column].progress_apply(flesch_kincaid_score)
    features['Dale Chall Score'] = df[text_column].progress_apply(dale_chall_score)
    features['Gunning Fog Index'] = df[text_column].progress_apply(gunning_fog_index)
    features['Coleman Liau Index'] = df[text_column].progress_apply(coleman_liau_index)
    features['Automated Readability Index'] = df[text_column].progress_apply(automated_readability_index)
    return features


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielskahill/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/danielskahill/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## EFCamDat 5 Class Baseline

In [9]:
# Assuming 'balanced_data' is your dataframe and 'text' is the column with text data
from sklearn.impute import SimpleImputer

balanced_data = pd.read_csv('../efcamdat/efcamdat2.csv')

#balanced_data = balanced_data.sample(n=15000, random_state=42)

print(balanced_data['cefr_numeric'].value_counts())

balanced_data['label'] = balanced_data['cefr_numeric'].apply(lambda x: x - 1)
balanced_data['label'] = balanced_data['label'].astype('category')
print(balanced_data['label'].cat.categories)

features = extract_features(balanced_data, 'text')

data_with_features = pd.concat([features, balanced_data['label'].reset_index(drop=True)], axis=1)

# Drop rows with missing values in the label column
data_with_features = data_with_features.dropna(subset=['label'])

X = data_with_features.drop(columns=['label'])
y = data_with_features['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values using imputer for features only
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Verify the lengths of the training and testing sets
print(f"X_train_imputed length: {len(X_train_imputed)}, y_train length: {len(y_train)}")
print(f"X_test_imputed length: {len(X_test_imputed)}, y_test length: {len(y_test)}")

# Train logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_imputed, y_train)

# Predict on test set
y_pred = log_reg.predict(X_test_imputed)

# Print classification report
print(classification_report(y_test, y_pred))

1    149492
2     85753
3     55033
4     22051
5      4891
Name: cefr_numeric, dtype: int64
Int64Index([0, 1, 2, 3, 4], dtype='int64')


100%|████████████████████████████████████████████████████| 317220/317220 [01:24<00:00, 3740.08it/s]
100%|███████████████████████████████████████████████████| 317220/317220 [00:16<00:00, 18851.28it/s]
100%|█████████████████████████████████████████████████| 317220/317220 [00:00<00:00, 1703157.83it/s]
100%|███████████████████████████████████████████████████| 317220/317220 [00:16<00:00, 18960.61it/s]
100%|████████████████████████████████████████████████████| 317220/317220 [01:25<00:00, 3727.21it/s]
100%|████████████████████████████████████████████████████| 317220/317220 [01:24<00:00, 3733.68it/s]
100%|█████████████████████████████████████████████████████| 317220/317220 [17:57<00:00, 294.29it/s]
100%|███████████████████████████████████████████████████| 317220/317220 [00:21<00:00, 14682.84it/s]
100%|███████████████████████████████████████████████████| 317220/317220 [00:22<00:00, 14398.97it/s]
100%|███████████████████████████████████████████████████| 317220/317220 [00:22<00:00, 14223.08it/s]


X_train_imputed length: 253776, y_train length: 253776
X_test_imputed length: 63444, y_test length: 63444
              precision    recall  f1-score   support

           0       0.79      0.88      0.83     29923
           1       0.62      0.49      0.54     17097
           2       0.66      0.74      0.70     10984
           3       0.64      0.54      0.59      4453
           4       0.43      0.29      0.35       987

    accuracy                           0.72     63444
   macro avg       0.63      0.59      0.60     63444
weighted avg       0.71      0.72      0.71     63444



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## EFCamDat 6 Class Baseline

In [10]:
# Assuming 'balanced_data' is your dataframe and 'text' is the column with text data
from sklearn.impute import SimpleImputer

balanced_data = pd.read_csv('../efcamdat_sub.csv')

#balanced_data = balanced_data.sample(n=15000, random_state=42)

print(balanced_data['cefr_numeric'].value_counts())

balanced_data['label'] = balanced_data['cefr_numeric'].apply(lambda x: x - 1)
balanced_data['label'] = balanced_data['label'].astype('category')
print(balanced_data['label'].cat.categories)

features = extract_features(balanced_data, 'text')

data_with_features = pd.concat([features, balanced_data['label'].reset_index(drop=True)], axis=1)

# Drop rows with missing values in the label column
data_with_features = data_with_features.dropna(subset=['label'])

X = data_with_features.drop(columns=['label'])
y = data_with_features['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values using imputer for features only
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Verify the lengths of the training and testing sets
print(f"X_train_imputed length: {len(X_train_imputed)}, y_train length: {len(y_train)}")
print(f"X_test_imputed length: {len(X_test_imputed)}, y_test length: {len(y_test)}")

# Train logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_imputed, y_train)

# Predict on test set
y_pred = log_reg.predict(X_test_imputed)

# Print classification report
print(classification_report(y_test, y_pred))

3    100000
2    100000
1    100000
4     61329
5     14698
6      1940
Name: cefr_numeric, dtype: int64
Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')


100%|████████████████████████████████████████████████████| 377967/377967 [02:15<00:00, 2796.12it/s]
100%|███████████████████████████████████████████████████| 377967/377967 [00:24<00:00, 15677.85it/s]
100%|█████████████████████████████████████████████████| 377967/377967 [00:00<00:00, 1717315.08it/s]
100%|███████████████████████████████████████████████████| 377967/377967 [00:31<00:00, 12048.48it/s]
100%|████████████████████████████████████████████████████| 377967/377967 [02:16<00:00, 2771.34it/s]
100%|████████████████████████████████████████████████████| 377967/377967 [02:16<00:00, 2769.93it/s]
100%|█████████████████████████████████████████████████████| 377967/377967 [26:55<00:00, 233.92it/s]
100%|███████████████████████████████████████████████████| 377967/377967 [00:35<00:00, 10555.46it/s]
100%|████████████████████████████████████████████████████| 377967/377967 [00:40<00:00, 9308.81it/s]
100%|████████████████████████████████████████████████████| 377967/377967 [00:40<00:00, 9290.81it/s]


X_train_imputed length: 302373, y_train length: 302373
X_test_imputed length: 75594, y_test length: 75594
              precision    recall  f1-score   support

           0       0.76      0.76      0.76     19928
           1       0.59      0.59      0.59     20008
           2       0.59      0.68      0.63     20055
           3       0.59      0.55      0.57     12308
           4       0.44      0.17      0.24      2929
           5       0.00      0.00      0.00       366

    accuracy                           0.63     75594
   macro avg       0.50      0.46      0.46     75594
weighted avg       0.63      0.63      0.63     75594



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## OneStopEnglish Baseline

In [12]:
# Assuming 'balanced_data' is your dataframe and 'text' is the column with text data
from sklearn.impute import SimpleImputer

balanced_data = pd.read_csv('../onestopec.csv')

#balanced_data = balanced_data.sample(n=15000, random_state=42)

print(balanced_data['labels'].value_counts())

balanced_data['label'] = balanced_data['labels'].apply(lambda x: x - 1)
balanced_data['label'] = balanced_data['labels'].astype('category')
print(balanced_data['label'].cat.categories)
balanced_data['text'] = balanced_data['text'].apply(lambda x: str(x))

features = extract_features(balanced_data, 'text')

data_with_features = pd.concat([features, balanced_data['label'].reset_index(drop=True)], axis=1)

# Drop rows with missing values in the label column
data_with_features = data_with_features.dropna(subset=['label'])

X = data_with_features.drop(columns=['label'])
y = data_with_features['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values using imputer for features only
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Verify the lengths of the training and testing sets
print(f"X_train_imputed length: {len(X_train_imputed)}, y_train length: {len(y_train)}")
print(f"X_test_imputed length: {len(X_test_imputed)}, y_test length: {len(y_test)}")

# Train logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_imputed, y_train)

# Predict on test set
y_pred = log_reg.predict(X_test_imputed)

# Print classification report
print(classification_report(y_test, y_pred))

3    2651
2    2595
1    2151
Name: labels, dtype: int64
Int64Index([1, 2, 3], dtype='int64')


100%|████████████████████████████████████████████████████████| 7397/7397 [00:01<00:00, 5228.13it/s]
100%|███████████████████████████████████████████████████████| 7397/7397 [00:00<00:00, 15713.02it/s]
100%|█████████████████████████████████████████████████████| 7397/7397 [00:00<00:00, 1682406.96it/s]
100%|███████████████████████████████████████████████████████| 7397/7397 [00:00<00:00, 15675.66it/s]
100%|████████████████████████████████████████████████████████| 7397/7397 [00:01<00:00, 5774.22it/s]
100%|████████████████████████████████████████████████████████| 7397/7397 [00:01<00:00, 5821.11it/s]
100%|█████████████████████████████████████████████████████████| 7397/7397 [00:21<00:00, 336.26it/s]
100%|███████████████████████████████████████████████████████| 7397/7397 [00:00<00:00, 15885.01it/s]
100%|███████████████████████████████████████████████████████| 7397/7397 [00:00<00:00, 13099.43it/s]
100%|███████████████████████████████████████████████████████| 7397/7397 [00:00<00:00, 13196.51it/s]


X_train_imputed length: 5917, y_train length: 5917
X_test_imputed length: 1480, y_test length: 1480
              precision    recall  f1-score   support

           1       0.50      0.50      0.50       429
           2       0.54      0.52      0.53       547
           3       0.55      0.57      0.56       504

    accuracy                           0.53      1480
   macro avg       0.53      0.53      0.53      1480
weighted avg       0.53      0.53      0.53      1480



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
