In [22]:
import pandas as pd

# Load the CSV
df = pd.read_csv("cefr_augmented.csv")

# Check the first few rows
print(df.head())


                                                text CEFR
0  M. Meier Müllergasse 1 Stadt X Internationale ...   B2
1  Müller Julia Bahnhofsstr. 1 A Stadt X Armenien...   B2
2  Michael Meier 1 Zentralplatz 1234. Stadt X Aup...   B2
3  Eva Meier Schmidt Müllergasse 12 Stadt X Kroat...   B2
4  Abs. Frau EVA SCHMIDT BAHNHOFSTR, , 1234 STADT...   B1


In [24]:
import pandas as pd
from googletrans import Translator
import time

# Load your cleaned CSV
df = pd.read_csv("merlin_meta_german_clean.csv")

# Only minority classes
minority_df = df[df['CEFR'].isin(['A1', 'C1', 'C2'])]

translator = Translator()
augmented_texts = []

# Number of augmented versions per text
n_augment = 2  # You can change to 1,2,3

for idx, row in minority_df.iterrows():
    text = row['text']
    label = row['CEFR']
    
    for i in range(n_augment):
        try:
            # German -> English
            en_text = translator.translate(text, src='de', dest='en').text
            # English -> German
            de_text = translator.translate(en_text, src='en', dest='de').text
            augmented_texts.append({'text': de_text, 'CEFR': label})
            time.sleep(1)  # avoid hitting Google too fast
        except Exception as e:
            print(f"Translation failed for index {idx}: {e}")

# Create DataFrame of augmented texts
aug_df = pd.DataFrame(augmented_texts)

# Combine with original data
df_augmented = pd.concat([df, aug_df], ignore_index=True)

# Save to CSV
df_augmented.to_csv("cefr_augmented.csv", index=False, encoding='utf-8')
print("Augmentation done! Total rows:", len(df_augmented))


Augmentation done! Total rows: 1239


In [283]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset
df = pd.read_csv("cefr_augmented.csv", encoding='latin1')
df['text'] = df['text'].str.replace('Ã¼', 'ü')
df['text'] = df['text'].str.replace('Ã¤', 'ä')
df['text'] = df['text'].str.replace('Ã¶', 'ö')
df['text'] = df['text'].str.replace('ÃŸ', 'ß')

# Merge rare class (assuming C2 is sparse)
df['CEFR'] = df['CEFR'].replace({'C2': 'C1'})

# Features and labels
X = df['text']
y = df['CEFR']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorizer with fixes for German
vectorizer = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 3),
    sublinear_tf=True,
    lowercase=False,  # Preserve case for German nouns
    strip_accents=None  # Critical: Keep umlauts and accents intact
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Random Forest model (alternative to Logistic; tune n_estimators if needed)
model = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42
)

# Cross-validation for better accuracy estimate
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train_vec, y_train, cv=cv, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Fit on full train set
model.fit(X_train_vec, y_train)

# Predictions
y_pred = model.predict(X_test_vec)

# Evaluation
print("Test Set Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=['A1', 'A2', 'B1', 'B2', 'C1']))

Cross-Validation Accuracy Scores: [0.73267327 0.74752475 0.75124378 0.74129353 0.73134328]
Mean CV Accuracy: 0.7408157233633811
Test Set Accuracy: 0.6984126984126984

Classification Report:
               precision    recall  f1-score   support

          A1       0.71      0.69      0.70        35
          A2       0.63      0.75      0.68        63
          B1       0.71      0.54      0.61        67
          B2       0.68      0.83      0.75        59
          C1       1.00      0.71      0.83        28

    accuracy                           0.70       252
   macro avg       0.74      0.70      0.71       252
weighted avg       0.71      0.70      0.70       252


Confusion Matrix:
 [[24 11  0  0  0]
 [ 9 47  6  1  0]
 [ 0 17 36 14  0]
 [ 1  0  9 49  0]
 [ 0  0  0  8 20]]


In [285]:
vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1,3))
X_train_vec = vectorizer.fit_transform(X_train)
...
model.fit(X_train_vec, y_train)


In [287]:
def predict_cefr(text):
    # Transform the text using the previously fitted vectorizer
    text_vec = vectorizer.transform([text])
    
    # Predict CEFR level
    prediction = model.predict(text_vec)[0]
    
    return prediction


In [289]:
sample_text = """
Er hat sich stundenlang auf die Präsentation vorbereitet.
"""

print("Predicted CEFR level:", predict_cefr(sample_text))


Predicted CEFR level: A1


In [291]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Load data (same as above)
df = pd.read_csv("cefr_augmented.csv", encoding='latin1')
# ... (cleaning and merging as before)

# Prepare dataset
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['CEFR'])
train_dataset = Dataset.from_pandas(train_df[['text', 'CEFR']])
test_dataset = Dataset.from_pandas(test_df[['text', 'CEFR']])

# Label mapping
labels = sorted(df['CEFR'].unique())
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in id2label.items()}

# Tokenize
tokenizer = AutoTokenizer.from_pretrained("deepset/gelectra-base")

def preprocess(examples):
    examples['label'] = [label2id[l] for l in examples['CEFR']]
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=512)

train_dataset = train_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

# Model
model = AutoModelForSequenceClassification.from_pretrained("deepset/gelectra-base", num_labels=len(labels), id2label=id2label, label2id=label2id)

# Training
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Evaluate
results = trainer.evaluate()
print(results)
preds = trainer.predict(test_dataset).predictions.argmax(-1)
print(classification_report(test_dataset['label'], preds, target_names=labels))

ModuleNotFoundError: No module named 'datasets'

In [293]:
pip install xgboost textstat

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting textstat
  Downloading textstat-0.7.11-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/72.0 MB 2.8 MB/s eta 0:00:26
   - -------------------------------------- 1.8/72.0 MB 5.3 MB/s eta 0:00:14
   - -------------------------------------- 2.4/72.0 MB 3.4 MB/s eta 0:00:21
   - -------------------------------------- 2.4/72.0 MB 3.4 MB/s eta 0:00:21
   - -------------------------------------- 3.1/72.0 MB 2.9 MB/s eta 0:00:24
   - -------------------------------------- 3.4/72.0 MB 2.5 MB/s eta 0:00:28
   -

In [295]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import textstat  # For readability scores

# Custom transformer for linguistic features
class LinguisticFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        features = []
        for text in X:
            words = text.split()
            num_words = len(words)
            num_unique_words = len(set(words))
            avg_word_len = np.mean([len(w) for w in words]) if num_words > 0 else 0
            ttr = num_unique_words / num_words if num_words > 0 else 0
            sent_len = len(text)  # Char length as proxy for sentence complexity
            flesch = textstat.flesch_reading_ease(text)  # Adaptable to German
            features.append([num_words, num_unique_words, avg_word_len, ttr, sent_len, flesch])
        return np.array(features)

# Load dataset
df = pd.read_csv("cefr_augmented.csv", encoding='latin1')
df['text'] = df['text'].str.replace('Ã¼', 'ü')
df['text'] = df['text'].str.replace('Ã¤', 'ä')
df['text'] = df['text'].str.replace('Ã¶', 'ö')
df['text'] = df['text'].str.replace('ÃŸ', 'ß')

# Merge rare class
df['CEFR'] = df['CEFR'].replace({'C2': 'C1'})

# Features and labels
X = df[['text']]  # Use DF for column transformer
y = df['CEFR']

# Encode labels for XGBoost (needs numeric)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Pipeline: TF-IDF on text + linguistic features
preprocessor = make_column_transformer(
    (TfidfVectorizer(max_features=15000, ngram_range=(1, 3), sublinear_tf=True, lowercase=False, strip_accents=None), 'text'),
    (LinguisticFeatures(), 'text')
)

# XGBoost model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=200, random_state=42, eval_metric='mlogloss', scale_pos_weight=1))  # Handles multi-class
])

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Fit on full train
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_pred_labels = le.inverse_transform(y_pred)
y_test_labels = le.inverse_transform(y_test)

# Evaluation
print("Test Set Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
print("\nClassification Report:\n", classification_report(y_test_labels, y_pred_labels))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_labels, y_pred_labels, labels=le.classes_))

Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Cross-Validation Accuracy Scores: [0.7029703  0.72772277 0.68656716 0.70646766 0.70149254]
Mean CV Accuracy: 0.705044086498202


Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Test Set Accuracy: 0.7341269841269841

Classification Report:
               precision    recall  f1-score   support

          A1       0.82      0.77      0.79        35
          A2       0.76      0.76      0.76        63
          B1       0.67      0.72      0.69        67
          B2       0.67      0.75      0.70        59
          C1       1.00      0.64      0.78        28

    accuracy                           0.73       252
   macro avg       0.78      0.73      0.75       252
weighted avg       0.75      0.73      0.74       252


Confusion Matrix:
 [[27  8  0  0  0]
 [ 5 48 10  0  0]
 [ 0  7 48 12  0]
 [ 1  0 14 44  0]
 [ 0  0  0 10 18]]


In [297]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [299]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  # For SMOTE integration
import textstat

# Custom transformer for linguistic features (enhanced with syllable proxy)
class LinguisticFeatures(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        features = []
        for text in X:
            words = [w.strip('.,!?;:"()') for w in text.split() if w.isalpha()]  # Clean words
            num_words = len(words)
            num_unique_words = len(set(words))
            avg_word_len = np.mean([len(w) for w in words]) if num_words > 0 else 0
            ttr = num_unique_words / num_words if num_words > 0 else 0
            sent_len = len(text)
            flesch = textstat.flesch_reading_ease(text)
            
            # Simple German syllable proxy: count vowels + adjustments (ä/ö/ü/ei count extra)
            def syllable_count(word):
                vowels = 'aeiouäöüAEIOUÄÖÜ'
                count = sum(1 for char in word if char in vowels)
                if any(dip in word.lower() for dip in ['ei', 'ai', 'au', 'eu', 'äu']):
                    count += 1  # Diphthongs
                return max(1, count)
            
            avg_syllables = np.mean([syllable_count(w) for w in words]) if num_words > 0 else 0
            features.append([num_words, num_unique_words, avg_word_len, ttr, sent_len, flesch, avg_syllables])
        return np.array(features)

# Load dataset
df = pd.read_csv("cefr_augmented.csv", encoding='latin1')
df['text'] = df['text'].str.replace('Ã¼', 'ü')
df['text'] = df['text'].str.replace('Ã¤', 'ä')
df['text'] = df['text'].str.replace('Ã¶', 'ö')
df['text'] = df['text'].str.replace('ÃŸ', 'ß')

# Merge rare class
df['CEFR'] = df['CEFR'].replace({'C2': 'C1'})

# Ordinal encoding for ordered loss (A1=0, A2=1, B1=2, B2=3, C1=4)
ordinal_map = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4}
df['CEFR_ordinal'] = df['CEFR'].map(ordinal_map)
y = df['CEFR']
y_ordinal = df['CEFR_ordinal']

# Features
X = df[['text']]

# Encode categorical y for reports
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Train-test split (use ordinal for training, categorical for eval)
X_train, X_test, y_train_ord, y_test_ord = train_test_split(
    X, y_ordinal, test_size=0.2, random_state=42, stratify=y_ordinal
)
y_train_cat = le.transform([le.classes_[i] for i in y_train_ord])
y_test_cat = le.transform([le.classes_[i] for i in y_test_ord])

# Preprocessor
preprocessor = make_column_transformer(
    (TfidfVectorizer(max_features=15000, ngram_range=(1, 3), sublinear_tf=True, lowercase=False, strip_accents=None), 'text'),
    (LinguisticFeatures(), 'text'),
    remainder='drop'
)

# Pipeline with SMOTE + XGBoost (treat as multi-class on ordinal labels)
model = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42, k_neighbors=3)),  # Oversample minorities
    ('classifier', XGBClassifier(random_state=42, eval_metric='mlogloss', objective='multi:softprob'))
])

# Basic tuning
param_grid = {
    'classifier__n_estimators': [150, 200],
    'classifier__max_depth': [4, 6],
    'classifier__learning_rate': [0.05, 0.1]
}
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train_ord)

print("Best Params:", grid_search.best_params_)
model = grid_search.best_estimator_

# Cross-validation on tuned model
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train_ord, cv=cv, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Fit and predict (threshold ordinal preds to nearest class)
y_pred_ord = model.predict(X_test)
y_pred_cat = le.transform([le.classes_[np.round(p).astype(int)] for p in y_pred_ord])  # Round to nearest level

# Evaluation
print("Test Set Accuracy:", accuracy_score(y_test_cat, y_pred_cat))
print("\nClassification Report:\n", classification_report(y_test_cat, y_pred_cat, target_names=le.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_cat, y_pred_cat, labels=range(len(le.classes_))))

Best Params: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 6, 'classifier__n_estimators': 200}
Cross-Validation Accuracy Scores: [0.72277228 0.73762376 0.71144279 0.72139303 0.71144279]
Mean CV Accuracy: 0.7209349293138269
Test Set Accuracy: 0.7261904761904762

Classification Report:
               precision    recall  f1-score   support

          A1       0.79      0.74      0.76        35
          A2       0.74      0.76      0.75        63
          B1       0.67      0.69      0.68        67
          B2       0.67      0.76      0.71        59
          C1       1.00      0.64      0.78        28

    accuracy                           0.73       252
   macro avg       0.77      0.72      0.74       252
weighted avg       0.74      0.73      0.73       252


Confusion Matrix:
 [[26  9  0  0  0]
 [ 5 48 10  0  0]
 [ 1  8 46 12  0]
 [ 1  0 13 45  0]
 [ 0  0  0 10 18]]


In [303]:
!pip install datasets

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.19 (from datasets)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
Downloading multiprocess-0.70.18-py312-none-any.whl (150 kB)
Downloading dill-0.4.0-py3-none-any.whl (119 kB)
Downloading pyarrow-22.0.0-cp312-cp312-win_amd64.whl (28.0 MB)
   ---------------------------------------- 0.0/28.0 MB ? eta -:--:--
    --------------------------------------- 0.5/28.0 MB 2.1 MB/s eta 0:00:14
   - -------------------------------------- 1.0/28.0 MB 2.5 MB/s eta 0:00:11
   - -----------------

  You can safely remove it manually.


In [7]:
pip install --upgrade pip


Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8

In [9]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install transformers pandas scikit-learn tqdm




In [15]:
# --------------------------------------------------------------
# Install (run once in Anaconda Prompt/Jupyter)
# --------------------------------------------------------------
# pip install datasets  # For HF datasets
# conda install pyarrow -c conda-forge  # If needed
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118  # or cpuonly
# pip install transformers pandas scikit-learn tqdm

# --------------------------------------------------------------
# COMPLETE WORKING CODE – copy-paste and run
# --------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
from torch.utils.data import Dataset as TorchDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
from tqdm.auto import tqdm
from datasets import load_dataset  # Reliable HF loader
import os

# ------------------- 1. Load your original data -------------------
print("Loading your cefr_augmented.csv...")
df_user = pd.read_csv("cefr_augmented.csv", encoding='latin1')
df_user['text'] = df_user['text'].str.replace('Ã¼', 'ü').str.replace('Ã¤', 'ä')\
                                 .str.replace('Ã¶', 'ö').str.replace('ÃŸ', 'ß')
df_user['CEFR'] = df_user['CEFR'].replace({'C2': 'C1'})

# ------------------- 2. Load public datasets via HF library -------------------
print("Loading public German CEFR datasets...")

# Dataset 1: UniversalCEFR/elg_cefr_de (~60 samples)
ds1 = load_dataset("UniversalCEFR/elg_cefr_de")
df1 = ds1['train'].to_pandas() if 'train' in ds1 else ds1.to_pandas()
df1 = df1[['text', 'cefr_level']].rename(columns={'cefr_level': 'CEFR'})
df1['CEFR'] = df1['CEFR'].replace({'C2': 'C1'})

# Dataset 2: EliasAhl/german-cefr (~500 samples)
ds2 = load_dataset("EliasAhl/german-cefr")
df2 = ds2['train'].to_pandas() if 'train' in ds2 else ds2.to_pandas()
df2 = df2[['text', 'cefrLevel']].rename(columns={'cefrLevel': 'CEFR'})
df2['CEFR'] = df2['CEFR'].replace({'C2': 'C1'})

# ------------------- 3. Merge everything -------------------
df_all = pd.concat([df_user[['text', 'CEFR']], df1, df2], ignore_index=True)
df_all.drop_duplicates(subset='text', inplace=True)
df_all.dropna(subset=['text', 'CEFR'], inplace=True)

df_all.to_csv("cefr_final_merged.csv", index=False)
print(f"\nSUCCESS! Merged dataset created: {len(df_all)} sentences")
print("CEFR distribution:")
print(df_all['CEFR'].value_counts().sort_index())

# ------------------- 4. Prepare labels & split -------------------
labels = ['A1', 'A2', 'B1', 'B2', 'C1']
label2id = {l: i for i, l in enumerate(labels)}
df_all['label'] = df_all['CEFR'].map(label2id)

train_df, test_df = train_test_split(df_all, test_size=0.2, random_state=42, stratify=df_all['label'])

# ------------------- 5. Tokenizer + Dataset -------------------
tokenizer = AutoTokenizer.from_pretrained("deepset/gelectra-base")

class CEFRDataset(TorchDataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=512)
        self.labels = labels.tolist()
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self): return len(self.labels)

train_dataset = CEFRDataset(train_df['text'], train_df['label'])
test_dataset  = CEFRDataset(test_df['text'], test_df['label'])
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=16, shuffle=False)

# ------------------- 6. Model -------------------
model = AutoModelForSequenceClassification.from_pretrained(
    "deepset/gelectra-base", num_labels=5, id2label={i: l for i, l in enumerate(labels)}, label2id=label2id
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=3e-5)
num_epochs = 4
scheduler = LinearLR(optimizer, total_iters=num_epochs * len(train_loader))

# ------------------- 7. Training -------------------
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1} – avg loss: {epoch_loss/len(train_loader):.4f}")

# ------------------- 8. Evaluation -------------------
model.eval()
preds, trues = [], []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds.extend(torch.argmax(outputs.logits, dim=-1).cpu().numpy())
        trues.extend(batch['labels'].cpu().numpy())

print(f"\nFINAL TEST ACCURACY: {accuracy_score(trues, preds):.4f}")
print("\nClassification Report:\n", classification_report(trues, preds, target_names=labels))
print("\nConfusion Matrix:\n", confusion_matrix(trues, preds))

# ------------------- 9. Save model -------------------
os.makedirs("cefr_german_model", exist_ok=True)
model.save_pretrained("cefr_german_model")
tokenizer.save_pretrained("cefr_german_model")
print("Model saved → ./cefr_german_model")

# ------------------- 10. Predict function -------------------
def predict_cefr(sentence):
    model.eval()
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    return labels[torch.argmax(logits, dim=-1).item()]

# Test
print("\nExamples:")
print(predict_cefr("Hallo, wie geht es dir?"))                                 # → A1/A2
print(predict_cefr("Die Auswirkungen des Klimawandels sind komplex und global."))  # → C1

AttributeError: module 'httpx' has no attribute 'RequestError'

In [17]:
pip install httpx==0.27.0 httpcore==1.0.5

Collecting httpx==0.27.0
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.0.5
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.0.5)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
Downloading h11-0.14.0-py3-none-any.whl (58 kB)
Installing collected packages: h11, httpcore, httpx

  Attempting uninstall: h11

    Found existing installation: h11 0.9.0

    Uninstalling h11-0.9.0:

      Successfully uninstalled h11-0.9.0

   ---------------------------------------- 0/3 [h11]
   ---------------------------------------- 0/3 [h11]
   ---------------------------------------- 0/3 [h11]
  Attempting uninstall: httpcore
   ---------------------------------------- 0/3 [h11]
    Found existing installation: httpcore 0.9.1
   ---------------------------------------- 0/3 [h11]
   ------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
googletrans 4.0.0rc1 requires httpx==0.13.3, but you have httpx 0.27.0 which is incompatible.


In [19]:
pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import os

print("Loading data...")
df_user = pd.read_csv("cefr_augmented.csv", encoding='latin1')
df_user['text'] = df_user['text'].str.replace('Ã¼', 'ü').str.replace('Ã¤', 'ä')\
                                 .str.replace('Ã¶', 'ö').str.replace('ÃŸ', 'ß')
df_user['CEFR'] = df_user['CEFR'].replace({'C2': 'C1'})

from datasets import load_dataset
df1 = load_dataset("UniversalCEFR/elg_cefr_de", split="train").to_pandas()[['text','cefr_level']].rename(columns={'cefr_level':'CEFR'})
df2 = load_dataset("EliasAhl/german-cefr", split="train").to_pandas()[['text','cefrLevel']].rename(columns={'cefrLevel':'CEFR'})

for df in [df1, df2]:
    df['CEFR'] = df['CEFR'].replace({'C2': 'C1'})

df = pd.concat([df_user[['text','CEFR']], df1, df2]).drop_duplicates('text').dropna()
df.to_csv("cefr_final_merged.csv", index=False)
print(f"Ready! Total {len(df)} sentences")

labels = ['A1','A2','B1','B2','C1']
df['label'] = df['CEFR'].map({l:i for i,l in enumerate(labels)})

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,3), lowercase=False)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])
y_train = train_df['label']
y_test = test_df['label']

print("Training model (takes 20-40 seconds)...")
model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1, class_weight='balanced')
model.fit(X_train, y_train)

pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f"\nFINAL TEST ACCURACY: {acc:.4f}")
print(classification_report(y_test, pred, target_names=labels))

os.makedirs("cefr_german_model", exist_ok=True)
import joblib
joblib.dump(model, "cefr_german_model/model.pkl")
joblib.dump(vectorizer, "cefr_german_model/vectorizer.pkl")
joblib.dump(labels, "cefr_german_model/labels.pkl")
print("\nModel saved to ./cefr_german_model")
print("To predict later, run this in a new cell:")
print('''import joblib
vectorizer = joblib.load("cefr_german_model/vectorizer.pkl")
model = joblib.load("cefr_german_model/model.pkl")
labels = joblib.load("cefr_german_model/labels.pkl")
def predict(sentence):
    vec = vectorizer.transform([sentence])
    p = model.predict(vec)[0]
    return labels[p]
print(predict("Hallo, wie geht es dir?"))''')

Loading data...
Ready! Total 2273 sentences
Training model (takes 20-40 seconds)...

FINAL TEST ACCURACY: 0.7363
              precision    recall  f1-score   support

          A1       0.92      0.47      0.62        51
          A2       0.72      0.81      0.76       102
          B1       0.75      0.64      0.69       105
          B2       0.76      0.73      0.74       102
          C1       0.69      0.92      0.79        95

    accuracy                           0.74       455
   macro avg       0.77      0.71      0.72       455
weighted avg       0.75      0.74      0.73       455


Model saved to ./cefr_german_model
To predict later, run this in a new cell:
import joblib
vectorizer = joblib.load("cefr_german_model/vectorizer.pkl")
model = joblib.load("cefr_german_model/model.pkl")
labels = joblib.load("cefr_german_model/labels.pkl")
def predict(sentence):
    vec = vectorizer.transform([sentence])
    p = model.predict(vec)[0]
    return labels[p]
print(predict("Hallo, wi

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import joblib

# Load your data + public data
df_user = pd.read_csv("cefr_augmented.csv", encoding='latin1')
df_user['text'] = df_user['text'].str.replace('Ã¼', 'ü').str.replace('Ã¤', 'ä')\
                                 .str.replace('Ã¶', 'ö').str.replace('ÃŸ', 'ß')
df_user['CEFR'] = df_user['CEFR'].replace({'C2': 'C1'})

df1 = load_dataset("UniversalCEFR/elg_cefr_de", split="train").to_pandas()[['text','cefr_level']].rename(columns={'cefr_level':'CEFR'})
df2 = load_dataset("EliasAhl/german-cefr", split="train").to_pandas()[['text','cefrLevel']].rename(columns={'cefrLevel':'CEFR'})

for df in [df1, df2]: 
    df['CEFR'] = df['CEFR'].replace({'C2': 'C1'})

df = pd.concat([df_user[['text','CEFR']], df1, df2]).drop_duplicates('text').dropna()

labels = ['A1','A2','B1','B2','C1']
df['label'] = df['CEFR'].map({l:i for i,l in enumerate(labels)})

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
print("Data ready:", len(df), "sentences")

Data ready: 2273 sentences


In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# This vectorizer is the one that actually works for CEFR
vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3,5),      # 3–5 character n-grams = gold for CEFR
    lowercase=False,
    max_features=None       # use all features
)

X_train = vectorizer.fit_transform(train_df['text'])
X_test  = vectorizer.transform(test_df['text'])
y_train = train_df['label']
y_test  = test_df['label']

print("Training LinearSVC...")
model = LinearSVC(C=1.0, class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

pred = model.predict(X_test)
print(f"\nFINAL ACCURACY: {accuracy_score(y_test, pred):.4f}")
print(classification_report(y_test, pred, target_names=labels))

# Save the working model
import os
os.makedirs("cefr_german_model", exist_ok=True)
joblib.dump(model, "cefr_german_model/model.pkl")
joblib.dump(vectorizer, "cefr_german_model/vectorizer.pkl")
joblib.dump(labels, "cefr_german_model/labels.pkl")
print("Perfect model saved!")

Training LinearSVC...

FINAL ACCURACY: 0.7912
              precision    recall  f1-score   support

          A1       0.85      0.80      0.83        51
          A2       0.77      0.83      0.80       102
          B1       0.74      0.67      0.70       105
          B2       0.76      0.75      0.75       102
          C1       0.85      0.93      0.89        95

    accuracy                           0.79       455
   macro avg       0.80      0.80      0.80       455
weighted avg       0.79      0.79      0.79       455

Perfect model saved!


In [68]:
import joblib

# Load your perfect 79.1% model
vectorizer = joblib.load("cefr_german_model/vectorizer.pkl")
model      = joblib.load("cefr_german_model/model.pkl")
labels     = ['A1', 'A2', 'B1', 'B2', 'C1']

def cefr(sentence):
    return labels[model.predict(vectorizer.transform([sentence]))[0]]

# One sentence → one prediction → done
sentence = input("Enter a German sentence: ").strip()

if not sentence:
    print("No input — goodbye!")
else:
    level = cefr(sentence)
    print(f"\nCEFR Level: {level}")

Enter a German sentence:  Ich denke, dass wir mehr für den Umweltschutz tun sollten.



CEFR Level: C1


In [60]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read directly from the merged CSV you already created
print("Loading data from cefr_final_merged.csv ...")
df = pd.read_csv("cefr_final_merged.csv")

# Make sure CEFR column is clean (just in case)
df['CEFR'] = df['CEFR'].replace({'C2': 'C1'}).str.strip()

# Create numeric labels
labels = ['A1', 'A2', 'B1', 'B2', 'C1']
df['label'] = df['CEFR'].map({level: idx for idx, level in enumerate(labels)})

# Train-test split
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)

print(f"Data ready: {len(df)} sentences loaded from cefr_final_merged.csv")
print("CEFR distribution:")
print(df['CEFR'].value_counts().sort_index())

Loading data from cefr_final_merged.csv ...
Data ready: 2273 sentences loaded from cefr_final_merged.csv
CEFR distribution:
CEFR
A1    254
A2    507
B1    524
B2    511
C1    477
Name: count, dtype: int64


In [70]:
# ======================================================
# HOW I CREATED cefr_final_merged.csv
# (Run this cell to show sir — it creates the CSV file)
# ======================================================

import pandas as pd
from datasets import load_dataset

print("Step 1: Loading my original data...")
df_my = pd.read_csv("cefr_augmented.csv", encoding='latin1')

# Fix encoding issues
df_my['text'] = df_my['text'].str.replace('Ã¼', 'ü').str.replace('Ã¤', 'ä')\
                             .str.replace('Ã¶', 'ö').str.replace('ÃŸ', 'ß')
df_my['CEFR'] = df_my['CEFR'].replace({'C2': 'C1'})

print(f"→ My data: {len(df_my)} sentences")

print("\nStep 2: Adding public German CEFR datasets from Hugging Face...")

# Public dataset 1
df1 = load_dataset("UniversalCEFR/elg_cefr_de", split="train").to_pandas()
df1 = df1[['text', 'cefr_level']].rename(columns={'cefr_level': 'CEFR'})
print(f"→ UniversalCEFR/elg_cefr_de: {len(df1)} sentences")

# Public dataset 2
df2 = load_dataset("EliasAhl/german-cefr", split="train").to_pandas()
df2 = df2[['text', 'cefrLevel']].rename(columns={'cefrLevel': 'CEFR'})
print(f"→ EliasAhl/german-cefr: {len(df2)} sentences")

# Clean C2 → C1 in both
for df in [df1, df2]:
    df['CEFR'] = df['CEFR'].replace({'C2': 'C1'})

print("\nStep 3: Merging all data and removing duplicates...")
final_df = pd.concat([df_my[['text', 'CEFR']], df1, df2], ignore_index=True)
final_df = final_df.drop_duplicates(subset='text').dropna()  # remove empty rows

print(f"→ Final dataset: {len(final_df)} unique sentences")

print("\nStep 4: Saving as cefr_final_merged.csv")
final_df.to_csv("cefr_final_merged.csv", index=False)

print("cefr_final_merged.csv has been created!")
print("You can now open it and see all 2273 sentences with CEFR levels")

# Show first few rows
print("\nFirst 10 rows:")
print(final_df.head(10))

Step 1: Loading my original data...
→ My data: 1259 sentences

Step 2: Adding public German CEFR datasets from Hugging Face...
→ UniversalCEFR/elg_cefr_de: 509 sentences
→ EliasAhl/german-cefr: 606 sentences

Step 3: Merging all data and removing duplicates...
→ Final dataset: 2273 unique sentences

Step 4: Saving as cefr_final_merged.csv
cefr_final_merged.csv has been created!
You can now open it and see all 2273 sentences with CEFR levels

First 10 rows:
                                                text CEFR
0  M. Meier Müllergasse 1 Stadt X Internationale ...   B2
1  Müller Julia Bahnhofsstr. 1 A Stadt X Armenien...   B2
2  Michael Meier 1 Zentralplatz 1234. Stadt X Aup...   B2
3  Eva Meier Schmidt Müllergasse 12 Stadt X Kroat...   B2
4  Abs. Frau EVA SCHMIDT BAHNHOFSTR, , 1234 STADT...   B1
5  Maria Schmidt BahnhofsstraÃe - 12 Stadt X . S...   B1
6  Stadt X, Internationale Au-pair Vermittlung Ba...   B2
7  Meier Katharina . 1234 Stadt X Computer-Spezia...   B2
8  Maria Schmidt 

In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

# Load the merged data you already created (fast, no internet!)
df = pd.read_csv("cefr_final_merged.csv")

labels = ['A1', 'A2', 'B1', 'B2', 'C1']
df['label'] = df['CEFR'].map({l: i for i, l in enumerate(labels)})

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Best model for German CEFR
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), lowercase=False)
X_train = vectorizer.fit_transform(train_df['text'])
X_test  = vectorizer.transform(test_df['text'])
y_train = train_df['label']
y_test  = test_df['label']

model = LinearSVC(C=1.0, class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

print(f"ACCURACY: {accuracy_score(y_test, model.predict(X_test)):.4f}")

# Save (overwrite old weak model)
os.makedirs("cefr_german_model", exist_ok=True)
joblib.dump(model,      "cefr_german_model/model.pkl")
joblib.dump(vectorizer, "cefr_german_model/vectorizer.pkl")
joblib.dump(labels,     "cefr_german_model/labels.pkl")
print("Model saved!")

ACCURACY: 0.7912
Model saved!
