In [12]:
import pandas as pd
from datasets import load_dataset

print("Step 1: Loading my original data...")
df_my = pd.read_csv("cefr_augmented.csv", encoding='latin1')

# Fix encoding issues
df_my['text'] = df_my['text'].str.replace('Ã¼', 'ü').str.replace('Ã¤', 'ä')\
                             .str.replace('Ã¶', 'ö').str.replace('ÃŸ', 'ß')
df_my['CEFR'] = df_my['CEFR'].replace({'C2': 'C1'})

print(f"→ My data: {len(df_my)} sentences")

print("\nStep 2: Adding public German CEFR datasets from Hugging Face...")

# Public dataset 1
df1 = load_dataset("UniversalCEFR/elg_cefr_de", split="train").to_pandas()
df1 = df1[['text', 'cefr_level']].rename(columns={'cefr_level': 'CEFR'})
print(f"→ UniversalCEFR/elg_cefr_de: {len(df1)} sentences")

# Public dataset 2
df2 = load_dataset("EliasAhl/german-cefr", split="train").to_pandas()
df2 = df2[['text', 'cefrLevel']].rename(columns={'cefrLevel': 'CEFR'})
print(f"→ EliasAhl/german-cefr: {len(df2)} sentences")

# Clean C2 → C1 in both
for df in [df1, df2]:
    df['CEFR'] = df['CEFR'].replace({'C2': 'C1'})

print("\nStep 3: Merging all data and removing duplicates...")
final_df = pd.concat([df_my[['text', 'CEFR']], df1, df2], ignore_index=True)
final_df = final_df.drop_duplicates(subset='text').dropna()

print(f"→ Final dataset: {len(final_df)} unique sentences")

print("\nStep 4: Saving as cefr_final_merged.csv")
final_df.to_csv("cefr_final_merged.csv", index=False)

print("cefr_final_merged.csv has been created!")
print("You can now open it and see all 2273 sentences with CEFR levels")

Step 1: Loading my original data...
→ My data: 1259 sentences

Step 2: Adding public German CEFR datasets from Hugging Face...
→ UniversalCEFR/elg_cefr_de: 509 sentences
→ EliasAhl/german-cefr: 606 sentences

Step 3: Merging all data and removing duplicates...
→ Final dataset: 2273 unique sentences

Step 4: Saving as cefr_final_merged.csv
cefr_final_merged.csv has been created!
You can now open it and see all 2273 sentences with CEFR levels


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import joblib
import os

# Load the merged data you already created (fast, no internet!)
df = pd.read_csv("cefr_final_merged.csv")

labels = ['A1', 'A2', 'B1', 'B2', 'C1']
df['label'] = df['CEFR'].map({l: i for i, l in enumerate(labels)})

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Best model for German CEFR
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), lowercase=False)
X_train = vectorizer.fit_transform(train_df['text'])
X_test  = vectorizer.transform(test_df['text'])
y_train = train_df['label']
y_test  = test_df['label']

model = LinearSVC(C=1.0, class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

print(f"ACCURACY: {accuracy_score(y_test, model.predict(X_test)):.4f}")

# Save (overwrite old weak model)
os.makedirs("cefr_german_model", exist_ok=True)
joblib.dump(model,      "cefr_german_model/model.pkl")
joblib.dump(vectorizer, "cefr_german_model/vectorizer.pkl")
joblib.dump(labels,     "cefr_german_model/labels.pkl")
print("Model saved!")

ACCURACY: 0.7912
Model saved!


In [10]:
import joblib

# Load your perfect 79.1% model
vectorizer = joblib.load("cefr_german_model/vectorizer.pkl")
model      = joblib.load("cefr_german_model/model.pkl")
labels     = ['A1', 'A2', 'B1', 'B2', 'C1']

def cefr(sentence):
    return labels[model.predict(vectorizer.transform([sentence]))[0]]

# One sentence → one prediction → done
sentence = input("Enter a German sentence: ").strip()

if not sentence:
    print("No input — goodbye!")
else:
    level = cefr(sentence)
    print(f"\nCEFR Level: {level}")

Enter a German sentence:  Ich denke, dass wir mehr für den Umweltschutz tun sollten.



CEFR Level: C1
