In [2]:
import pandas as pd

# Read the cleaned dataset from the notebook's working directory.
CSV_PATH = "merlin_meta_german_clean.csv"
df = pd.read_csv(CSV_PATH)

# Sanity check: show the first five rows (plain-text rendering via print).
preview = df.head()
print(preview)


                                                text CEFR
0  M. Meier Müllergasse 1 Stadt X Internationale ...   B2
1  Müller Julia Bahnhofsstr. 1 A Stadt X Armenien...   B2
2  Michael Meier 1 Zentralplatz 1234. Stadt X Aup...   B2
3  Eva Meier Schmidt Müllergasse 12 Stadt X Kroat...   B2
4  Abs. Frau EVA SCHMIDT BAHNHOFSTR, , 1234 STADT...   B1


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Baseline CEFR classifier: TF-IDF features + class-weighted logistic regression.

# Step 1: Load CSV (reloaded here so this cell does not rely on earlier cells)
df = pd.read_csv("merlin_meta_german_clean.csv")

# Step 2: Check class distribution
print("Class distribution:\n", df['CEFR'].value_counts())

# Step 3: Split into train/test (stratified to preserve class proportions).
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['CEFR'], test_size=0.2, random_state=42, stratify=df['CEFR']
)

# Step 4: Convert text to TF-IDF vectors.
# The vectorizer is fitted on the training split only, so no vocabulary or
# IDF statistics leak from the test split.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Step 5: Train Logistic Regression with class weights.
# `multi_class='multinomial'` is intentionally not passed: with the lbfgs
# solver a multinomial model is already what gets fitted for multi-class
# targets, and the explicit argument is deprecated in recent scikit-learn.
lr = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    class_weight='balanced',  # re-weights classes inversely to their frequency
)
lr.fit(X_train_tfidf, y_train)

# Step 6: Predict and evaluate.
# Very rare classes may never be predicted, which leaves their precision
# undefined. zero_division=0 reports those scores as 0.0 explicitly instead
# of emitting a warning for every affected metric.
y_pred = lr.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)


Class distribution:
 CEFR
B1    331
A2    306
B2    293
A1     57
C1     42
C2      4
Name: count, dtype: int64




Accuracy: 0.6473429951690821

Classification Report:
               precision    recall  f1-score   support

          A1       0.64      0.58      0.61        12
          A2       0.74      0.61      0.67        61
          B1       0.61      0.65      0.63        66
          B2       0.70      0.73      0.72        59
          C1       0.29      0.50      0.36         8
          C2       0.00      0.00      0.00         1

    accuracy                           0.65       207
   macro avg       0.50      0.51      0.50       207
weighted avg       0.66      0.65      0.65       207



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
# Partition the rows by CEFR label: the three listed levels form the
# minority subset, every other row goes to the majority subset.
is_minority = df['CEFR'].isin(['A1', 'C1', 'C2'])
minority_df = df[is_minority]
majority_df = df[~is_minority]


In [12]:
from transformers import MarianMTModel, MarianTokenizer

# Pretrained checkpoints for the two translation directions.
DE_EN_CHECKPOINT = 'Helsinki-NLP/opus-mt-de-en'
EN_DE_CHECKPOINT = 'Helsinki-NLP/opus-mt-en-de'

# NOTE: MarianMTModel is the PyTorch implementation; importing it fails
# with an ImportError when PyTorch is not installed in the kernel's
# environment.

# German -> English
de_en_model = MarianMTModel.from_pretrained(DE_EN_CHECKPOINT)
de_en_tokenizer = MarianTokenizer.from_pretrained(DE_EN_CHECKPOINT)

# English -> German
en_de_model = MarianMTModel.from_pretrained(EN_DE_CHECKPOINT)
en_de_tokenizer = MarianTokenizer.from_pretrained(EN_DE_CHECKPOINT)


  from .autonotebook import tqdm as notebook_tqdm


ImportError: 
MarianMTModel requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFMarianMTModel".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [10]:
pip install transformers


Collecting transformers
  Using cached transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Using cached huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading pyyaml-6.0.3-cp310-cp310-win_amd64.whl.metadata (2.4 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Using cached tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Using cached safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.34.0->transform