In [1]:
import pandas as pd
import json
from openai import OpenAI
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# The API key is read from the environment, never written in the notebook.
openai_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_key)

# JSON Lines input: one record per line. Later cells read the columns
# `text_id`, `original` and `target_cefr` from this frame.
df = pd.read_json("tsar2025_test_blind.jsonl", lines=True)

In [2]:
# Re-read of the API key; `gpt` below talks to the API through `client`,
# not through this variable.
openai_key = os.environ.get("OPENAI_API_KEY")


def gpt(prompt):
    """Send one user message to the chat model and return the whole reply.

    The request is streamed; the streamed pieces are stitched back together
    so the caller always receives a single string.

    Args:
        prompt: Text sent as the only message, with role ``"user"``.

    Returns:
        The concatenated content of every streamed chunk, or ``""`` when the
        stream carried no content at all.
    """
    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1000,
        stream=True,
        # Low temperature / top_p and a fixed seed: keep repeated runs as
        # close to each other as the API allows.
        temperature=0.1,
        top_p=0.2,
        seed=42,
    )

    pieces = []
    for chunk in response:
        # A chunk can arrive with an empty `choices` list; indexing [0] on it
        # would raise IndexError and lose the text received so far.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content:
            pieces.append(content)

    return "".join(pieces)

In [3]:
# One-shot baseline: a single model call per text, with no classifier feedback.
# The simplified texts are collected in notebook-level lists and then written
# out as JSON Lines (one {"text_id", "simplified"} object per line).


def _one_shot_prompt(target_level, original_text):
    """Build the one-shot simplification prompt for a supported target level.

    Args:
        target_level: Upper-case CEFR level; only 'A2' and 'B1' have a prompt.
        original_text: The text to be simplified, embedded verbatim.

    Returns:
        The full prompt string.

    Raises:
        ValueError: If no prompt exists for `target_level`.
    """
    if target_level == 'A2':
        return (
            f"You are a language teacher simplifying texts to A2 CEFR level.\n\n"
            f"OBJECTIVE: Transform this text to A2 level while preserving all original meaning and information.\n\n"
            f"A2 LANGUAGE REQUIREMENTS:\n"
            f"- Vocabulary: Most common 1500 English words only\n"
            f"- Sentences: 8-12 words, one clear idea per sentence\n"
            f"- Grammar: Simple present/past, basic future (will), basic modals (can/must/should)\n"
            f"- Connectors: and, but, because, so, when, if, then\n"
            f"- Style: Personal, concrete, everyday language\n\n"
            f"STRICT LEVEL CONTROL:\n"
            f"- Above A1: Include personal experiences, feelings, plans, time references\n"
            f"- Below B1: No present perfect, passive voice, or complex connectors (however, although, despite)\n"
            f"- Below B1: No abstract concepts without concrete explanation\n\n"
            f"TRANSFORMATION PROCESS:\n"
            f"1. Identify all key information and meaning\n"
            f"2. Break complex sentences into simple A2 structures\n"
            f"3. Replace advanced vocabulary with A2 equivalents\n"
            f"4. Convert complex grammar to simple A2 patterns\n"
            f"5. Verify all original meaning is preserved\n\n"
            f"CRITICAL: Do not omit, summarize, or change any information. Only change HOW it's expressed.\n\n"
            f"Return only the simplified text, with no explanations.\n\n"
            f"Text to simplify:\n"
            f"\"\"\"\n{original_text}\n\"\"\"\n"
        )
    if target_level == 'B1':
        return (
            f"You are an expert CEFR B1 text simplification specialist with deep understanding of automatic language assessment systems.\n\n"
            f"OBJECTIVE: Transform this text to precise B1 level while preserving all original meaning and information.\n\n"
            f"B1 LANGUAGE REQUIREMENTS:\n"
            f"- Vocabulary: 2000-3000 most common English words, avoid academic/formal terms\n"
            f"- Sentences: 15-22 words, can connect 2 related ideas with clear logic\n"
            f"- Grammar: Present perfect (have/has done), simple passive (is/was done), basic conditionals (if...will/would), modals (should, might, could, would)\n"
            f"- Connectors: however, although, while, since, unless, because, so that, even though\n"
            f"- Style: Clear intermediate language that shows reasoning and personal opinions\n\n"

            f"STRICT LEVEL CONTROL:\n"
            f"- Above A2: Include abstract concepts with simple explanation, cause-effect relationships, personal opinions with basic justification, intermediate grammar patterns\n"
            f"- Below B2: No academic/formal vocabulary (facilitate→help, demonstrate→show, utilize→use), no complex conditional structures, no sophisticated argumentation, no specialized terminology without explanation\n"
            f"- PRECISE B1 TARGET: Intermediate complexity using everyday vocabulary - never oversimplify to A2, never undersimplify leaving B2+ elements\n\n"

            f"CRITICAL B1 DIFFERENTIATORS:\n"
            f"- From A2: Can handle abstract ideas but explains them simply using common words\n"
            f"- From B2: Uses everyday vocabulary even for complex concepts, avoids formal/academic tone\n"
            f"- B1 signature: Connects ideas logically but with simple language patterns\n\n"

            f"TRANSFORMATION PROCESS:\n"
            f"1. Identify all key information and meaning\n"
            f"2. Scan for B2+ vocabulary and replace with B1 common equivalents\n"
            f"3. Convert complex sentences to B1 structures (maximum 2 clauses per sentence)\n"
            f"4. Add simple explanations for any remaining complex concepts\n"
            f"5. Include 2-3 B1 grammar markers per paragraph naturally\n"
            f"6. Verify consistent B1 complexity throughout - no A2 oversimplification, no B2+ elements remaining\n\n"

            f"CRITICAL: Do not omit, summarize, or change any information. Only change HOW it's expressed to match B1 patterns that automatic CEFR classifiers consistently recognize as B1 level.\n\n"

            f"Return only the simplified text, with no explanations.\n\n"

            f"Text to simplify:\n"
            f"\"\"\"\n{original_text}\n\"\"\"\n"
        )
    raise ValueError(f"No simplification prompt is defined for target level {target_level!r}")


# Normalise the level spelling once, and refuse to start if any row asks for a
# level without a prompt. Previously such a row silently reused the prompt
# built for the preceding row (or failed with NameError on the first row).
target_levels = [str(level).upper() for level in df['target_cefr']]
unsupported_levels = sorted(set(target_levels) - {'A2', 'B1'})
if unsupported_levels:
    raise ValueError(f"Unsupported target CEFR level(s) in input: {unsupported_levels}")

all_simplifications = []
text_ids = []
# Iterate over the column values themselves so the loop does not depend on the
# frame having a default 0..n-1 index.
for target_level, original_text, text_id in zip(target_levels, df['original'], df['text_id']):
    prompt = _one_shot_prompt(target_level, original_text)
    print(f"Processing: {text_id}")
    simplified_text = gpt(prompt)
    all_simplifications.append(simplified_text)
    text_ids.append(text_id)

output_path = 'test_results/gpt_4_turbo_one_try.jsonl'
# Make sure the target folder exists before opening the file for writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    for text_id, simplified_text in zip(text_ids, all_simplifications):
        item = {
            "text_id": text_id,
            "simplified": simplified_text
        }
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

Processing: 21-a2
Processing: 22-a2
Processing: 23-a2
Processing: 24-a2
Processing: 25-a2
Processing: 26-a2
Processing: 27-a2
Processing: 28-a2
Processing: 29-a2
Processing: 30-a2
Processing: 31-a2
Processing: 32-a2
Processing: 33-a2
Processing: 34-a2
Processing: 35-a2
Processing: 36-a2
Processing: 37-a2
Processing: 38-a2
Processing: 39-a2
Processing: 40-a2
Processing: 41-a2
Processing: 42-a2
Processing: 43-a2
Processing: 44-a2
Processing: 45-a2
Processing: 46-a2
Processing: 47-a2
Processing: 48-a2
Processing: 49-a2
Processing: 50-a2
Processing: 51-a2
Processing: 52-a2
Processing: 53-a2
Processing: 54-a2
Processing: 55-a2
Processing: 56-a2
Processing: 57-a2
Processing: 58-a2
Processing: 59-a2
Processing: 60-a2
Processing: 61-a2
Processing: 62-a2
Processing: 63-a2
Processing: 64-a2
Processing: 65-a2
Processing: 66-a2
Processing: 67-a2
Processing: 68-a2
Processing: 69-a2
Processing: 70-a2
Processing: 71-a2
Processing: 72-a2
Processing: 73-a2
Processing: 74-a2
Processing: 75-a2
Processing

In [None]:
import numpy as np
from sklearn.metrics import f1_score, root_mean_squared_error
from transformers import pipeline

# CEFR levels in ascending order. A label's position in this list is the
# integer rank used further down for adjacent accuracy and RMSE.
CEFR_LABELS = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
LABEL2IDX = dict(zip(CEFR_LABELS, range(len(CEFR_LABELS))))

# Three text-classification pipelines, loaded in this order; the functions
# below query all of them for every text.
cefr_labeler1, cefr_labeler2, cefr_labeler3 = (
    pipeline(task="text-classification", model=model_id)
    for model_id in (
        "AbdullahBarayan/ModernBERT-base-doc_en-Cefr",
        "AbdullahBarayan/ModernBERT-base-doc_sent_en-Cefr",
        "AbdullahBarayan/ModernBERT-base-reference_AllLang2-Cefr2",
    )
)


def get_cefr_labels(simplifications: list, models=None):
    """Predict one CEFR label per text, trusting the most confident classifier.

    Each model is asked for its top prediction on a text; the label whose
    score is the highest across all models is kept for that text.

    Args:
        simplifications: Texts to classify.
        models: Callables that take a text and return a sequence whose first
            item is a dict with "label" and "score" keys. When omitted, the
            three notebook-level classifiers are used. They are looked up at
            call time, so re-creating a classifier does not leave this
            function holding on to the old object.

    Returns:
        A list with one label per input text, in input order.
    """
    if models is None:
        models = [cefr_labeler1, cefr_labeler2, cefr_labeler3]

    cefr_labels = []
    for simplification in simplifications:
        top_preds = [model(simplification)[0] for model in models]
        # On equal scores the earlier model in `models` wins (max keeps the first).
        best = max(top_preds, key=lambda pred: pred["score"])
        cefr_labels.append(best["label"])
    return cefr_labels

def get_cefr_compliance_score(simplifications: list, reference_levels: list, models=None):
    """Score how well predicted CEFR levels match the requested ones.

    Args:
        simplifications: Texts to classify.
        reference_levels: Target label for each text; every label must be a
            key of LABEL2IDX.
        models: Classifiers handed to get_cefr_labels. When omitted, the three
            notebook-level classifiers are used, looked up at call time
            rather than frozen when this function is defined.

    Returns:
        Dict with 'weighted_f1', 'adj_accuracy' (share of texts predicted at
        most one level away from the target) and 'rmse' (over level indices),
        each rounded to 4 decimals.

    Raises:
        AssertionError: If the two input lists differ in length.
    """
    assert len(simplifications) == len(reference_levels), "The number of simplifications is different of the number of reference_levels."

    if models is None:
        models = [cefr_labeler1, cefr_labeler2, cefr_labeler3]

    predicted_labels = get_cefr_labels(simplifications=simplifications, models=models)
    f1 = f1_score(reference_levels, predicted_labels, average='weighted')

    # Compare levels as positions on the A1..C2 scale.
    true_idx = np.array([LABEL2IDX[l] for l in reference_levels])
    pred_idx = np.array([LABEL2IDX[l] for l in predicted_labels])

    adj_acc = (np.abs(true_idx - pred_idx) <= 1).mean()
    rmse = root_mean_squared_error(true_idx, pred_idx)

    return {'weighted_f1': round(f1,4),
            'adj_accuracy': round(adj_acc,4),
            'rmse': round(rmse,4)}

def evaluate_single_text_cefr(text, target_level, models=None):
    """Classify one text and compare the prediction with its target level.

    Args:
        text: The text to classify.
        target_level: Requested CEFR level in any letter case (upper-cased
            before comparison).
        models: Classifiers handed to get_cefr_labels. When omitted, the three
            notebook-level classifiers are used, looked up at call time.

    Returns:
        ``(predicted_level, metrics)``. `metrics` has 'weighted_f1',
        'adj_accuracy' and 'rmse', rounded to 4 decimals. If no label was
        produced or anything raised during evaluation, the result is
        ``(None, metrics)`` with all three metrics set to 0.0. In that case
        the 0.0 'rmse' is a placeholder, not a perfect score, so callers
        should check `predicted_level` first.
    """
    if models is None:
        models = [cefr_labeler1, cefr_labeler2, cefr_labeler3]

    def _failed():
        # Fresh dict per call so callers can never share or mutate one instance.
        return None, {'weighted_f1': 0.0, 'adj_accuracy': 0.0, 'rmse': 0.0}

    try:
        reference_levels = [target_level.upper()]
        predicted_labels = get_cefr_labels(simplifications=[text], models=models)

        if not predicted_labels or predicted_labels[0] is None:
            return _failed()

        predicted_level = predicted_labels[0]

        # With one sample, any miss leaves precision/recall undefined for a
        # label. zero_division=0 yields the same 0.0 that the default setting
        # produces, without emitting a warning on every miss.
        f1 = f1_score(reference_levels, predicted_labels, average='weighted', zero_division=0)

        true_idx = np.array([LABEL2IDX[l] for l in reference_levels])
        pred_idx = np.array([LABEL2IDX[l] for l in predicted_labels])

        adj_acc = (np.abs(true_idx - pred_idx) <= 1).mean()
        rmse = root_mean_squared_error(true_idx, pred_idx)

        metrics = {
            'weighted_f1': round(f1, 4),
            'adj_accuracy': round(adj_acc, 4),
            'rmse': round(rmse, 4)
        }
        return predicted_level, metrics

    except Exception as e:
        # Best effort by design: one text that cannot be scored must not
        # abort a long run, so the error is reported and a failure is returned.
        print(f"Evaluation error: {e}")
        return _failed()
def iterative_cefr_simplification(df, max_iterations=5):
    """Simplify every text in `df`, retrying with classifier feedback.

    For each row the model is prompted, the output is classified, and if the
    predicted level is not the target the output is sent back with a note
    saying whether it was too complex or too simple. The loop for a text ends
    as soon as the target level is predicted or `max_iterations` is reached.

    Args:
        df: DataFrame with columns 'target_cefr' ('A2' or 'B1', any letter
            case), 'original' and 'text_id'. Rows are read in order; the
            index values are not used.
        max_iterations: Maximum number of attempts per text (at least 1).

    Returns:
        ``(results, all_simplifications, best_versions)``:
        - results: one dict per text holding the final attempt, its scores
          and the full 'iteration_history'.
        - all_simplifications: one flat record per attempt.
        - best_versions: one ``{'text_id', 'simplified'}`` dict per text,
          holding the attempt whose predicted level is closest to the target
          (earliest attempt on ties).

    Raises:
        ValueError: If `max_iterations` is below 1 or a row asks for a level
            other than A2 / B1.
    """
    if max_iterations < 1:
        # With zero attempts there is no history to report and no text to return.
        raise ValueError(f"max_iterations must be at least 1, got {max_iterations}")

    # Fail before any paid API call if a row asks for a level without a prompt.
    unsupported = sorted({str(level).lower() for level in df['target_cefr']} - {'a2', 'b1'})
    if unsupported:
        raise ValueError(f"Unsupported target CEFR level(s) in input: {unsupported}")

    def get_simplification_prompt(original_text, target_level, current_level=None, iteration=0):
        """Return the prompt for `target_level`, with feedback after the first attempt."""

        base_feedback = ""
        if current_level and iteration > 0:
            if current_level != target_level.upper():
                base_feedback = f"\nCURRENT ISSUE: The previous version was classified as {current_level}, but we need {target_level.upper()} level. "

                # CEFR labels (A1 < A2 < B1 < B2 < C1 < C2) sort in level
                # order as plain strings, so string comparison is sufficient.
                if current_level > target_level.upper():
                    base_feedback += "The text is TOO COMPLEX. Simplify more aggressively."
                else:
                    base_feedback += "The text is TOO SIMPLE. Add more complexity while staying at target level."

        if target_level.lower() == 'a2':
            return f"""You are a language teacher simplifying texts to A2 CEFR level.

OBJECTIVE: Transform this text to A2 level while preserving all original meaning and information.
{base_feedback}

A2 LANGUAGE REQUIREMENTS:
- Vocabulary: Most common 1500 English words only
- Sentences: 8-12 words, one clear idea per sentence
- Grammar: Simple present/past, basic future (will), basic modals (can/must/should)
- Connectors: and, but, because, so, when, if, then
- Style: Personal, concrete, everyday language

STRICT LEVEL CONTROL:
- Above A1: Include personal experiences, feelings, plans, time references
- Below B1: No present perfect, passive voice, or complex connectors (however, although, despite)
- Below B1: No abstract concepts without concrete explanation

TRANSFORMATION PROCESS:
1. Identify all key information and meaning
2. Break complex sentences into simple A2 structures
3. Replace advanced vocabulary with A2 equivalents
4. Convert complex grammar to simple A2 patterns
5. Verify all original meaning is preserved

CRITICAL: Do not omit, summarize, or change any information. Only change HOW it's expressed.

Return only the simplified text. Do not include any other comments, notes, or additional information.

Text to simplify:
\"\"\"
{original_text}
\"\"\"
"""

        elif target_level.lower() == 'b1':
            return f"""You are an expert CEFR B1 text simplification specialist with deep understanding of automatic language assessment systems.

OBJECTIVE: Transform this text to precise B1 level while preserving all original meaning and information.
{base_feedback}

B1 LANGUAGE REQUIREMENTS:
- Vocabulary: 2000-3000 most common English words, avoid academic/formal terms
- Sentences: 15-22 words, can connect 2 related ideas with clear logic
- Grammar: Present perfect (have/has done), simple passive (is/was done), basic conditionals (if...will/would), modals (should, might, could, would)
- Connectors: however, although, while, since, unless, because, so that, even though
- Style: Clear intermediate language that shows reasoning and personal opinions

STRICT LEVEL CONTROL:
- Above A2: Include abstract concepts with simple explanation, cause-effect relationships, personal opinions with basic justification, intermediate grammar patterns
- Below B2: No academic/formal vocabulary (facilitate→help, demonstrate→show, utilize→use), no complex conditional structures, no sophisticated argumentation, no specialized terminology without explanation
- PRECISE B1 TARGET: Intermediate complexity using everyday vocabulary - never oversimplify to A2, never undersimplify leaving B2+ elements

CRITICAL B1 DIFFERENTIATORS:
- From A2: Can handle abstract ideas but explains them simply using common words
- From B2: Uses everyday vocabulary even for complex concepts, avoids formal/academic tone
- B1 signature: Connects ideas logically but with simple language patterns

TRANSFORMATION PROCESS:
1. Identify all key information and meaning
2. Scan for B2+ vocabulary and replace with B1 common equivalents
3. Convert complex sentences to B1 structures (maximum 2 clauses per sentence)
4. Add simple explanations for any remaining complex concepts
5. Include 2-3 B1 grammar markers per paragraph naturally
6. Verify consistent B1 complexity throughout - no A2 oversimplification, no B2+ elements remaining

CRITICAL: Do not omit, summarize, or change any information. Only change HOW it's expressed to match B1 patterns that automatic CEFR classifiers consistently recognize as B1 level.

Return only the simplified text. Do not include any other comments, notes, or additional information.

Text to simplify:
\"\"\"
{original_text}
\"\"\"
"""

        # Never hand a missing prompt to the model.
        raise ValueError(f"No simplification prompt is defined for target level {target_level!r}")

    results = []
    best_versions = []
    all_simplifications = []

    # Walk the column values directly: this works for filtered or re-indexed
    # frames, where label lookups such as df['original'][0] would fail.
    for raw_level, original_text, text_id in zip(df['target_cefr'], df['original'], df['text_id']):
        target_level = raw_level.lower()

        print(f"\n=== Processing {text_id} (Target: {target_level.upper()}) ===")

        current_text = original_text
        simplified_text = None
        iteration_history = []

        best_iteration = None
        best_distance = None

        for iteration in range(max_iterations):
            print(f"\nIteration {iteration + 1}/{max_iterations}")

            if iteration == 0:
                prompt = get_simplification_prompt(current_text, target_level)
            else:
                last_level = iteration_history[-1]['predicted_level']
                prompt = get_simplification_prompt(current_text, target_level, last_level, iteration)

            simplified_text = gpt(prompt)

            predicted_level, metrics = evaluate_single_text_cefr(simplified_text, target_level)

            iteration_info = {
                'iteration': iteration + 1,
                'text': simplified_text,
                'predicted_level': predicted_level,
                'weighted_f1': metrics['weighted_f1'],
                'adj_accuracy': metrics['adj_accuracy'],
                'rmse': metrics['rmse'],
                'target_achieved': predicted_level == target_level.upper() if predicted_level else False
            }

            iteration_history.append(iteration_info)

            # For a single text the weighted F1 is 1.0 on an exact hit and 0.0
            # otherwise, so it cannot rank near misses. The single-sample RMSE
            # is the distance in levels from the target; a failed evaluation
            # (no predicted level) reports rmse 0.0 and is therefore ranked last.
            distance = metrics['rmse'] if predicted_level is not None else float('inf')
            if best_iteration is None or distance < best_distance:
                best_iteration = iteration_info.copy()
                best_distance = distance

            all_simplifications.append({
                'text_id': text_id,
                'iteration': iteration + 1,
                'target_level': target_level,
                'predicted_level': predicted_level,
                'weighted_f1': metrics['weighted_f1'],
                'adj_accuracy': metrics['adj_accuracy'],
                'rmse': metrics['rmse'],
                'simplified_text': simplified_text,
                # The source text is stored once per text, on its first record.
                'original_text': original_text if iteration == 0 else None
            })

            print(f"Prediction: {predicted_level}")
            print(f"F1: {metrics['weighted_f1']}, Adj Acc: {metrics['adj_accuracy']}, RMSE: {metrics['rmse']}")
            print(f"Target achieved: {'Yes' if iteration_info['target_achieved'] else 'No'}")

            if iteration_info['target_achieved']:
                print(f"Target {target_level.upper()} achieved at iteration {iteration + 1}!")
                break

            # The next attempt rewrites the previous output, not the source text.
            current_text = simplified_text

        last_attempt = iteration_history[-1]
        final_result = {
            'text_id': text_id,
            'original': original_text,
            'target_level': target_level,
            'final_simplified': simplified_text,
            'iterations_used': len(iteration_history),
            'final_predicted_level': last_attempt['predicted_level'],
            'final_weighted_f1': last_attempt['weighted_f1'],
            'final_adj_accuracy': last_attempt['adj_accuracy'],
            'final_rmse': last_attempt['rmse'],
            'target_achieved': last_attempt['target_achieved'],
            'iteration_history': iteration_history
        }

        results.append(final_result)

        best_versions.append({
            'text_id': text_id,
            'simplified': best_iteration['text']
        })

        if final_result['target_achieved']:
            print(f"Success: {text_id} -> {target_level.upper()} in {len(iteration_history)} iterations")
        else:
            print(f"Failed: {text_id} -> {final_result['final_predicted_level']} instead of {target_level.upper()}")

    return results, all_simplifications, best_versions

def analyze_iterative_results(results):
    """Print and return summary statistics for an iterative simplification run.

    Args:
        results: List of per-text dicts with at least the keys 'text_id',
            'target_level', 'iterations_used', 'final_predicted_level' and
            'target_achieved'. May be empty.

    Returns:
        Dict with:
        - 'total_texts', 'successful_texts', 'success_rate' (percentage;
          0.0 when `results` is empty);
        - 'iteration_stats': {iterations_used: {'total', 'successful'}};
        - 'level_stats': {target_level: {'total', 'successful'}};
        - 'multi_iteration_texts': summary dicts for texts that needed more
          than one iteration.
    """

    def tally_by(field):
        """Count total and successful texts for each distinct value of `field`."""
        stats = {}
        for result in results:
            bucket = stats.setdefault(result[field], {'total': 0, 'successful': 0})
            bucket['total'] += 1
            if result['target_achieved']:
                bucket['successful'] += 1
        return stats

    total_texts = len(results)
    successful_texts = sum(1 for r in results if r['target_achieved'])
    # An empty run has no meaningful rate; report 0.0 instead of dividing by zero.
    success_rate = successful_texts / total_texts * 100 if total_texts else 0.0

    print(f"\n{'='*50}")
    print("ITERATIVE SIMPLIFICATION RESULTS ANALYSIS")
    print(f"{'='*50}")
    print(f"Processed texts: {total_texts}")
    print(f"Successful texts: {successful_texts}")
    print(f"Success rate: {success_rate:.1f}%")

    iteration_stats = tally_by('iterations_used')

    print(f"\nDistribution by iterations:")
    for iterations, stats in sorted(iteration_stats.items()):
        success_rate_iter = stats['successful'] / stats['total'] * 100
        print(f"  {iterations} iterations: {stats['successful']}/{stats['total']} ({success_rate_iter:.1f}% success)")

    level_stats = tally_by('target_level')

    print(f"\nPerformance by levels:")
    for level, stats in sorted(level_stats.items()):
        success_rate_level = stats['successful'] / stats['total'] * 100
        print(f"  {level.upper()}: {stats['successful']}/{stats['total']} ({success_rate_level:.1f}% success)")

    multi_iteration_texts = [
        {
            'text_id': result['text_id'],
            'iterations': result['iterations_used'],
            'final_level': result['final_predicted_level'],
            'target_achieved': result['target_achieved']
        }
        for result in results
        if result['iterations_used'] > 1
    ]

    print(f"\n{'='*50}")
    print(f"TEXTS WITH MULTIPLE ITERATIONS ({len(multi_iteration_texts)} texts):")
    print(f"{'='*50}")
    for text in multi_iteration_texts:
        status = "✓" if text['target_achieved'] else "✗"
        print(f"{status} {text['text_id']}: {text['iterations']} iterations (final: {text['final_level']})")

    return {
        'total_texts': total_texts,
        'successful_texts': successful_texts,
        'success_rate': success_rate,
        'iteration_stats': iteration_stats,
        'level_stats': level_stats,
        'multi_iteration_texts': multi_iteration_texts
    }

def save_all_simplifications(all_simplifications, best_versions, filename='test_results/iterative_gpt-4-1106.csv',
                             best_versions_path='tsar2025sharedtask_evaluation/submissions/test_results/iterative_gpt-4-1106.jsonl'):
    """Write the run's outputs to disk in three files.

    1. `best_versions` as JSON Lines at `best_versions_path`.
    2. `all_simplifications` as CSV at `filename`.
    3. `all_simplifications` as indented JSON next to the CSV, with the same
       base name and a ``.json`` extension.

    Missing parent directories are created.

    Args:
        all_simplifications: List of per-attempt record dicts.
        best_versions: List of JSON-serialisable dicts, one per text.
        filename: Path of the CSV file.
        best_versions_path: Path of the JSON Lines file.
    """
    import json
    import os

    import pandas as pd

    def ensure_parent_dir(path):
        """Create the folder that will contain `path`, if it has one."""
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    ensure_parent_dir(best_versions_path)
    with open(best_versions_path, 'w', encoding='utf-8') as f:
        for item in best_versions:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')
    # Report the path that was actually written.
    print(f"Best versions saved to '{best_versions_path}'")

    ensure_parent_dir(filename)
    df_simplifications = pd.DataFrame(all_simplifications)
    df_simplifications.to_csv(filename, index=False, encoding='utf-8')
    # Romanian: "All simplifications were saved to ..."
    print(f"\nToate simplificările au fost salvate în '{filename}'")

    # Swap only the extension. A plain str.replace('.csv', '.json') leaves a
    # name without '.csv' unchanged, and the JSON would then overwrite the CSV.
    root, ext = os.path.splitext(filename)
    json_filename = filename + '.json' if ext.lower() == '.json' else root + '.json'
    with open(json_filename, 'w', encoding='utf-8') as f:
        json.dump(all_simplifications, f, ensure_ascii=False, indent=2)
    # Romanian: "JSON format saved to ..."
    print(f"Format JSON salvat în '{json_filename}'")

def main():
    """Run the iterative pipeline on the notebook-level `df` and report on it.

    Steps: simplify every text (up to 5 attempts each), print the run
    analysis, save all outputs, then score the final texts against their
    target levels.

    Returns:
        A 5-tuple ``(iterative_results, analysis, compliance_scores,
        all_simplifications, best_versions)``. When the run produced no
        results, `analysis` and `compliance_scores` are ``None`` and nothing
        is saved. The tuple shape is kept so that callers which unpack five
        values keep working.
    """
    iterative_results, all_simplifications, best_versions = iterative_cefr_simplification(df, max_iterations=5)

    if not iterative_results:
        # Nothing to analyse, save or score.
        return iterative_results, None, None, all_simplifications, best_versions

    analysis = analyze_iterative_results(iterative_results)

    save_all_simplifications(all_simplifications, best_versions)

    # NOTE(review): the scores below are for the last attempt of each text,
    # while the saved `best_versions` may hold a different attempt. Confirm
    # which set is meant to be reported.
    final_texts = [r['final_simplified'] for r in iterative_results]
    reference_levels = [r['target_level'].upper() for r in iterative_results]

    print(f"\n{'='*50}")
    print(f"DETAILED FINAL EVALUATION")
    print(f"{'='*50}")

    compliance_scores = get_cefr_compliance_score(final_texts, reference_levels)
    print(f"Weighted F1: {compliance_scores['weighted_f1']}")
    print(f"Adjacent Accuracy: {compliance_scores['adj_accuracy']}")
    print(f"RMSE: {compliance_scores['rmse']}")

    return iterative_results, analysis, compliance_scores, all_simplifications, best_versions

# Run the whole pipeline and keep its five outputs as notebook-level variables.
iterative_results, analysis, compliance_scores, all_simplifications, best_versions = main()