In [1]:
import spacy
import pyphen

# Load French NLP model
nlp = spacy.load("fr_core_news_sm")

# Sample test
text = "Le chat noir dort sur le canapé. Il rêve de poissons frais."
doc = nlp(text)

# Tokenization & sentence test
for sent in doc.sents:
    print("SENTENCE:", sent.text)

# Syllable test
dic = pyphen.Pyphen(lang='fr')
for token in doc:
    if token.is_alpha:
        print(token.text, "→", dic.inserted(token.text), "→", dic.inserted(token.text).count('-') + 1)



SENTENCE: Le chat noir dort sur le canapé.
SENTENCE: Il rêve de poissons frais.
Le → Le → 1
chat → chat → 1
noir → noir → 1
dort → dort → 1
sur → sur → 1
le → le → 1
canapé → ca-na-pé → 3
Il → Il → 1
rêve → rêve → 1
de → de → 1
poissons → pois-sons → 2
frais → frais → 1


In [2]:
def analyze_french_text(text):
    nlp = spacy.load("fr_core_news_sm")
    dic = pyphen.Pyphen(lang='fr')
    
    doc = nlp(text)
    sentences = list(doc.sents)
    
    word_count = 0
    syllable_count = 0
    complex_words = []
    
    for token in doc:
        if token.is_alpha:
            word = token.text
            word_count += 1
            syllables = dic.inserted(word).count('-') + 1
            syllable_count += syllables
            if syllables >= 3:
                complex_words.append(word)
    
    sentence_count = len(sentences)
    
    # Flesch Reading Ease (French version)
    # Formula adapted: 207 - (1.015 * ASL) - (73.6 * ASW)
    ASL = word_count / sentence_count if sentence_count else 0  # Average Sentence Length
    ASW = syllable_count / word_count if word_count else 0      # Average Syllables per Word
    flesch_score = 207 - (1.015 * ASL) - (73.6 * ASW)
    
    return {
        "word_count": word_count,
        "sentence_count": sentence_count,
        "syllable_count": syllable_count,
        "flesch_score": round(flesch_score, 2),
        "complex_words": complex_words
    }


In [3]:
sample_text = "Le chat noir dort sur le canapé. Il rêve de poissons frais."
result = analyze_french_text(sample_text)
print(result)


{'word_count': 12, 'sentence_count': 2, 'syllable_count': 15, 'flesch_score': 108.91, 'complex_words': ['canapé']}


In [4]:
# Sample CEFR word list for French (expand this later)
cefr_vocab = {
    "le": "A1", "chat": "A1", "noir": "A1", "dort": "A2",
    "sur": "A1", "canapé": "B1", "rêve": "B2", "de": "A1",
    "poissons": "A2", "frais": "B1", "il": "A1", "manger": "A2",
    "parce": "A1", "que": "A1", "chien": "A1", "cuisine": "B1"
}


In [5]:
def analyze_cefr_vocab(text, vocab_map):
    nlp = spacy.load("fr_core_news_sm")
    doc = nlp(text)
    
    cefr_counts = {
        "A1": 0, "A2": 0, "B1": 0,
        "B2": 0, "C1": 0, "C2": 0,
        "Unknown": 0
    }
    
    word_levels = {}

    for token in doc:
        if token.is_alpha:
            word = token.text.lower()
            level = vocab_map.get(word, "Unknown")
            cefr_counts[level] += 1
            word_levels[word] = level
    
    # Estimate CEFR based on most frequent *non-A1* level
    levels_ranked = ["C2", "C1", "B2", "B1", "A2", "A1"]
    estimated = max(
        (lvl for lvl in levels_ranked if cefr_counts[lvl] > 0),
        key=lambda lvl: cefr_counts[lvl],
        default="A1"
    )

    return {
        "word_levels": word_levels,
        "cefr_distribution": cefr_counts,
        "estimated_cefr_level": estimated
    }


In [6]:
text = "Le chat noir dort sur le canapé. Il rêve de poissons frais."

cefr_result = analyze_cefr_vocab(text, vocab_map=cefr_vocab)
print(cefr_result)


{'word_levels': {'le': 'A1', 'chat': 'A1', 'noir': 'A1', 'dort': 'A2', 'sur': 'A1', 'canapé': 'B1', 'il': 'A1', 'rêve': 'B2', 'de': 'A1', 'poissons': 'A2', 'frais': 'B1'}, 'cefr_distribution': {'A1': 7, 'A2': 2, 'B1': 2, 'B2': 1, 'C1': 0, 'C2': 0, 'Unknown': 0}, 'estimated_cefr_level': 'A1'}


In [7]:
def summarize_text_analysis(text, vocab_map):
    readability = analyze_french_text(text)
    cefr = analyze_cefr_vocab(text, vocab_map)
    
    return {
        "readability": readability,
        "cefr": cefr
    }


In [9]:
text = input()




 L’Algérie a refusé, lundi 17 mars, la liste des noms d’une soixantaine d’Algériens à expulser que la France lui a soumise il y a quelques jours, une démarche « rejetée sur la forme et le fond » par Alger. « Les autorités algériennes ont décidé de ne pas donner suite à la liste soumise par les autorités françaises » et les ont « invitées à suivre le canal d’usage, en l’occurrence celui établi entre les préfectures et les consulats », précise un communiqué du ministère des affaires étrangères.


In [10]:
summary = summarize_text_analysis(text, vocab_map=cefr_vocab)
print(summary)

{'readability': {'word_count': 81, 'sentence_count': 3, 'syllable_count': 127, 'flesch_score': 64.2, 'complex_words': ['Algérie', 'refusé', 'Algériens', 'expulser', 'rejetée', 'autorités', 'algériennes', 'décidé', 'autorités', 'invitées', 'occurrence', 'préfectures', 'communiqué', 'ministère']}, 'cefr': {'word_levels': {'algérie': 'Unknown', 'a': 'Unknown', 'refusé': 'Unknown', 'lundi': 'Unknown', 'mars': 'Unknown', 'la': 'Unknown', 'liste': 'Unknown', 'des': 'Unknown', 'noms': 'Unknown', 'une': 'Unknown', 'soixantaine': 'Unknown', 'algériens': 'Unknown', 'à': 'Unknown', 'expulser': 'Unknown', 'que': 'A1', 'france': 'Unknown', 'lui': 'Unknown', 'soumise': 'Unknown', 'il': 'A1', 'y': 'Unknown', 'quelques': 'Unknown', 'jours': 'Unknown', 'démarche': 'Unknown', 'rejetée': 'Unknown', 'sur': 'A1', 'forme': 'Unknown', 'et': 'Unknown', 'le': 'A1', 'fond': 'Unknown', 'par': 'Unknown', 'alger': 'Unknown', 'les': 'Unknown', 'autorités': 'Unknown', 'algériennes': 'Unknown', 'ont': 'Unknown', 'déc

In [13]:
!pip install pandas

Collecting pandas


[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading pandas-2.2.3-cp310-cp310-win_amd64.whl (11.6 MB)
     --------------------------------------- 11.6/11.6 MB 13.9 MB/s eta 0:00:00
Collecting tzdata>=2022.7
  Downloading tzdata-2025.2-py2.py3-none-any.whl (347 kB)
     ------------------------------------- 347.8/347.8 kB 10.9 MB/s eta 0:00:00
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.2.3 tzdata-2025.2


In [14]:
import pandas as pd

def load_cefr_vocab_from_flelex(csv_path):
    french_df = pd.read_csv(csv_path, sep="\t", encoding="utf-8", engine="python")
    cefr_levels = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
    def infer_cefr_level(row):
        freqs = [row[f'freq_{level}'] for level in cefr_levels]
        max_index = freqs.index(max(freqs))
        return cefr_levels[max_index]
    french_df['inferred_cefr_level'] = french_df.apply(infer_cefr_level, axis=1)
    
    # Create dictionary: {word: inferred_cefr_level}
    vocab_map = dict(zip(french_df['word'].str.lower(), french_df['inferred_cefr_level']))
    
    return vocab_map


In [15]:
cefr_vocab = load_cefr_vocab_from_flelex("FLELex_TreeTagger.csv")

In [18]:
cefr_vocab

{'-ci': 'C1',
 'abaisser': 'C2',
 'abandon': 'C1',
 'abandonner': 'B1',
 'abasourdir': 'B2',
 'abattage': 'C1',
 'abattoir': 'C1',
 'abattre': 'C1',
 'abbaye': 'B1',
 'abbé': 'B1',
 'abeille': 'C2',
 'aberration': 'C1',
 'aboiement': 'B1',
 'abolir': 'C1',
 'abolition': 'C1',
 'abomination': 'C1',
 'abondamment': 'C1',
 'abondance': 'B1',
 'abondant': 'C2',
 'abonder': 'B2',
 'abonnement': 'A2',
 'abonner': 'C1',
 'abonné': 'C1',
 'abord': 'C1',
 'abordable': 'C1',
 'aborder': 'C1',
 'aborigène': 'A2',
 'aboutir': 'C1',
 'aboutissement': 'C2',
 'aboyer': 'B1',
 'abreuver': 'A2',
 'abri': 'B1',
 'abricot': 'A1',
 'abriter': 'A2',
 'abroger': 'C1',
 'abruti': 'B2',
 'abrutir': 'B1',
 'abréger': 'B2',
 'absence': 'C2',
 'absent': 'C1',
 'absenter': 'C2',
 'absentéisme': 'B1',
 'absolu': 'B2',
 'absolument': 'A2',
 'absorber': 'C1',
 'absorption': 'C1',
 'absoudre': 'B1',
 'abstenir': 'C1',
 'abstention': 'B1',
 'abstraction': 'C2',
 'abstraire': 'C1',
 'absurde': 'B1',
 'abus': 'C1',
 'ab

In [21]:
summary = summarize_text_analysis(text, vocab_map=cefr_vocab)
summary

{'readability': {'word_count': 81,
  'sentence_count': 3,
  'syllable_count': 127,
  'flesch_score': 64.2,
  'complex_words': ['Algérie',
   'refusé',
   'Algériens',
   'expulser',
   'rejetée',
   'autorités',
   'algériennes',
   'décidé',
   'autorités',
   'invitées',
   'occurrence',
   'préfectures',
   'communiqué',
   'ministère']},
 'cefr': {'word_levels': {'algérie': 'Unknown',
   'a': 'Unknown',
   'refusé': 'Unknown',
   'lundi': 'A1',
   'mars': 'C2',
   'la': 'A1',
   'liste': 'C2',
   'des': 'Unknown',
   'noms': 'Unknown',
   'une': 'Unknown',
   'soixantaine': 'C1',
   'algériens': 'Unknown',
   'à': 'C2',
   'expulser': 'C2',
   'que': 'B2',
   'france': 'Unknown',
   'lui': 'A2',
   'soumise': 'Unknown',
   'il': 'A1',
   'y': 'A1',
   'quelques': 'Unknown',
   'jours': 'Unknown',
   'démarche': 'C2',
   'rejetée': 'Unknown',
   'sur': 'C1',
   'forme': 'C2',
   'et': 'A1',
   'le': 'C2',
   'fond': 'B1',
   'par': 'A2',
   'alger': 'Unknown',
   'les': 'Unknown',
 

In [23]:
!pip install analyzer

Collecting analyzer
  Downloading analyzer-0.1.1.tar.gz (1.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting scipy>=0.13.0
  Downloading scipy-1.15.2-cp310-cp310-win_amd64.whl (41.2 MB)
     ---------------------------------------- 41.2/41.2 MB 5.7 MB/s eta 0:00:00
Collecting SQLAlchemy>=0.8
  Downloading sqlalchemy-2.0.40-cp310-cp310-win_amd64.whl (2.1 MB)
     ---------------------------------------- 2.1/2.1 MB 9.0 MB/s eta 0:00:00
Collecting pbr<1.7.0
  Downloading pbr-1.6.0-py2.py3-none-any.whl (87 kB)
     ---------------------------------------- 87.9/87.9 kB 5.2 MB/s eta 0:00:00
Collecting mox
  Downloading mox-0.5.3.tar.gz (32 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pyStock
  Downloading pystock-0.1.6.tar.gz (14 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting analyzer


[notice] A new release of pip available: 22.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
from flask import Flask, request, jsonify
import threading

# Load vocab once
cefr_vocab = load_cefr_vocab_from_flelex("FLELex_TreeTagger.csv")

# Define Flask app
app = Flask(__name__)

@app.route('/analyze', methods=['POST'])
def analyze():
    data = request.get_json()
    text = data.get("text", "")
    
    if not text:
        return jsonify({"error": "No text provided"}), 400

    result = summarize_text_analysis(text, vocab_map=cefr_vocab)
    return jsonify(result)

# Function to run Flask without blocking
def run_flask():
    app.run(port=5000, debug=False, use_reloader=False)

# Start Flask in background thread
threading.Thread(target=run_flask).start()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [16/Apr/2025 17:02:26] "POST /analyze HTTP/1.1" 200 -


In [26]:
import requests

text = " L’Algérie a refusé, lundi 17 mars, la liste des noms d’une soixantaine d’Algériens à expulser que la France lui a soumise il y a quelques jours, une démarche « rejetée sur la forme et le fond » par Alger. « Les autorités algériennes ont décidé de ne pas donner suite à la liste soumise par les autorités françaises » et les ont « invitées à suivre le canal d’usage, en l’occurrence celui établi entre les préfectures et les consulats », précise un communiqué du ministère des affaires étrangères."
response = requests.post("http://127.0.0.1:5000/analyze", json={"text": text})
print(response.json())


{'cefr': {'cefr_distribution': {'A1': 10, 'A2': 3, 'B1': 4, 'B2': 7, 'C1': 5, 'C2': 16, 'Unknown': 36}, 'estimated_cefr_level': 'C2', 'word_levels': {'a': 'Unknown', 'affaires': 'Unknown', 'alger': 'Unknown', 'algérie': 'Unknown', 'algériennes': 'Unknown', 'algériens': 'Unknown', 'autorités': 'Unknown', 'canal': 'B2', 'celui': 'C1', 'communiqué': 'C1', 'consulats': 'Unknown', 'de': 'C2', 'des': 'Unknown', 'donner': 'B2', 'du': 'C2', 'décidé': 'B1', 'démarche': 'C2', 'en': 'B1', 'entre': 'C2', 'et': 'A1', 'expulser': 'C2', 'fond': 'B1', 'forme': 'C2', 'france': 'Unknown', 'françaises': 'Unknown', 'il': 'A1', 'invitées': 'Unknown', 'jours': 'Unknown', 'la': 'A1', 'le': 'C2', 'les': 'Unknown', 'liste': 'C2', 'lui': 'A2', 'lundi': 'A1', 'mars': 'C2', 'ministère': 'C2', 'ne': 'B2', 'noms': 'Unknown', 'occurrence': 'C2', 'ont': 'Unknown', 'par': 'A2', 'pas': 'B2', 'précise': 'Unknown', 'préfectures': 'Unknown', 'que': 'B2', 'quelques': 'Unknown', 'refusé': 'Unknown', 'rejetée': 'Unknown', 's