In [None]:
# This notebook builds a pipeline to evaluate grammar from audio files. It uses OpenAI Whisper for transcription and combines multiple models and metrics to assess language quality.

# For each audio input, the following are computed:

#     Transcription using Whisper
#     Grammar errors (LanguageTool)
#     Grammar correction (T5 model)
#     BLEU score between original and corrected text
#     Grammatical acceptability (CoLA via RoBERTa)
#     Fluency score (based on GPT-2 perplexity)
#     Readability (Flesch Reading Ease)
#     Coherence (semantic similarity between sentences)
#     Speech fluency (words per minute vs ideal rate)
#     Pronunciation score (1 - Word Error Rate between the transcription and its T5-corrected text)

# A weighted final score is generated for each file to summarize overall spoken English proficiency.

In [None]:
# Print the version of the `python` executable found on the shell PATH so the
# outputs below can be tied to an interpreter version.
# NOTE(review): `!python` is resolved by the shell and may not be the same
# interpreter the kernel is running — confirm if this matters.
!python --version


Python 3.11.11


In [None]:
!pip install -q torch torchaudio transformers torchmetrics
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q language-tool-python
!pip install -q spacy
!python -m spacy download en_core_web_sm
!pip install evaluate textstat jiwer


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m105.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import torch
import whisper
import librosa
import jiwer
import spacy
import textstat
import numpy as np
import pandas as pd
import language_tool_python

from sentence_transformers import SentenceTransformer
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    AutoTokenizer, AutoModelForSequenceClassification,
    GPT2LMHeadModel, GPT2Tokenizer
)
from torchmetrics.text.bleu import BLEUScore
from evaluate import load

In [None]:
# Run the models on the GPU when CUDA is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [None]:
# Initialize models

def init_models():
    """Load every model and helper object used by the evaluation pipeline.

    Returns:
        dict: Maps a short component name to the loaded object. Keys are
        ``whisper``, ``lt``, ``spacy``, ``t5_model``, ``t5_tokenizer``,
        ``bleu``, ``cola_tokenizer``, ``cola_model``, ``gpt2_tokenizer``,
        ``gpt2_model``, ``semantic`` and ``transformation``. The T5, CoLA and
        GPT-2 models are moved to the module-level ``device``.
    """
    # Checkpoint names shared by each model and its tokenizer.
    t5_checkpoint = "vennify/t5-base-grammar-correction"
    cola_checkpoint = "textattack/roberta-base-CoLA"
    gpt2_checkpoint = "gpt2"

    components = {}

    # Speech-to-text, rule-based grammar checker and spaCy pipeline.
    components["whisper"] = whisper.load_model("base")
    components["lt"] = language_tool_python.LanguageTool("en-US")
    components["spacy"] = spacy.load("en_core_web_sm")

    # Seq2seq grammar correction.
    t5_model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)
    components["t5_model"] = t5_model.to(device)
    components["t5_tokenizer"] = T5Tokenizer.from_pretrained(t5_checkpoint)

    # BLEU metric from the `evaluate` library.
    components["bleu"] = load("bleu")

    # Grammatical-acceptability classifier.
    components["cola_tokenizer"] = AutoTokenizer.from_pretrained(cola_checkpoint)
    cola_model = AutoModelForSequenceClassification.from_pretrained(cola_checkpoint)
    components["cola_model"] = cola_model.to(device)

    # Language model used for the perplexity-based fluency score.
    components["gpt2_tokenizer"] = GPT2Tokenizer.from_pretrained(gpt2_checkpoint)
    gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_checkpoint)
    components["gpt2_model"] = gpt2_model.to(device)

    # Sentence embeddings for the coherence score.
    components["semantic"] = SentenceTransformer("all-MiniLM-L6-v2")

    # Text normalisation: lower-case, drop punctuation, trim and collapse
    # whitespace.
    normalisation_steps = [
        jiwer.ToLowerCase(),
        jiwer.RemovePunctuation(),
        jiwer.Strip(),
        jiwer.RemoveMultipleSpaces(),
    ]
    components["transformation"] = jiwer.Compose(normalisation_steps)

    return components

# Load all models once at module level; the resulting dict is passed to
# evaluate_audio for every file instead of reloading per call.
models = init_models()

Some weights of the model checkpoint at textattack/roberta-base-CoLA were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def evaluate_audio(file_path, models):
    """Transcribe one audio file and score the spoken English it contains.

    Args:
        file_path: Path of the audio file. It is passed to Whisper's
            ``transcribe`` and, when Whisper returns no segments, to
            ``librosa.load`` to measure the duration.
        models: Dict of preloaded components as returned by ``init_models``.
            The keys read here are ``whisper``, ``lt``, ``t5_tokenizer``,
            ``t5_model``, ``bleu``, ``cola_tokenizer``, ``cola_model``,
            ``gpt2_tokenizer``, ``gpt2_model``, ``semantic`` and
            ``transformation``.

    Returns:
        dict: ``Transcription`` and ``Corrected Text`` (strings),
        ``Grammar Errors`` (int), and the rounded metrics ``BLEU``, ``CoLA``,
        ``Fluency``, ``Readability``, ``Coherence``, ``WPM``,
        ``Pronunciation`` and ``Final Score``. ``Final Score`` is the weighted
        sum of the component scores multiplied by 100. When the transcript
        contains no words, every metric is reported as zero.
    """
    asr_result = models["whisper"].transcribe(file_path, word_timestamps=True)

    transcription = asr_result["text"]
    segments = asr_result.get("segments", [])
    word_count = len(transcription.split())

    # Silent or unintelligible audio: the metrics below cannot be computed on
    # an empty transcript (BLEU, WER and the GPT-2 loss all fail on empty
    # input), so report zeros explicitly instead of crashing.
    if word_count == 0:
        return {
            "Transcription": transcription,
            "Corrected Text": "",
            "Grammar Errors": 0,
            "BLEU": 0.0,
            "CoLA": 0.0,
            "Fluency": 0.0,
            "Readability": 0.0,
            "Coherence": 0.0,
            "WPM": 0.0,
            "Pronunciation": 0.0,
            "Final Score": 0.0,
        }

    # Duration from Whisper if available
    if segments:
        duration = segments[-1]["end"] - segments[0]["start"]
    else:
        # sr=None keeps the native sample rate; resampling is not needed just
        # to measure the duration.
        samples, sample_rate = librosa.load(file_path, sr=None)
        duration = librosa.get_duration(y=samples, sr=sample_rate)

    # Grammar check
    # since this is rule based, it might not give desired results in all cases
    matches = models["lt"].check(transcription)
    grammar_errors = len(matches)
    # Clamp at 0: more rule matches than words would otherwise go negative.
    grammar_score = max(0.0, 1 - grammar_errors / max(word_count, 1))

    # T5 Grammar correction
    t5_tokenizer = models["t5_tokenizer"]
    t5_inputs = t5_tokenizer(
        "fix: " + transcription, return_tensors="pt", truncation=True
    ).to(device)
    t5_outputs = models["t5_model"].generate(**t5_inputs, max_new_tokens=128)
    corrected_text = t5_tokenizer.decode(t5_outputs[0], skip_special_tokens=True)

    # BLEU between the corrected text and the raw transcription. An empty
    # prediction makes the metric divide by zero, so score it as 0 instead.
    if corrected_text.strip():
        bleu = models["bleu"].compute(
            predictions=[corrected_text], references=[[transcription]]
        )["bleu"]
    else:
        bleu = 0.0

    # Inference only: skip building autograd graphs for the forward passes.
    with torch.no_grad():
        # CoLA: softmax probability at label index 1 is used as the score.
        cola_inputs = models["cola_tokenizer"](
            corrected_text, return_tensors="pt", truncation=True
        ).to(device)
        cola_logits = models["cola_model"](**cola_inputs).logits
        cola_score = torch.softmax(cola_logits, dim=1)[0][1].item()

        # Fluency (perplexity-based). Truncate to the tokenizer's maximum
        # length so long transcripts do not overflow GPT-2's context window.
        enc = models["gpt2_tokenizer"](
            transcription, return_tensors="pt", truncation=True
        ).to(device)
        if enc["input_ids"].shape[1] > 1:
            loss = models["gpt2_model"](**enc, labels=enc["input_ids"]).loss
            perplexity = torch.exp(loss).item()
            fluency_score = 1 / perplexity
        else:
            # A single token leaves nothing to predict, so the loss is undefined.
            fluency_score = 0.0

    # Readability: Flesch Reading Ease scaled to [0, 1]
    readability_score = min(max(textstat.flesch_reading_ease(transcription) / 100, 0), 1)

    # Coherence: mean dot product between embeddings of adjacent sentences.
    sents = transcription.split(". ")
    sent_vecs = models["semantic"].encode(sents)
    if len(sent_vecs) > 1:
        adjacent_similarity = [
            np.dot(sent_vecs[i], sent_vecs[i + 1]) for i in range(len(sent_vecs) - 1)
        ]
        # Cast to a Python float so float32 rounding artefacts do not leak into
        # the final score (e.g. 63.681999 instead of 63.682).
        coherence = float(np.mean(adjacent_similarity))
    else:
        coherence = 1.0

    # Speech Rate (using Whisper duration); guard against a zero-length span.
    wpm = word_count / (duration / 60) if duration > 0 else 0.0
    speech_fluency = max(0, 1 - abs(wpm - 140) / 140)

    # Pronunciation score: 1 - WER, with the corrected text as the reference.
    normalize = models["transformation"]
    reference = normalize(corrected_text)
    hypothesis = normalize(transcription)
    if reference and hypothesis:
        pronunciation = max(0, 1 - jiwer.wer(reference, hypothesis))
    else:
        # jiwer rejects an empty reference; treat it as a complete mismatch.
        pronunciation = 0.0

    # Final score. The weights sum to 1.0; speech_fluency carries the 0.05
    # that was previously missing (it was computed but never used).
    final = (
        0.2 * grammar_score
        + 0.15 * bleu
        + 0.15 * cola_score
        + 0.15 * fluency_score
        + 0.1 * readability_score
        + 0.1 * coherence
        + 0.1 * pronunciation
        + 0.05 * speech_fluency
    )

    return {
        "Transcription": transcription,
        "Corrected Text": corrected_text,
        "Grammar Errors": grammar_errors,
        "BLEU": round(bleu, 3),
        "CoLA": round(cola_score, 3),
        "Fluency": round(fluency_score, 3),
        "Readability": round(readability_score, 3),
        "Coherence": round(coherence, 3),
        "WPM": round(wpm, 1),
        "Pronunciation": round(pronunciation, 3),
        "Final Score": round(final * 100, 3)
    }


In [None]:
# Audio files to score; each one becomes a column in the summary table.
audio_files = ["good_sample1.m4a", "bad_sample1.m4a"]
results = {path: evaluate_audio(path, models) for path in audio_files}

# A dict of per-file dicts already yields one column per file and one row per
# metric, so no transpose or column reassignment is needed.
df = pd.DataFrame(results)
df.index.name = "Metric"
df


                                                 good_sample1.m4a  \
Metric                                                              
Transcription    I am delighted to be here today and appreciat...   
Corrected Text  I am delighted to be here today and appreciate...   
Grammar Errors                                                  0   
BLEU                                                        0.802   
CoLA                                                        0.973   
Fluency                                                     0.067   
Readability                                                  0.46   
Coherence                                                   0.363   
WPM                                                         176.9   
Pronunciation                                               0.782   
Final Score                                             63.681999   

                                                  bad_sample1.m4a  
Metric                            