In [118]:
import whisper
from pprint import pprint
import pandas as pd
from typing import List
from pydantic import BaseModel
from openai import OpenAI


In [None]:
# Notebook-wide pandas display settings: no truncation of long cell text, and display width set to 0.
# NOTE(review): pandas documents None as the auto-detect value for display.width — confirm 0 is intended.
pd.options.display.max_colwidth = None
pd.options.display.width = 0


In [2]:
# Load the "base" Whisper checkpoint and place it on the CPU.
model = whisper.load_model(name="base", device="cpu")

In [3]:
# Transcribe the sample recording. word_timestamps=True is requested because the
# following cells read per-word entries from each segment (seg["words"]).
# fp16=False is presumably set because the model was loaded on the CPU.
result = model.transcribe(
    audio="sample2.flac",
    fp16=False,
    word_timestamps=True,
)

In [29]:
# Flatten the per-segment word lists of the transcription into one record per word:
# stripped text, start/end time, duration (end - start) and the word probability.
words = [
    {
        "word": w["word"].strip(),
        "start": float(w["start"]),
        "end": float(w["end"]),
        "duration": float(w["end"] - w["start"]),
        "confidence": float(w["probability"]),
    }
    for seg in result["segments"]
    for w in seg.get("words", [])  # tolerate segments that carry no word list
]

# Naming the columns explicitly keeps the frame well-formed when nothing was
# transcribed; without it an empty list yields a frame with no columns and the
# sort below raises KeyError.
df_words = pd.DataFrame(
    words,
    columns=["word", "start", "end", "duration", "confidence"],
)

# Show the five words with the lowest confidence.
df_words.sort_values("confidence").head(5)


Unnamed: 0,word,start,end,duration,confidence
19,can,6.6,6.7,0.1,0.241798
30,and,10.78,11.06,0.28,0.457742
36,red,12.48,12.9,0.42,0.650255
9,Vera,3.5,3.72,0.22,0.663709
37,gamecock.,12.9,13.6,0.7,0.692071


In [5]:
# Summarise each transcription segment: stripped text, start/end time, duration
# and the mean probability of the words inside it.
segments = []
for seg in result["segments"]:
    seg_start = float(seg["start"])
    seg_end = float(seg["end"])
    word_probs = [float(w["probability"]) for w in seg.get("words", [])]
    segments.append({
        "text": seg["text"].strip(),
        "start": seg_start,
        "end": seg_end,
        "duration": seg_end - seg_start,
        # A segment without words has nothing to average. Use NaN instead of
        # dividing by zero (the previous fallback divisor of 0.0 raised ZeroDivisionError).
        "avg_word_confidence": (
            sum(word_probs) / len(word_probs) if word_probs else float("nan")
        ),
    })

df_segments = pd.DataFrame(segments)
with pd.option_context('display.max_colwidth', None):
    print(df_segments.head())

                                                                                         text  \
0  Before he had time to answer a much encumbered Vera burst into the room with the question,   
1                                                              I say, can I leave these here?   
2                    These were a small black pig and a lusty specimen of black red gamecock.   

   start    end  duration  avg_word_confidence  
0   0.00   5.34      5.34             0.923389  
1   5.84   7.48      1.64             0.835031  
2   8.50  13.60      5.10             0.870124  


In [27]:
# Speaking rate. The duration is the end time of the last segment, i.e. it is
# measured from t=0, so any silence before the first segment counts as speaking time.
# Both guards cover a transcription with no segments (or a zero end time),
# which previously raised IndexError / ZeroDivisionError.
total_duration = float(df_segments.iloc[-1]["end"]) if not df_segments.empty else 0.0
words_per_minute = (len(df_words) * 60) / total_duration if total_duration > 0 else 0.0

print(f"Total duration (s): {total_duration:.2f}")
print(f"Number of words: {len(df_words)}")
print(f"Words per minute: {words_per_minute:.2f}")

Total duration (s): 13.60
Number of words: 38
Words per minute: 167.65


In [71]:
# Gap between consecutive words: start of word i+1 minus end of word i.
next_word_starts = df_words["start"].values[1:]
prev_word_ends = df_words["end"].values[:-1]
pauses = next_word_starts - prev_word_ends

# Gaps strictly longer than 1.0 and 2.0 (same units as the word timestamps).
very_long_pauses = pauses[pauses > 2.0]
long_pauses = pauses[pauses > 1.0]
pauses

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.48, 0.  , 0.  , 0.  ,
       0.  , 1.02, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  ])

In [76]:
# Vocabulary statistics over normalised tokens.
# Tokens are lower-cased and have leading/trailing non-word characters trimmed,
# so that "here?" and "here" count as the same word. Tokens that end up empty
# (pure punctuation) are dropped. Characters inside a word (e.g. apostrophes)
# are left alone.
words_clean = (
    df_words["word"]
    .str.lower()
    .str.replace(r"^\W+|\W+$", "", regex=True)
    .loc[lambda tokens: tokens != ""]
)
words_unique = words_clean.nunique()
words_total = len(words_clean)
# Type/token ratio: distinct words divided by all words (0 when there are no words).
vocab_richness = words_unique / words_total if words_total > 0 else 0
top_repeats = words_clean.value_counts().head(5)
print(f"Total words: {words_total}")
print(f"Unique words: {words_unique}")
print(f"Vocabulary richness: {vocab_richness:.2f}")

Total words: 38
Unique words: 32
Vocabulary richness: 0.84


In [91]:
# Sample spoken-English transcript used as input for the grammar analysis.
# The wording is kept exactly as given: the missing words and wrong verb forms
# appear to be the errors the analysis is meant to find, so do not "correct" them.
# Only the mis-encoded apostrophes were repaired to U+2019.
TRANSCRIPT = """\
I’d like to about a person I really admire, and that is my elder.

She’s about five years older than me, and I’ve know her my whole life, obviously, but we became much closer when I was in high school. That was the time when I was quite confused about my future, and she was already working, so I used to talk to her a lot.

One I really admire in her is her sense of responsibility. No matter how tired she is, she always finishes what she starts. She’s also very calm, which is the complete opposite of me. When there’s a problem, she doesn’t panic; instead, she tries to find a practical solution. I think that’s a very useful skill in real life.

Another thing I admire is that she’s very independent. She moved to another city for her and managed everything on her own, from rent to finances, without complaining much. Watching her made me realize that being independent isn’t easy, but it’s definitely worth it.

I admire her mainly because she motivates me without forcing me. She never lectures me, but her actions push me to work harder and be more disciplined. Honestly, whenever I feel lazy, I just think about how she handles her life, and that gives me a reality check.
"""

In [99]:
# Text sent as the user message of the grammar-analysis request: the transcript, unchanged.
prompt_text = TRANSCRIPT

In [107]:
# System message for the grammar-analysis request (sent with role "system").
# It fixes the error categories, the reporting rules and the JSON shape of the
# answer; the Pydantic models GrammarError / ErrorsByType / GrammarAnalysis
# declare the same fields, so keep the two in sync when editing either.
system_prompt = """
You are an expert grammar analysis engine specialized in spoken English assessment.

Your task:
- Analyze the transcript as natural spoken English.
- Detect grammatical errors that would be penalized in formal spoken English evaluation (e.g. IELTS Speaking).
- Identify errors precisely and conservatively.

Error categories:
- tense
- agreement
- articles
- prepositions
- sentence_structure
- other

Rules:
- Ignore pronunciation issues.
- Ignore vocabulary choice unless grammatically incorrect.
- Do NOT correct the text.
- Do NOT suggest fixes.
- Do NOT explain grammar rules.
- Do NOT infer intended meaning beyond what is grammatically present.
- Treat omissions (missing required words or structures) as grammatical errors.
- Count each distinct grammatical error once.

For each detected error:
- Provide the minimal text span that contains the error.
- Include the sentence index (0-based).
- Include character start and end positions relative to the full transcript.
- Do NOT include corrections or explanations.

Analysis steps:
1. Determine total sentence count (spoken-style sentences included).
2. Identify all grammatical errors.
3. Classify each error by type.
4. Ensure counts match the listed errors.

Return STRICT JSON using EXACTLY this schema:

{
  "sentence_count": number,
  "error_count": number,
  "errors_by_type": {
    "tense": number,
    "agreement": number,
    "articles": number,
    "prepositions": number,
    "sentence_structure": number,
    "other": number
  },
  "errors": [
    {
      "type": string,
      "context": string,
      "sentence_index": number,
      "char_start": number,
      "char_end": number
    }
  ]
}

If no grammatical errors are present, return zero counts and an empty errors array.
"""


In [120]:
# One reported grammatical error; mirrors an item of the "errors" array in the system prompt schema.
class GrammarError(BaseModel):
    # Error category. The system prompt lists: tense, agreement, articles,
    # prepositions, sentence_structure, other.
    # NOTE(review): declared as a plain str, so the category set is not enforced here.
    type: str
    # Text span that contains the error.
    context: str
    # Sentence index; the system prompt asks for 0-based values.
    sentence_index: int
    # Character positions; the system prompt asks for offsets relative to the full transcript.
    char_start: int
    char_end: int


# Per-category error counts; one integer field for each category named in the system prompt.
class ErrorsByType(BaseModel):
    tense: int
    agreement: int
    articles: int
    prepositions: int
    sentence_structure: int
    other: int


# Top-level result of the grammar analysis; passed as text_format to client.responses.parse.
class GrammarAnalysis(BaseModel):
    # Number of sentences the model counted in the transcript.
    sentence_count: int
    # Total number of errors; the system prompt asks for this to match the listed errors.
    error_count: int
    # Error counts broken down by category.
    errors_by_type: ErrorsByType
    # The individual errors.
    errors: List[GrammarError]

In [126]:
# Build the client from the environment. With no api_key argument the SDK reads
# the OPENAI_API_KEY environment variable, so the notebook no longer depends on
# an `api_key` variable that no cell defines, and no secret is stored in the notebook.
client = OpenAI()

# Send the system prompt and the transcript, and have the SDK parse the answer
# directly into the GrammarAnalysis model.
response = client.responses.parse(
    model="gpt-4o-mini",
    input=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt_text},
    ],
    text_format=GrammarAnalysis,  # response schema comes from this Pydantic model
)

grammar_data: GrammarAnalysis = response.output_parsed
# output_parsed is None when no parsed content came back (for example a refusal).
# Fail here with a clear message rather than with an AttributeError in a later cell.
if grammar_data is None:
    raise RuntimeError(
        "Grammar analysis returned no parsed output (the model may have refused)."
    )

In [127]:
# List every reported error as "<category> <text span>", one per line.
for issue in grammar_data.errors:
    print(f"{issue.type} {issue.context}")

tense I’ve know her my whole life
agreement One I really admire in her
articles to another city for her
articles she’s the complete opposite of me
prepositions watching her made me realize that being independent isn’t easy
