# Grammar Scoring Engine – Deep Learning + Audio Features

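This notebook includes:
- Whisper ASR (speech-to-text transcription of each audio file)
- BERT-based grammar scoring (BERT embeddings plus a LanguageTool error count, fed to a Ridge regressor)
- Audio fluency features (pause ratio and duration; filler-word detection is not yet implemented)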

In [None]:
!pip install -q openai-whisper transformers torch librosa language-tool-python

In [None]:

import re

import language_tool_python
import librosa
import numpy as np
import pandas as pd
import torch
import whisper
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from transformers import BertModel, BertTokenizer


## Load Models

In [None]:

# Instantiate every pretrained component once, up front; the feature
# functions defined later read these as module-level globals.

# LanguageTool checker configured with the 'en-US' language code.
tool = language_tool_python.LanguageTool('en-US')

# Tokenizer and encoder are both loaded from the "bert-base-uncased" checkpoint
# so that the vocabulary matches the model weights.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")

# Whisper "base" checkpoint, used by transcribe() for speech-to-text.
asr_model = whisper.load_model("base")


## Feature Functions

In [None]:

def transcribe(path):
    """Transcribe the audio file at ``path`` with the global Whisper model.

    Returns the value stored under the ``'text'`` key of the model's result.
    """
    result = asr_model.transcribe(path)
    transcript = result['text']
    return transcript

def audio_features(path, silence_threshold=0.01):
    """Compute simple fluency features for the audio file at ``path``.

    Parameters
    ----------
    path : str
        Audio file to load with ``librosa.load``.
    silence_threshold : float, default 0.01
        Samples whose absolute amplitude is strictly below this value are
        counted as "pause" samples.

    Returns
    -------
    tuple
        ``(duration, pauses)`` where ``duration`` is the value reported by
        ``librosa.get_duration`` and ``pauses`` is the fraction of samples
        below ``silence_threshold`` (0.0 for an empty signal).
    """
    # sr=None asks librosa to keep the file's own sampling rate instead of
    # resampling to its default.
    y, sr = librosa.load(path, sr=None)
    duration = librosa.get_duration(y=y, sr=sr)

    # An empty signal would make the ratio 0/0 = NaN, which later breaks model
    # fitting; report "no pauses" instead.
    if len(y) == 0:
        return duration, 0.0

    # NOTE: this is a per-sample amplitude test, so low-amplitude samples inside
    # speech (e.g. around zero crossings) are counted as well as true silences.
    pauses = np.mean(np.abs(y) < silence_threshold)
    return duration, pauses

def grammar_features(text):
    """Return how many matches the global LanguageTool checker reports for ``text``."""
    return len(tool.check(text))

def bert_embedding(text):
    """Encode ``text`` with the global BERT model and return a flat NumPy vector.

    The text is tokenized into PyTorch tensors (truncated to at most 128
    tokens), run through the encoder with gradient tracking disabled, and the
    model's ``pooler_output`` is flattened to one dimension.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        pooled = bert(**encoded).pooler_output
    return pooled.numpy().flatten()


## Load Data

In [None]:

def extract_features(path):
    """Build the feature vector for one audio file.

    The vector is the BERT embedding of the transcript followed by three
    scalars: audio duration, pause ratio and grammar-error count.
    """
    text = transcribe(path)
    dur, pause = audio_features(path)
    gerr = grammar_features(text)
    emb = bert_embedding(text)
    return np.concatenate([emb, [dur, pause, gerr]])


df = pd.read_csv('/kaggle/input/train.csv')
y = df['grammar_score']

# Only the file path is needed per row, so iterate that column directly
# instead of materialising a full Series per row with iterrows().
X = np.array([extract_features(path) for path in df['file_path']])

# Fail early if the vectors did not stack into one row per label.
assert X.ndim == 2 and len(X) == len(y), f"unexpected feature matrix shape {X.shape}"


## Train Model

In [None]:

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Ridge's L2 penalty depends on feature scale, and the feature vector mixes
# BERT activations with seconds, ratios and counts. Standardising inside the
# pipeline puts them on a common scale and fits the scaler on the training
# split only, so no validation statistics leak into training.
# The fitted Ridge estimator remains accessible as `model[-1]`.
model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
model.fit(X_train, y_train)

preds = model.predict(X_val)
print("MAE:", mean_absolute_error(y_val, preds))


## Done