In [None]:
import os
import re
import librosa
import pandas as pd
import matplotlib.pyplot as plt

def extract_no(filename):
    """Return the integer N found in an ``audio_N.wav`` name, or -1 if there is none.

    The pattern is searched anywhere inside ``filename``; it does not have to
    span the whole string.
    """
    found = re.search(r'audio_(\d+)\.wav', filename)
    if not found:
        return -1
    return int(found.group(1))

def get_audio_info(audio_dir):
    """Collect the sample rate and duration of every ``.wav`` file in a directory.

    Files are visited in ascending order of the number ``extract_no`` returns
    for their name. A file that fails to load is reported on stdout and
    skipped, so one bad file does not abort the scan.

    Args:
        audio_dir: Directory whose entries ending in ``.wav`` are inspected.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``sample_rate`` and
        ``duration_sec``, one row per successfully loaded file. The columns
        are always present, even when no file was loaded, so a CSV written
        from the result keeps its header and column lookups keep working.

    Raises:
        OSError: If ``audio_dir`` cannot be listed.
    """
    columns = ['filename', 'sample_rate', 'duration_sec']
    wav_files = sorted(
        (name for name in os.listdir(audio_dir) if name.endswith('.wav')),
        key=extract_no,
    )

    records = []
    for file in wav_files:
        path = os.path.join(audio_dir, file)
        try:
            # sr=None asks librosa to keep the file's own sample rate.
            y, sr = librosa.load(path, sr=None)
            duration = librosa.get_duration(y=y, sr=sr)
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error hai in {file}: {e}")
            continue
        records.append({'filename': file, 'sample_rate': sr, 'duration_sec': duration})

    # Passing the schema explicitly keeps an empty result well-formed.
    return pd.DataFrame(records, columns=columns)

def plot_aud_distri(df,col='duration_sec'):
    """Show a 50-bin histogram of one column of ``df``.

    Args:
        df: DataFrame holding the values to plot.
        col: Name of the column to histogram; also used in the title and as
            the x-axis label.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(df[col], bins=50, color='skyblue', edgecolor='black')
    ax.set_title(f'Distri of aud file {col}')
    # The label used to be f'{col})', which rendered a stray closing parenthesis.
    ax.set_xlabel(str(col))
    ax.set_ylabel('Number of Files')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

def flag_audio_info(df,csv_path,threshold_sec=70):
    """Write the rows whose duration exceeds a threshold to a CSV file.

    Args:
        df: DataFrame with a ``duration_sec`` column.
        csv_path: Destination of the CSV. When it is a filesystem path, missing
            parent directories are created first.
        threshold_sec: Rows with ``duration_sec`` strictly greater than this
            value are kept. Defaults to 70.

    The kept rows are written in ascending order of ``duration_sec`` and a
    one-line summary is printed.
    """
    too_long = (
        df[df['duration_sec'] > threshold_sec]
        .sort_values(by='duration_sec', ascending=True)
    )

    # DataFrame.to_csv does not create directories; do it here so a fresh
    # checkout without the output folder does not fail on the write.
    if isinstance(csv_path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(csv_path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    too_long.to_csv(csv_path, index=False)
    print(f"Saved {len(too_long)} long files to {csv_path}")

def mod_flag_audio_info(df_all,df_flagged,csv_path):
    """Add the ``label`` column to the flagged files and save the result as CSV.

    Every row of ``df_flagged`` is kept (left join on ``filename``); rows whose
    filename is not found in ``df_all`` end up with a missing label. Only the
    ``filename`` and ``label`` columns of ``df_all`` take part in the join.
    """
    label_lookup = df_all[['filename', 'label']]
    flagged_with_labels = df_flagged.merge(label_lookup, on='filename', how='left')
    flagged_with_labels.to_csv(csv_path, index=False)
    print(f"Merged CSV saved as: {csv_path}")


# Path configuration read by the cells that follow. The names are bound only
# when this cell runs with __name__ == "__main__".
# NOTE(review): a later cell in this notebook assigns audio_dir and output_csv
# again, so re-running the cells below after that one picks up the new values.
if __name__ == "__main__":
    # Directory passed to get_audio_info.
    audio_dir = 'Dataset/audios/train' 
    # CSV whose 'filename' and 'label' columns are merged into the flagged files.
    labels_csv = 'Dataset/train.csv'
    # Per-file metadata written from the get_audio_info result.
    output_csv = 'logs/csvs/audio_prop.csv'
    # Subset written by flag_audio_info.
    output2_csv = 'logs/csvs/flag_audio_prop.csv'
    # Flagged subset with labels, written by mod_flag_audio_info.
    output3_csv = 'logs/csvs/modflags_audio_prop.csv'


In [None]:
# Scan the audio directory and persist the per-file metadata so the following
# cells can reload it from disk.
df = get_audio_info(audio_dir)
# DataFrame.to_csv does not create missing folders; make sure the target exists.
os.makedirs(os.path.dirname(output_csv) or ".", exist_ok=True)
df.to_csv(output_csv, index=False)
print(f"Metadata saved to {output_csv}")

In [None]:
# Reload the saved metadata and show the histogram of the default column
# (duration_sec).
df = pd.read_csv(output_csv)
plot_aud_distri(df)

In [None]:
# Reload the saved metadata and write the long files selected by
# flag_audio_info to output2_csv.
df = pd.read_csv(output_csv)
flag_audio_info(df,output2_csv)


In [None]:
# Attach the 'label' column from the labels CSV to the flagged files and save
# the merged table to output3_csv.
df_all = pd.read_csv(labels_csv)
df_flagged = pd.read_csv(output2_csv)
mod_flag_audio_info(df_all,df_flagged,output3_csv)

In [None]:

# Histogram of the 'label' values among the flagged files.
df_mod = pd.read_csv(output3_csv)
plot_aud_distri(df_mod,col='label')


In [None]:
import os 
import pandas as pd
import librosa
from faster_whisper import WhisperModel

# Textual description for each score from 1.0 to 5.0 in half-point steps.
# NOTE(review): the only lookup into this dict in this cell is commented out,
# so the mapping is currently not used here.
grammar_rubric = {
    1.0: "The person's speech struggles with proper sentence structure and syntax, displaying limited control over simple grammatical structures and memorized sentence patterns.",
    1.5: "The person's speech shows signs of basic structure but still has notable issues with grammar and syntax.",
    2.0: "The person has a limited understanding of sentence structure and syntax. Although they use simple structures, they consistently make basic sentence structure and grammatical mistakes. They might leave sentences incomplete.",
    2.5: "The person sometimes forms correct sentences but often makes errors that affect understanding.",
    3.0: "The person demonstrates a decent grasp of sentence structure but makes errors in grammatical structure, or they show a decent grasp of grammatical structure but make errors in sentence syntax and structure.",
    3.5: "The person has mostly correct grammar but makes occasional errors that may slightly affect clarity.",
    4.0: "The person displays a strong understanding of sentence structure and syntax. They consistently show good control of grammar. While occasional errors may occur, they are generally minor and do not lead to misunderstandings; the person can correct most of them.",
    4.5: "The person speaks accurately most of the time, with small grammar issues that rarely affect clarity.",
    5.0: "Overall, the person showcases high grammatical accuracy and adept control of complex grammar. They use grammar accurately and effectively, seldom making noticeable mistakes. Additionally, they handle complex language structures well and correct themselves when necessary."
}

# Transcribe every audio file listed in label_csv and save one row per file
# (filename, transcription) to output_csv.
# NOTE(review): audio_dir and output_csv are also assigned by an earlier cell;
# after this cell runs, re-running the earlier cells would use these values.
label_csv = "Dataset/test.csv"
audio_dir = "Dataset/audios/test/"
output_csv = "testing_data.csv"

# Only the "filename" column of this CSV is read below.
df = pd.read_csv(label_csv)
model = WhisperModel("large", compute_type="float16")

# One dict per file that makes it into the output CSV.
rows = []

for idx, row in df.iterrows():
    filename = row["filename"]
    # label = float(row["label"])

    path = os.path.join(audio_dir, filename)
    if not os.path.exists(path):
        # A missing file is only reported; it gets no row in the output.
        print(f"[WARN] File not found: {path}")
        continue

    try:
        # Load with a 16 kHz target sample rate.
        y, sr = librosa.load(path, sr=16000)
        duration = librosa.get_duration(y=y, sr=sr)

        # Files longer than 70 s are not transcribed; the literal text "flag"
        # is stored in place of a transcription.
        if duration > 70:
            rows.append({
                "filename": filename,
                "transcription": "flag"
            })
            print(f"[FLAG] {filename} duration {duration:.2f}s marked as 'flag'")
            continue

        full_transcription = ""

        # Process in 20s chunks
        chunk_length = 20 * sr
        # The "+ 1" covers the final partial chunk. When the length is an exact
        # multiple of chunk_length it yields one empty chunk, which the
        # length check below skips.
        total_chunks = int(len(y) / chunk_length) + 1

        for i in range(total_chunks):
            start = i * chunk_length
            end = min((i + 1) * chunk_length, len(y))
            chunk = y[start:end]

            # Chunks with fewer than sr samples (under one second) are not
            # transcribed, so a short trailing remainder is dropped.
            if len(chunk) < sr:
                continue

            segments, _ = model.transcribe(chunk, language="en", beam_size=5)
            chunk_text = " ".join([seg.text for seg in segments]).strip()
            full_transcription += " " + chunk_text

        # rubric_desc = grammar_rubric.get(label, "Unknown rubric description.")

        # NOTE(review): the transcription can be an empty string here (for
        # example when every chunk was skipped); an empty CSV field is read
        # back by pandas as NaN, which downstream code must handle.
        rows.append({
            "filename": filename,
            "transcription": full_transcription.strip(),
        })

        print(f"[OK] Processed {filename}")

    except Exception as e:
        # Any failure is reported and the file is left out of the output.
        print(f"[ERR] Failed {filename}: {e}")

df_out = pd.DataFrame(rows)
df_out.to_csv(output_csv, index=False)
print(f"\nSaved {len(df_out)} entries to {output_csv}")


In [None]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, DataCollatorWithPadding


# Single source of truth for the pretrained checkpoint. The tokenizer used to
# be loaded from "bert-large-uncased" while the model came from
# "bert-base-uncased"; both are now taken from the checkpoint the model uses.
BASE_CHECKPOINT = "bert-base-uncased"
# Fixed seed so the train/eval split is the same on every run.
SPLIT_SEED = 42

# NOTE(review): none of the cells visible in this notebook writes this file;
# it must provide the "transcription" and "label" columns.
df = pd.read_csv("training_data_for_t5.csv")

# Rows without text or without a target cannot be used for training.
df = df.dropna(subset=["transcription", "label"])

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

tokenizer = BertTokenizer.from_pretrained(BASE_CHECKPOINT)

# num_labels=1 together with problem_type="regression" configures a single
# continuous output.
model = BertForSequenceClassification.from_pretrained(
    BASE_CHECKPOINT,
    num_labels=1,
    problem_type="regression",
)

def preprocess(batch):
    """Tokenize a batch of transcriptions and attach float regression targets.

    Each text is truncated and padded to exactly 256 tokens. The returned
    encoding carries an extra ``labels`` entry holding ``batch["label"]``
    converted to floats.
    """
    encoded = tokenizer(
        batch["transcription"],
        truncation=True,
        max_length=256,
        padding="max_length",
    )
    encoded["labels"] = list(map(float, batch["label"]))
    return encoded

# Run preprocess over both splits in batches; the result carries the token
# fields plus the float "labels" column that preprocess adds.
tokenized = dataset.map(preprocess, batched=True)

from transformers import TrainingArguments

# Training configuration. Evaluation and checkpointing both happen once per
# epoch, which load_best_model_at_end requires.
training_args = TrainingArguments(
    output_dir="./bert_grammar_regressor",
    num_train_epochs=50,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Without a limit, 50 epochs leave 50 full checkpoints on disk.
    save_total_limit=2,
    # Restore the weights with the lowest evaluation loss when training ends,
    # so the model saved afterwards is the best epoch rather than the last.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir="./logs",
    weight_decay=0.01,
)

# Collator that pads each batch with the tokenizer's padding settings.
# NOTE(review): preprocess already pads every example to max_length, so the
# collator presumably has nothing left to pad here.
data_collator = DataCollatorWithPadding(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# Save model and tokenizer into the same directory so both can be reloaded
# from one path.
save_dir = "./saved_bert_grammar_regressor"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Model saved to {save_dir}")


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# CSV with the "filename" and "transcription" columns to score.
INPUT_CSV = "testing_data.csv"
# Destination of the (filename, label) predictions.
OUTPUT_CSV = "submission2.csv"
# Same directory string the training cell saves the model and tokenizer to.
MODEL_DIR = "./saved_bert_grammar_regressor" 
# Same value as the max_length used when tokenizing for training.
MAX_LENGTH = 256

model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
# Switch to evaluation mode for inference.
model.eval()

# Scores the rubric allows: 1.0 to 5.0 in half-point steps.
valid_scores = np.array([1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])

def round_to_rubric(pred: float) -> float:
    """Snap a raw prediction to the nearest allowed rubric score.

    The value is first clamped to the range 1.0 to 5.0. When it lies exactly
    halfway between two allowed scores, the lower one is returned.
    """
    lower_bounded = max(pred, 1.0)
    clamped = min(lower_bounded, 5.0)
    distances = np.abs(valid_scores - clamped)
    # argmin returns the first minimum, which is what resolves ties downwards.
    nearest_idx = np.argmin(distances)
    return valid_scores[nearest_idx]

def get_flag_score():
    """Return the score assigned to files whose transcription is ``"flag"``.

    The value is the constant 5.0.

    NOTE(review): this function previously built ``scores`` and ``weights``
    lists that were never used; confirm that a constant is what is intended.
    """
    return 5.0

def predict_score(transcription: str) -> float:
    """Predict the rubric score for one transcription.

    Args:
        transcription: Text to score. The literal ``"flag"`` (any casing,
            surrounding whitespace ignored) selects the fixed flag score
            instead of a model prediction. A missing value (``None`` or NaN,
            which is what pandas yields for an empty CSV field) is treated as
            an empty string.

    Returns:
        One of the allowed rubric scores.
    """
    # An empty transcription is stored as an empty CSV field and comes back
    # from pandas as a float NaN, which has no .strip().
    if not isinstance(transcription, str):
        transcription = "" if pd.isna(transcription) else str(transcription)

    if transcription.strip().lower() == "flag":
        return get_flag_score()

    inputs = tokenizer(
        transcription,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    with torch.no_grad():
        output = model(**inputs)
    # The regression head emits a single value for a single input.
    raw_score = output.logits.squeeze().item()
    return round_to_rubric(raw_score)

# Score every transcription and write the (filename, label) submission file.
df = pd.read_csv(INPUT_CSV)

# Empty transcriptions are read back as NaN; score them as empty text instead
# of passing a float to predict_score.
df["label"] = df["transcription"].fillna("").apply(predict_score)

df[["filename", "label"]].to_csv(OUTPUT_CSV, index=False)
# Report the file that was actually written (the message used to name
# "submission.csv" regardless of OUTPUT_CSV).
print(f"Saved predictions to {OUTPUT_CSV}")