In [None]:
!pip install -q librosa xgboost tqdm
!pip install -q git+https://github.com/openai/whisper.git


In [None]:
import whisper
print(whisper.available_models())


In [None]:
import pandas as pd
import numpy as np
import librosa
import whisper
import re

from tqdm import tqdm
from xgboost import XGBRegressor


In [None]:
train_df = pd.read_csv(
    "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/train.csv"
)

test_df = pd.read_csv(
    "/kaggle/input/shl-intern-hiring-assessment-2025/dataset/csvs/test.csv"
)

print(train_df.head())
print(train_df.columns)


In [None]:
def extract_text_features(text):
    text = text.lower().strip()
    words = text.split()

    num_words = len(words)
    avg_word_len = sum(len(w) for w in words) / num_words if num_words > 0 else 0
    num_sentences = len(re.findall(r'[.!?]', text)) + 1 if text else 0
    lexical_diversity = len(set(words)) / num_words if num_words > 0 else 0

    return np.array([
        num_words,
        avg_word_len,
        num_sentences,
        lexical_diversity
    ])


def speech_rate(text, duration):
    words = text.split()
    return len(words) / duration if duration > 0 else 0


In [None]:
def extract_audio_features(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=16000)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

        feats = []
        for stat in [np.mean, np.std, np.min, np.max]:
            feats.extend(stat(mfcc, axis=1))

        duration = librosa.get_duration(y=y, sr=sr)

        return np.array(feats), duration   # 52 MFCC features + duration
    except:
        return np.zeros(52), 0.0


In [None]:
asr_model = whisper.load_model("base")

def transcribe_audio(audio_path):
    try:
        result = asr_model.transcribe(audio_path)
        return result["text"]
    except:
        return ""


In [None]:
X = []
y = []

for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    audio_path = (
        "/kaggle/input/shl-intern-hiring-assessment-2025/"
        f"dataset/audios/train/{row['filename']}.wav"
    )

    text = transcribe_audio(audio_path)
    audio_feat, duration = extract_audio_features(audio_path)
    text_feat = extract_text_features(text)
    rate = speech_rate(text, duration)

    features = np.concatenate([
    audio_feat,        # 52 MFCC stats
    text_feat,         # 4 text features
    np.array([rate])   # 1 speech-rate feature
    ])

X.append(features)
y.append(row["label"])

X = np.array(X)
y = np.array(y)

print("X shape:", X.shape)
print("y shape:", y.shape)


In [None]:
model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X, y)
print("Model training complete")


In [None]:
test_preds = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    audio_path = (
        "/kaggle/input/shl-intern-hiring-assessment-2025/"
        f"dataset/audios/test/{row['filename']}.wav"
    )

    text = transcribe_audio(audio_path)
    audio_feat, duration = extract_audio_features(audio_path)
    text_feat = extract_text_features(text)
    rate = speech_rate(text, duration)

    features = np.concatenate([
        audio_feat,
        text_feat,
        np.array([rate])
    ])

    pred = model.predict(features.reshape(1, -1))[0]
    test_preds.append(pred)


In [28]:
submission = pd.DataFrame({
    "filename": test_df["filename"],
    "label": test_preds
})

submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,filename,label
0,audio_141,4.5
1,audio_114,4.5
2,audio_17,4.5
3,audio_76,4.5
4,audio_156,4.5


In [25]:
from IPython.display import FileLink
FileLink("submission.csv")
