<a href="https://colab.research.google.com/github/ptl-harsh/SHL_Task/blob/main/SHL_Hiring_Assessment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🗣️ Grammar Scoring System for Spoken Audio

## Install Required Libraries

In [2]:
!pip install librosa
!pip install tqdm




## Import Modules and Set Global Variables

In [14]:
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer
import zipfile

# Seed NumPy's global RNG so repeated runs are comparable
np.random.seed(42)

# Audio folders on the mounted Drive (change the root if the dataset moves)
_DATASET_ROOT = '/content/drive/MyDrive/Datasets/SHL_Task/dataset'
TRAIN_AUDIO_DIR = _DATASET_ROOT + '/audios_train'
TEST_AUDIO_DIR = _DATASET_ROOT + '/audios_test'


In [5]:
# Mount Google Drive at /content/drive; the dataset paths used in this
# notebook all live under that mount point, so this must run before any
# file is read. Requires the Colab runtime (google.colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load CSV Files

In [10]:
# The three CSV files sit side by side in one Drive folder
dataset_dir = '/content/drive/MyDrive/Datasets/SHL_Task/dataset'
train_df = pd.read_csv(f'{dataset_dir}/train.csv')
test_df = pd.read_csv(f'{dataset_dir}/test.csv')
sample_submission = pd.read_csv(f'{dataset_dir}/sample_submission.csv')

# Row counts as a quick sanity check on the load
print("Training samples:", len(train_df))
print("Test samples:", len(test_df))


Training samples: 444
Test samples: 195


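## Define Feature Extraction Function

Each audio clip is summarised by the mean and standard deviation of its MFCC coefficients over time, giving a fixed-length vector of `2 * n_mfcc` values per file.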

In [11]:
def extract_features(file_path, sr=22050, n_mfcc=13):
    """
    Summarise one audio file as per-coefficient MFCC statistics.

    Parameters
    ----------
    file_path : path of the audio file to load.
    sr : sample rate passed to ``librosa.load``.
    n_mfcc : number of MFCC coefficients to compute.

    Returns
    -------
    1-D array of length ``2 * n_mfcc``: the per-coefficient means followed
    by the per-coefficient standard deviations. If anything fails while
    loading or analysing the file, the error is printed and an all-zero
    vector of the same length is returned instead of raising.
    """
    try:
        signal, sample_rate = librosa.load(file_path, sr=sr)
        coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_mfcc)
        # Collapse the frame axis (axis 1) to one mean and one std per coefficient
        return np.concatenate([coeffs.mean(axis=1), coeffs.std(axis=1)])
    except Exception as e:
        # Best-effort: report the failure and return a zero placeholder vector
        print(f"Error processing {file_path}: {e}")
        return np.zeros(n_mfcc * 2)


## Extract Features from Training Data

In [12]:
print("Extracting features from training audio files...")

# Walk the filename/label columns in step; one feature vector per audio file
train_features, train_labels = [], []
train_pairs = zip(train_df['filename'], train_df['label'])
for file_name, label in tqdm(train_pairs, total=len(train_df)):
    audio_path = os.path.join(TRAIN_AUDIO_DIR, file_name)
    train_features.append(extract_features(audio_path))
    train_labels.append(label)

# Stack into the (n_samples, n_features) matrix and the target vector
X_train = np.array(train_features)
y_train = np.array(train_labels)

print("Extracted features shape (train):", X_train.shape)


Extracting features from training audio files...


100%|██████████| 444/444 [04:33<00:00,  1.62it/s]

Extracted features shape (train): (444, 26)





## Extract Features from Test Data

In [15]:
print("Extracting features from test audio files...")

# Keep the filenames in the same order as the feature rows; they are needed
# later to label the predictions.
test_files = list(test_df['filename'])
X_test = np.array([
    extract_features(os.path.join(TEST_AUDIO_DIR, file_name))
    for file_name in tqdm(test_files, total=len(test_files))
])
print("Extracted features shape (test):", X_test.shape)


Extracting features from test audio files...


100%|██████████| 195/195 [02:01<00:00,  1.61it/s]

Extracted features shape (test): (195, 26)





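## Define Custom Pearson Correlation Scorer

Cross-validation is scored with the Pearson correlation between predicted and true labels, wrapped as a scikit-learn scorer.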

In [16]:
def pearson_corr(y_true, y_pred):
    """
    Pearson correlation coefficient between targets and predictions.

    Parameters
    ----------
    y_true : array-like of true target values.
    y_pred : array-like of predicted values.

    Returns
    -------
    The correlation coefficient from ``scipy.stats.pearsonr``. When either
    input is constant the coefficient is undefined (``pearsonr`` yields NaN);
    0.0 is returned in that case so that one degenerate fold does not turn
    the mean cross-validation score into NaN.

    Any other invalid input (mismatched lengths, fewer than two samples) is
    passed straight to ``pearsonr`` and raises as before.
    """
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)

    # Only short-circuit inputs pearsonr would accept; anything malformed is
    # left for pearsonr to reject.
    comparable = (
        true_arr.ndim == 1
        and true_arr.shape == pred_arr.shape
        and true_arr.size >= 2
    )
    # Peak-to-peak of zero means every value is identical (zero variance).
    # NaN inputs give a NaN peak-to-peak, so they are not masked here.
    if comparable and (np.ptp(true_arr) == 0 or np.ptp(pred_arr) == 0):
        return 0.0

    return pearsonr(true_arr, pred_arr)[0]

# Expose the metric to scikit-learn model selection; a higher correlation is better.
pearson_scorer = make_scorer(score_func=pearson_corr, greater_is_better=True)


## Train Model with Cross-Validation

In [17]:
# Baseline regressor: a 100-tree random forest with a fixed seed
# (other estimators can be swapped in here)
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Shuffled 5-fold cross-validation, scored with the Pearson scorer
kf = KFold(shuffle=True, n_splits=5, random_state=42)
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring=pearson_scorer,
    cv=kf,
)
print("Cross-Validation Pearson Correlation scores:", cv_scores)
print("Mean CV Pearson Correlation:", cv_scores.mean())


Cross-Validation Pearson Correlation scores: [0.50930532 0.58693964 0.70354799 0.59908329 0.59337028]
Mean CV Pearson Correlation: 0.5984493048725888


## Train on Full Data and Make Predictions

In [18]:
# Fit on every training sample, then score the held-out test features.
# fit() returns the fitted estimator itself, so the two steps can be chained
# and `model` is left fitted for later cells.
predictions = model.fit(X_train, y_train).predict(X_test)


## Prepare Submission File

In [19]:
submission_csv = 'submission.csv'

# One row per test file, ordered by filename (adjust ordering if needed)
submission = (
    pd.DataFrame({'filename': test_files, 'label': predictions})
    .sort_values('filename')
)

submission.to_csv(submission_csv, index=False)
print(f"Submission file saved as {submission_csv}")


Submission file saved as submission.csv
