In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)




# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q torch torchaudio ffmpeg-python

In [None]:
!pip install -q \
    language-tool-python==2.7.1 \
    protobuf==3.20.3 \
    transformers==4.38.2 \
    sentencepiece

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from transformers import pipeline
import language_tool_python

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr


In [None]:
# Root of the competition dataset under the Kaggle input directory.
DATA_PATH = "/kaggle/input/shl-intern-hiring-assessment-2025"

# Training split: CSV manifest and audio directory.
TRAIN_CSV = DATA_PATH + "/dataset/csvs/train.csv"
TRAIN_AUDIO_DIR = DATA_PATH + "/dataset/audios/train"

# Test split: CSV manifest and audio directory.
TEST_CSV = DATA_PATH + "/dataset/csvs/test.csv"
TEST_AUDIO_DIR = DATA_PATH + "/dataset/audios/test"


In [None]:
# Read the train and test CSV manifests into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Preview the first rows of the training manifest.
train_df.head()


In [None]:
# Histogram of the training "label" column, 10 bins.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(train_df["label"], bins=10)
ax.set_xlabel("Grammar Score")
ax.set_ylabel("Count")
ax.set_title("Distribution of Grammar Scores")
plt.show()


In [None]:
# Speech-to-text pipeline backed by the "openai/whisper-small" checkpoint.
# device=-1 is the transformers pipeline convention for running on CPU.
asr = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device=-1,
)

def transcribe_audio(filename, split="train"):
    """Transcribe one audio clip with the module-level ``asr`` pipeline.

    Args:
        filename: Clip name; ".wav" is appended when it is not already the suffix.
        split: "train" reads from TRAIN_AUDIO_DIR; any other value reads from
            TEST_AUDIO_DIR.

    Returns:
        The "text" entry of the pipeline result for the clip.
    """
    wav_name = filename if filename.endswith(".wav") else filename + ".wav"
    base_dir = TEST_AUDIO_DIR if split != "train" else TRAIN_AUDIO_DIR
    return asr(os.path.join(base_dir, wav_name))["text"]


In [None]:
def extract_grammar_features(text, tool=None):
    """Compute simple grammar-quality features for a transcript.

    Args:
        text: Transcript to analyse. ``None``, non-string values (e.g. NaN)
            and empty / whitespace-only strings are treated as "no transcript".
        tool: Optional checker exposing ``check(text)`` that returns a sized
            collection of matches. When ``None`` no grammar check is run and
            ``num_errors`` is always 0.

    Returns:
        dict with keys ``num_errors`` (int), ``word_count`` (int, number of
        whitespace-separated tokens) and ``error_rate`` (float,
        ``num_errors / word_count``).
    """
    import warnings  # function-scope import keeps this cell self-contained

    # Guard against missing or non-string transcripts before calling str methods.
    if not isinstance(text, str) or not text.strip():
        return {
            "num_errors": 0,
            "word_count": 0,
            "error_rate": 0.0
        }

    errors = 0
    if tool is not None:
        try:
            errors = len(tool.check(text))
        except Exception as exc:
            # Best-effort fallback: keep going with 0 errors, but surface the
            # failure so it is not mistaken for a grammatically clean text.
            warnings.warn(
                f"Grammar check failed; counting 0 errors for this text: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )
            errors = 0

    words = len(text.split())

    return {
        "num_errors": errors,
        "word_count": words,
        # max(..., 1) avoids division by zero.
        "error_rate": errors / max(words, 1)
    }



In [None]:

from tqdm import tqdm

# Set to False to force re-extraction even when a cache file exists.
USE_CACHED_FEATURES = True

# Relative path: resolved against the notebook's current working directory.
CACHE_PATH = "cached_train_features.csv"

if USE_CACHED_FEATURES and os.path.exists(CACHE_PATH):
    print("Loading cached grammar features...")
    feature_df = pd.read_csv(CACHE_PATH)

    # Cheap staleness check: the cache stores one row per training example.
    if len(feature_df) != len(train_df):
        print(
            f"Warning: cache has {len(feature_df)} rows but train_df has "
            f"{len(train_df)}; set USE_CACHED_FEATURES = False to rebuild."
        )

else:
    print("Extracting grammar features (first-time run)...")

    # language_tool_python is imported in the notebook's import cell.
    tool = language_tool_python.LanguageToolPublicAPI('en-US')

    features = []
    asr_failures = 0

    # Iterate the two needed columns directly instead of building a Series per row.
    for filename, label in tqdm(
        zip(train_df["filename"], train_df["label"]), total=len(train_df)
    ):
        try:
            text = transcribe_audio(filename, split="train")
        except Exception as e:
            # Best effort: continue with an empty transcript, but report the
            # failure (as the test-set loop does) since the resulting
            # zero-features are written to the cache.
            asr_failures += 1
            tqdm.write(f"ASR failed for train file {filename}: {e}")
            text = ""

        feats = extract_grammar_features(text, tool)
        feats["label"] = label
        features.append(feats)

    feature_df = pd.DataFrame(features)
    feature_df.to_csv(CACHE_PATH, index=False)
    print("Grammar features cached successfully.")
    if asr_failures:
        print(f"Warning: {asr_failures} file(s) failed ASR and were cached with empty-text features.")


In [None]:
# Target is the "label" column; features are every other column.
y = feature_df["label"]
X = feature_df.drop(columns=["label"])

# Hold out 20% of the rows for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Random-forest regressor: 300 trees, fixed seed.
model = RandomForestRegressor(random_state=42, n_estimators=300)

# Fit on the training split; as the cell's last expression, the return value
# of fit() is what the cell displays.
model.fit(X=X_train, y=y_train)


In [None]:
# RMSE on the same rows the model was fitted on.
train_preds = model.predict(X_train)
train_mse = mean_squared_error(y_train, train_preds)
train_rmse = np.sqrt(train_mse)

print("Training RMSE:", train_rmse)


In [None]:
# Score the held-out validation split.
val_preds = model.predict(X_val)

val_mse = mean_squared_error(y_val, val_preds)
val_rmse = np.sqrt(val_mse)
# pearsonr returns (statistic, p-value); only the statistic is kept.
pearson_corr = pearsonr(y_val, val_preds)[0]

for metric_name, metric_value in (
    ("Validation RMSE:", val_rmse),
    ("Pearson Correlation:", pearson_corr),
):
    print(metric_name, metric_value)



In [None]:
# Scatter of validation targets (x) against model predictions (y).
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_val, val_preds)
ax.set_xlabel("True Score")
ax.set_ylabel("Predicted Score")
ax.set_title("True vs Predicted Grammar Scores")
plt.show()


In [None]:
# Test features must be built the same way as the training features.
# The training loop passes a LanguageTool checker to extract_grammar_features;
# without one, num_errors and error_rate are always 0 and the model would be
# scoring inputs unlike anything it was fitted on.
# A checker is created here because `tool` from the caching cell does not
# exist when the features were loaded from the cache.
grammar_tool = language_tool_python.LanguageToolPublicAPI('en-US')

test_features = []

for filename in test_df["filename"]:
    try:
        text = transcribe_audio(filename, split="test")
    except Exception as e:
        # Best effort: fall back to an empty transcript but report the failure.
        print(f"ASR failed for test file {filename}: {e}")
        text = ""

    feats = extract_grammar_features(text, grammar_tool)
    test_features.append(feats)

test_feature_df = pd.DataFrame(test_features)

test_predictions = model.predict(test_feature_df)

# One row per test file: the filename as given in test.csv and its predicted label.
submission = pd.DataFrame({
    "filename": test_df["filename"],
    "label": test_predictions
})

submission.to_csv("submission.csv", index=False)
submission.head()


In [None]:
# Shapes first, then column indexes; train before test each time.
for frame in (train_df, test_df):
    print(frame.shape)
for frame in (train_df, test_df):
    print(frame.columns)


In [None]:
import os

# Spot-check that one training clip (row 10) exists on disk.
sample = train_df["filename"].iloc[10]
print(sample)

# Append ".wav" only when it is missing, the same rule transcribe_audio uses,
# so this check looks at the path the ASR step will actually open.
sample_file = sample if sample.endswith(".wav") else sample + ".wav"
print(os.path.exists(os.path.join(TRAIN_AUDIO_DIR, sample_file)))


In [None]:
# Smoke-test ASR on up to the first 10 training files.
# head(10) avoids an IndexError when the frame has fewer than 10 rows.
for fname in train_df["filename"].head(10):
    try:
        text = transcribe_audio(fname, split="train")
        # Show only the first 80 characters of each transcript.
        print(f"{fname}: SUCCESS →", text[:80])
    except Exception as e:
        print(f"{fname}: ASR FAILED → {e}")


In [None]:
# Sanity-check the grammar features on a sentence with obvious mistakes.
# A checker must be passed: with tool=None the function skips the grammar
# check and always reports 0 errors, which would make this check meaningless.
test_text = "I has a pen. She go to school yesterday."
check_tool = language_tool_python.LanguageToolPublicAPI('en-US')
print(extract_grammar_features(test_text, check_tool))


In [None]:
# Only a cell's last expression is rendered automatically, so the null counts
# are displayed explicitly; previously they were computed and discarded.
display(feature_df.isna().sum())
feature_df.describe()


In [None]:
# Recap of the two RMSE values computed in earlier cells.
for rmse_name, rmse_value in (("Train RMSE:", train_rmse), ("Val RMSE:", val_rmse)):
    print(rmse_name, rmse_value)


In [None]:
# (lowest, highest) predicted label in the submission.
predicted_labels = submission["label"]
(predicted_labels.min(), predicted_labels.max())
