In [2]:
import os
import numpy as np
import librosa
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score

In [3]:
# === 1. Feature Extraction with PCEN ===
def extract_features_with_pcen(file_path, sr=16000, duration=1, n_mels=64):
    y, _ = librosa.load(file_path, sr=sr)
    y = librosa.util.fix_length(data=y, size=int(sr * duration))
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    pcen = librosa.pcen(mel_spec, sr=sr, gain=0.8, bias=10, power=0.25, time_constant=0.06, eps=1e-6)
    return np.mean(pcen, axis=1)  # Temporal mean pooling

In [4]:
# === 2. Load Dataset ===
def load_dataset(folder_path):
    X, y = [], []
    for fname in os.listdir(folder_path):
        if fname.endswith(".wav"):
            label = fname.split("_")[0]  # e.g., gunshot_001.wav -> gunshot
            file_path = os.path.join(folder_path, fname)
            features = extract_features_with_pcen(file_path)
            X.append(features)
            y.append(label)
    return np.array(X), np.array(y)

In [5]:
# === 3. Dataset Paths ===
train_path = '/kaggle/input/stage2/data/train'
test_path = '/kaggle/input/stage2/data/test'

In [6]:
# === 4. Load Data ===
X_train, y_train = load_dataset(train_path)
X_test, y_test = load_dataset(test_path)

In [7]:
# === 5. Label Encoding ===
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [8]:
from sklearn.model_selection import GridSearchCV

# === 6.1 Define Parameter Grid ===
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'learning_rate': [0.01, 0.1, 0.3],
    'num_leaves': [20, 31, 50],
    'min_child_samples': [5, 10, 20]
}

# === 6.2 Initialize Base Model ===
lgbm = lgb.LGBMClassifier()

In [9]:
# === 6.3 Initialize GridSearchCV ===
grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='f1_weighted',  # or 'recall_weighted', etc.
    cv=3,  # 3-fold cross-validation
    verbose=0,
    n_jobs=-1  # Use all cores
)

# === 6.4 Fit Model with Grid Search ===
grid_search.fit(X_train, y_train_encoded)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 3752, number of used features: 64
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16320
[LightGBM] [Info] Number of data points in the train set: 3752, number of used features: 64
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Auto-choosin

In [10]:
# === 6.5 Use the Best Estimator ===
best_model = grid_search.best_estimator_

In [20]:
# === 7. Evaluation with Best Model ===
y_pred = best_model.predict(X_test)

print("\n=== Best Parameters ===")
print(grid_search.best_params_)

print("\n=== Performance Metrics ===")
print(classification_report(y_test_encoded, y_pred, target_names=le.classes_))
print(f"F1-Score (weighted): {f1_score(y_test_encoded, y_pred, average='weighted'):.4f}")


=== Best Parameters ===
{'learning_rate': 0.1, 'max_depth': 10, 'min_child_samples': 20, 'n_estimators': 50, 'num_leaves': 20}

=== Performance Metrics ===
                 precision    recall  f1-score   support

backgroundnoise       0.52      0.84      0.64       402
 brokenbranches       0.78      0.70      0.73       402
      footsteps       0.85      0.72      0.78       402
        gunshot       0.90      0.56      0.69       402

       accuracy                           0.70      1608
      macro avg       0.76      0.70      0.71      1608
   weighted avg       0.76      0.70      0.71      1608

F1-Score (weighted): 0.7108


In [12]:
pip install codecarbon --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.7/277.7 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-genai 1.21.1 requires httpx<1.0.0,>=0.28.1, but you have httpx 0.27.2 which is incompatible.
firebase-admin 6.9.0 requires httpx[http2]==0.28.1, but you have httpx 0.27.2 which is incompatible.
langchain-core 0.3.66 requires packaging<25,>=23.2, but you have packaging 25.0 which is inco

In [31]:
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import time
from codecarbon import EmissionsTracker

# Predictions
# === Track Energy + Time ===
tracker = EmissionsTracker(project_name="LGBM")
tracker.start()
start_time = time.time()

y_pred = best_model.predict(X_test)

train_time = time.time() - start_time
emissions = tracker.stop()

print(f"\n🔧 Training Time: {train_time*1000:.2f} m.seconds")
print(f"🔋 Energy Emissions: {emissions*1000000:.6f} mg CO₂\n")




[codecarbon INFO @ 17:47:18] [setup] RAM Tracking...
[codecarbon INFO @ 17:47:18] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 17:47:19] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.20GHz
[codecarbon INFO @ 17:47:19] [setup] GPU Tracking...
[codecarbon INFO @ 17:47:19] No GPU found.
[codecarbon INFO @ 17:47:19] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: Unspecified
            
[codecarbon INFO @ 17:47:19] >>> Tracker's metadata:
[codecarbon INFO @ 17:47:19]   Platform system: Linux-6.6.56+-x86_64-with-glibc2.35
[codecarbon INFO @ 17:47:19]   Python version: 3.11.13
[codecarbon INFO @ 17:47:19]   CodeCarbon version: 3.0.4
[codecarbon INFO @ 17:47:19]   Available RAM : 31.350 GB
[codecarbon INFO @ 17:47:19]


🔧 Training Time: 17.53 m.seconds
🔋 Energy Emissions: 0.221489 mg CO₂

