# Cell 1: Setup and Installation

In [None]:
!pip install librosa scikit-learn matplotlib numpy pandas noisereduce tensorflow shap


# Cell 1.5: Unzip

In [None]:
# Cell 1.5: Unzip the uploaded audio archive and check the expected layout
import os
import zipfile

# Path to your uploaded ZIP (replace 'your-data.zip' with your actual file name from Inputs)
zip_path = '/kaggle/input/your-data.zip'
data_dir = '/kaggle/working/violin-emotion-analysis/data'
csv_path = '/kaggle/working/violin-emotion-analysis/emotion_labels.csv'

# Unzip the data with the standard library instead of a shell call:
# paths containing spaces work, and re-running the cell simply overwrites
# the previously extracted files instead of waiting on an overwrite prompt.
if os.path.isfile(zip_path):
    os.makedirs(data_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(data_dir)
else:
    print(f"WARNING: ZIP archive not found: {zip_path}")

# Check folder structure
print("Data directory structure after unzipping:")
for root, dirs, files in os.walk(data_dir):
    dirs.sort()  # in-place sort makes os.walk descend in a stable order
    rel_root = os.path.relpath(root, data_dir)
    # Depth below data_dir: 0 for data_dir itself, 1 for its children, ...
    level = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for f in sorted(files):
        print(f"{subindent}{f}")

# Check if CSV exists
if not os.path.exists(csv_path):
    print("\nWARNING: 'emotion_labels.csv' not found!")
    print("Please upload your CSV to match filenames and soft emotion labels.")
else:
    print(f"\nCSV file found: {csv_path}")


# Cell 2: Imports and Updated Utility Functions

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Attention, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import noisereduce as nr
import shap
import joblib
import warnings
warnings.filterwarnings('ignore')

# Make sure the working folders for the audio data and the saved models exist
for _working_dir in (
    '/kaggle/working/violin-emotion-analysis/data',
    '/kaggle/working/violin-emotion-analysis/models',
):
    os.makedirs(_working_dir, exist_ok=True)

def extract_temporal_features(file_path, sr=44100, frame_length=2048, hop_length=512, window_size=3.0, hop_time=1.5):
    """
    Extract time-based audio features from overlapping windows (captures emotion dynamics).

    The recording is loaded, noise-reduced, and cut into windows of
    ``window_size`` seconds whose starts are ``hop_time`` seconds apart.
    Each window is summarised by one vector made of, in order: per-coefficient
    MFCC mean and standard deviation, per-bin chroma mean, per-band spectral
    contrast mean, and the mean RMS, zero-crossing rate, spectral centroid and
    spectral bandwidth.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        sr: Sample rate the audio is resampled to on load.
        frame_length: Analysis frame size in samples, forwarded to every
            feature extractor (as ``n_fft`` for the spectral ones).
        hop_length: Hop between analysis frames in samples, forwarded to every
            feature extractor.
        window_size: Window length in seconds.
        hop_time: Distance between consecutive window starts in seconds.

    Returns:
        np.ndarray of shape (n_windows, n_features). A clip shorter than one
        window is treated as a single window; an empty clip yields an array
        with zero rows.
    """
    n_mfcc = 13
    # 13 MFCC means + 13 MFCC stds + 12 chroma bins + 7 contrast bands + 4 scalars
    # (12 and 7 are the output sizes of the librosa defaults used below).
    n_features = 2 * n_mfcc + 12 + 7 + 4

    y, sr = librosa.load(file_path, sr=sr)
    if len(y) == 0:
        return np.empty((0, n_features))
    y = nr.reduce_noise(y=y, sr=sr)

    step = max(1, int(hop_time * sr))
    window = max(1, int(window_size * sr))

    if len(y) < window:
        # Too short for a full window: summarise the whole clip once instead
        # of returning nothing.
        starts = [0]
    else:
        # "+ 1" keeps the final window that ends exactly on the last sample.
        starts = range(0, len(y) - window + 1, step)

    feature_sequences = []
    for start in starts:
        segment = y[start:start + window]
        mfccs = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=n_mfcc, n_fft=frame_length, hop_length=hop_length)
        chroma = librosa.feature.chroma_stft(y=segment, sr=sr, n_fft=frame_length, hop_length=hop_length)
        spectral_contrast = librosa.feature.spectral_contrast(y=segment, sr=sr, n_fft=frame_length, hop_length=hop_length)
        rms = librosa.feature.rms(y=segment, frame_length=frame_length, hop_length=hop_length)
        zcr = librosa.feature.zero_crossing_rate(y=segment, frame_length=frame_length, hop_length=hop_length)
        centroid = librosa.feature.spectral_centroid(y=segment, sr=sr, n_fft=frame_length, hop_length=hop_length)
        bandwidth = librosa.feature.spectral_bandwidth(y=segment, sr=sr, n_fft=frame_length, hop_length=hop_length)

        # Combine flattened statistics
        feature_vec = np.hstack([
            np.mean(mfccs, axis=1),
            np.std(mfccs, axis=1),
            np.mean(chroma, axis=1),
            np.mean(spectral_contrast, axis=1),
            np.mean(rms),
            np.mean(zcr),
            np.mean(centroid),
            np.mean(bandwidth)
        ])
        feature_sequences.append(feature_vec)

    return np.array(feature_sequences)

def create_temporal_dataset(data_dir, emotion_csv_path):
    """
    Create dataset with temporal features and soft emotion labels.

    Every row of the CSV names an audio file (column ``filename``, relative to
    ``data_dir``). All columns from the fourth one onwards are read as that
    file's soft emotion labels and converted to float.
    NOTE(review): an earlier version of this docstring listed only two meta
    columns (filename, emotion) while the code skips three - confirm the CSV
    layout.

    Rows whose audio file is missing, or whose audio yields no feature window,
    are skipped and reported with a printed summary.

    Args:
        data_dir: Directory that contains the audio files.
        emotion_csv_path: Path of the annotation CSV.

    Returns:
        (X, y_soft): ``X`` has shape (n_files, max_windows, n_features), with
        shorter sequences zero-padded at the end; ``y_soft`` has shape
        (n_files, n_label_columns).

    Raises:
        ValueError: If no row of the CSV produced any features.
    """
    annotations = pd.read_csv(emotion_csv_path)
    X, y_soft = [], []
    skipped = []

    for _, row in annotations.iterrows():
        file_path = os.path.join(data_dir, row['filename'])
        if not os.path.exists(file_path):
            skipped.append((row['filename'], 'file not found'))
            continue

        features = extract_temporal_features(file_path)
        if features.ndim != 2 or features.shape[0] == 0:
            # An empty sequence cannot be padded into the batch below.
            skipped.append((row['filename'], 'no feature windows extracted'))
            continue

        X.append(features)
        # .iloc makes the positional intent explicit: skip the first 3 meta columns
        y_soft.append(row.iloc[3:].values.astype(float))

    if skipped:
        print(f"Skipped {len(skipped)} of {len(annotations)} annotated files:")
        for name, reason in skipped:
            print(f"  {name}: {reason}")

    if not X:
        raise ValueError(
            f"No usable audio files listed in '{emotion_csv_path}' were found under '{data_dir}'."
        )

    # Pad sequences for LSTM
    max_len = max(x.shape[0] for x in X)
    num_features = X[0].shape[1]
    X_padded = np.zeros((len(X), max_len, num_features))
    for i, seq in enumerate(X):
        X_padded[i, :seq.shape[0], :] = seq

    return X_padded, np.array(y_soft)


# Cell 3: Data Preparation

In [None]:
data_dir = '/kaggle/working/violin-emotion-analysis/data'
emotion_csv_path = '/kaggle/working/violin-emotion-analysis/emotion_labels.csv'  # <-- upload your CSV here

# Build the dataset only when the label CSV is present; otherwise tell the
# user which file is missing.
if os.path.exists(emotion_csv_path):
    X, y_soft = create_temporal_dataset(data_dir, emotion_csv_path)
    sample_count = X.shape[0]
    feature_count = X.shape[2]
    emotion_count = y_soft.shape[1]
    print(f"Loaded {sample_count} samples with {feature_count} features each and {emotion_count} soft emotion dimensions.")
else:
    print("Please upload 'emotion_labels.csv' with soft emotion probabilities.")


# Cell 4: Model Training (LSTM + RandomForest Fusion)

In [None]:
from tensorflow.keras.models import Model

# Split data
X_train, X_test, y_train_soft, y_test_soft = train_test_split(
    X, y_soft, test_size=0.2, random_state=42
)

# Normalize feature values.
# The scaler is fitted ONCE on all real (non-padding) training frames, so the
# statistics describe the whole training set and the saved scaler is reusable
# at inference time. Refitting inside a per-sample loop would leave it fitted
# on the last training clip only.
# Padding frames are recognised as all-zero rows and are left at zero.
train_frame_mask = np.any(X_train != 0, axis=-1)  # shape: (n_samples, n_windows)
test_frame_mask = np.any(X_test != 0, axis=-1)

scaler = StandardScaler()
X_train[train_frame_mask] = scaler.fit_transform(X_train[train_frame_mask])
if test_frame_mask.any():
    X_test[test_frame_mask] = scaler.transform(X_test[test_frame_mask])

joblib.dump(scaler, '/kaggle/working/violin-emotion-analysis/models/scaler.pkl')

# Define LSTM model with Attention layer
input_layer = Input(shape=(X_train.shape[1], X_train.shape[2]))
x = LSTM(128, return_sequences=True)(input_layer)
x = Attention()([x, x])  # self-attention: query and value are the same sequence
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = LSTM(64)(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
output_layer = Dense(y_train_soft.shape[1], activation='softmax')(x)
lstm_model = Model(inputs=input_layer, outputs=output_layer)

# NOTE(review): softmax + categorical cross-entropy assumes every soft-label
# row sums to 1 - confirm against the CSV.
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Callbacks
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6)

print("Training LSTM model with temporal and soft label support...")
# Early stopping and LR scheduling monitor a slice of the TRAINING data.
# Using the test set here would tune the model on the same samples that the
# accuracy below is reported on.
history = lstm_model.fit(
    X_train, y_train_soft,
    validation_split=0.2,
    epochs=80, batch_size=16,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Train RandomForest on averaged features (feature-based model)
# NOTE(review): the mean runs over the padded time axis, so zero padding
# dilutes the averages of shorter clips; inference must average the same way.
X_train_flat = np.mean(X_train, axis=1)
X_test_flat = np.mean(X_test, axis=1)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_flat, np.argmax(y_train_soft, axis=1))

# Combine (fusion) predictions
lstm_probs = lstm_model.predict(X_test)
# predict_proba has one column per entry of rf_model.classes_, i.e. only the
# classes seen as an argmax label during training. Scatter by class index so
# each probability lands in the matching emotion column.
rf_probs = np.zeros_like(lstm_probs)
rf_probs[:, rf_model.classes_] = rf_model.predict_proba(X_test_flat)

hybrid_probs = 0.6 * lstm_probs + 0.4 * rf_probs
hybrid_pred = np.argmax(hybrid_probs, axis=1)
true_labels = np.argmax(y_test_soft, axis=1)

accuracy = accuracy_score(true_labels, hybrid_pred)
print(f"Hybrid Model Accuracy: {accuracy:.4f}")

lstm_model.save('/kaggle/working/violin-emotion-analysis/models/lstm_hybrid_model.h5')
joblib.dump(rf_model, '/kaggle/working/violin-emotion-analysis/models/rf_model.pkl')
print("Models saved successfully!")


# Cell 5: Explainability and Visualization

In [None]:
# Explain the Random Forest half of the hybrid with SHAP.
# The first 100 averaged training vectors are handed to the explainer together
# with the model; the first 50 averaged test vectors are the rows explained.
shap_background = X_train_flat[:100]
shap_samples = X_test_flat[:50]

explainer = shap.Explainer(rf_model, shap_background)
shap_values = explainer(shap_samples)

print("Generating SHAP summary plot (Random Forest interpretability)...")
shap.summary_plot(shap_values, shap_samples, show=True)


# Cell 6: Real-Time Prediction Simulation on New Audio Files

In [None]:
new_audio_dir = '/kaggle/working/violin-emotion-analysis/new_audio'  # put new audios here
emotion_labels = pd.read_csv(emotion_csv_path).columns[3:].tolist()  # same soft labels as training

# Create the drop folder if needed so the listing below cannot fail on a fresh run.
os.makedirs(new_audio_dir, exist_ok=True)
new_files = sorted(f for f in os.listdir(new_audio_dir) if f.lower().endswith('.wav'))
print(f"Found {len(new_files)} new audio files for prediction.")

# Input shape the LSTM was built with: (windows per clip, features per window)
seq_len, feat_dim = X_train.shape[1], X_train.shape[2]

for f in new_files:
    path = os.path.join(new_audio_dir, f)

    # Extract features
    features = extract_temporal_features(path)
    if features.ndim != 2 or features.shape[0] == 0:
        print(f"Skipping {f}: no feature windows could be extracted.")
        continue

    # Apply the scaler fitted during training, and keep at most seq_len
    # windows so clips longer than any training clip still fit the model input.
    features = scaler.transform(features[:seq_len])

    # Pad sequence to match LSTM input
    padded = np.zeros((1, seq_len, feat_dim))
    padded[0, :features.shape[0], :] = features

    # LSTM prediction (names are per-file so the test-set arrays from the
    # training cell are not overwritten)
    file_lstm_probs = lstm_model.predict(padded, verbose=0)

    # RF prediction: predict_proba columns follow rf_model.classes_, so place
    # them by class index rather than by position.
    file_rf_probs = np.zeros_like(file_lstm_probs)
    file_rf_probs[:, rf_model.classes_] = rf_model.predict_proba(np.mean(padded, axis=1))

    # Hybrid fusion
    file_hybrid_probs = 0.6 * file_lstm_probs + 0.4 * file_rf_probs

    # One probability per emotion for this file. No smoothing is applied:
    # a moving average along this axis would blend neighbouring emotion
    # columns (whose order is arbitrary) and the result would no longer sum to 1.
    emotion_probs = file_hybrid_probs[0]

    # Plot results
    plt.figure(figsize=(8, 5))
    plt.bar(emotion_labels, emotion_probs)
    plt.title(f"Predicted Emotions for {f}")
    plt.ylabel("Probability")
    plt.ylim(0, 1)
    plt.show()

    # Print prediction
    pred_emotion = emotion_labels[np.argmax(emotion_probs)]
    print(f"Audio: {f} → Predicted Emotion: {pred_emotion}")
    print("Probability per emotion:")
    for label, prob in zip(emotion_labels, emotion_probs):
        print(f"  {label}: {prob:.3f}")
    print("\n" + "-"*40 + "\n")
