In [None]:
# Install necessary packages (add others if your script requires them)
!pip install joblib -q # -q makes it quieter

In [None]:
# Mount Google Drive at /content/drive; the input CSVs and the results
# directory configured further down are addressed through this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Add
from tensorflow.keras.layers import Conv1D, SpatialDropout1D, GlobalAveragePooling1D, Activation
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.utils import class_weight
import seaborn as sns
import joblib
import os
import time

In [None]:
# --- Configuration ---
# Everything a reader might tune lives in this cell: input/output locations,
# windowing, model hyperparameters and training parameters. Pure constants come
# first; side effects (directory creation, GPU setup, printing) follow.
from pathlib import Path # Ensure Path is imported
import tensorflow as tf

# *** Define the two input files using Colab paths ***
# Assuming you created Colab_HAR_Project/data in your Drive's root
drive_base_path = Path('/content/drive/MyDrive/Colab_HAR_Project/data')
INPUT_CSV_1 = drive_base_path / 'resampled_normalized_phone_data.csv' # Source for B, D, E
INPUT_CSV_2 = drive_base_path / 'combined_collected_data.csv'      # Source for A, C

ACTIVITIES_FROM_FILE1 = ['B', 'D', 'E']
ACTIVITIES_FROM_FILE2 = ['A', 'C']
ALL_ACTIVITIES_TO_KEEP = sorted(ACTIVITIES_FROM_FILE1 + ACTIVITIES_FROM_FILE2)

# *** Define the output directory within Google Drive for persistence ***
output_drive_path = Path('/content/drive/MyDrive/Colab_HAR_Project/results')
OUTPUT_DIR = output_drive_path / 'TCN_Results_Merged_Sources_Colab' # Use a distinct name
# Prefix for every artefact written to OUTPUT_DIR, e.g. "tcn_ABCDE_"
FILE_PREFIX = f"tcn_{''.join(ALL_ACTIVITIES_TO_KEEP)}_"

# Windowing Parameters (in samples)
WINDOW_SIZE = 60
STRIDE = 15

# Model Hyperparameters
KERNEL_SIZE = 7
NUM_FILTERS = 64
NUM_TCN_BLOCKS = 5
# One residual block per dilation rate: 1, 2, 4, ... 2**(NUM_TCN_BLOCKS - 1)
DILATION_RATES = [2**i for i in range(NUM_TCN_BLOCKS)]
SPATIAL_DROPOUT_RATE = 0.15
FINAL_DROPOUT_RATE = 0.3
L2_REG = 1e-4

# Training Parameters
BATCH_SIZE = 64
EPOCHS = 100
VALIDATION_SPLIT = 0.2
EARLY_STOPPING_PATIENCE = 15
LR_PATIENCE = 5
LR_FACTOR = 0.5

# --- Create output directory (in Google Drive via the mount) ---
# Done exactly once; parents=True also creates the intermediate 'results' folder.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# --- GPU Check (Optional but recommended) ---
print("--- GPU Check ---")
print(f"TensorFlow Version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"Num GPUs Available: {len(gpu_devices)}")
if gpu_devices:
    print("GPU(s) found:")
    for gpu in gpu_devices:
        print(f"  {gpu}")
        try: # Attempt memory growth setting
            tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as e:
            # Reported rather than raised so the rest of the cell still runs.
            print(f"Could not set memory growth for {gpu}: {e}")
    print("Attempted to enable memory growth for GPU(s).")
else:
    print("!!! No GPU found by TensorFlow. Check Runtime settings. !!!")
print("-----------------\n")

# --- Echo the effective configuration ---
print("--- Configuration ---")
print(f"Input CSV 1 (Activities {ACTIVITIES_FROM_FILE1}): {INPUT_CSV_1}")
print(f"Input CSV 2 (Activities {ACTIVITIES_FROM_FILE2}): {INPUT_CSV_2}")
print(f"All Activities Kept: {ALL_ACTIVITIES_TO_KEEP}")
print(f"Output Directory: {OUTPUT_DIR}")
print(f"Output File Prefix: {FILE_PREFIX}")
print(f"Window Size: {WINDOW_SIZE} samples")
print(f"Stride: {STRIDE} samples")
print(f"TCN Kernel Size: {KERNEL_SIZE}")

print("---------------------\n")


# --- Data Preparation Function (Windowing) ---
# Operates on the final combined DataFrame, after both sources have been filtered and merged.
def create_subject_activity_windows(df, window_size, stride):
    """
    Creates sliding windows ensuring each window belongs to only one subject/activity pair.

    Rows are grouped by ('subject', 'activity'); within each group they are
    ordered by 'timestamp' when that column exists (stable sort, so rows with
    equal timestamps keep their original relative order), otherwise the
    existing row order is used. Windows of `window_size` consecutive rows are
    then cut every `stride` rows. Groups with fewer than `window_size` rows
    produce no windows; they are skipped and reported.

    Args:
        df: DataFrame holding 'subject', 'activity' and the six sensor columns
            x_accel, y_accel, z_accel, x_gyro, y_gyro, z_gyro. 'timestamp' is optional.
        window_size: Number of rows per window; must be > 0.
        stride: Number of rows between consecutive window starts; must be > 0.

    Returns:
        Tuple (X, y, subs):
            X    - array of shape (num_windows, window_size, 6) with the sensor values,
            y    - array of shape (num_windows,) with each window's activity label,
            subs - array of shape (num_windows,) with each window's subject id.

    Raises:
        ValueError: If window_size or stride is not positive, if required
            columns are missing, or if no window could be created at all.
    """
    # Reject parameters that would otherwise fail obscurely inside range()
    # (stride == 0), yield nothing (stride < 0) or yield empty windows
    # (window_size == 0).
    if window_size <= 0:
        raise ValueError(f"window_size must be a positive integer, got {window_size}.")
    if stride <= 0:
        raise ValueError(f"stride must be a positive integer, got {stride}.")

    required_cols = ['x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro']
    if not all(col in df.columns for col in required_cols):
        raise ValueError(f"Input DataFrame missing required sensor columns: {required_cols}")
    if 'subject' not in df.columns or 'activity' not in df.columns:
        raise ValueError("Input DataFrame missing 'subject' or 'activity' columns.")
    has_timestamp = 'timestamp' in df.columns
    if not has_timestamp:
        print("Warning: 'timestamp' column not found. Assuming data is pre-sorted.")

    grouped = df.groupby(['subject', 'activity'])
    print(f"Processing {len(grouped)} subject-activity groups for windowing...")

    windows = []
    labels = []
    subject_ids = []
    skipped_groups = []  # (subject, activity) pairs too short for a single window

    for (subject, activity), group_df in grouped:
        if has_timestamp:
            # mergesort is stable: duplicate timestamps keep their file order.
            group_df = group_df.sort_values('timestamp', kind='mergesort')

        data_values = group_df[required_cols].values
        num_rows = len(data_values)

        if num_rows < window_size:
            skipped_groups.append((subject, activity))
            continue

        # Last valid start is num_rows - window_size; trailing rows that do not
        # fill a whole window are dropped.
        for start_idx in range(0, num_rows - window_size + 1, stride):
            windows.append(data_values[start_idx : start_idx + window_size, :])
            labels.append(activity)
            subject_ids.append(subject)

    if skipped_groups:
        print(f"Skipped {len(skipped_groups)} group(s) with fewer than {window_size} rows: {skipped_groups}")

    if not windows:
        raise ValueError("No windows could be created. Check filtered data length and window parameters.")

    X = np.array(windows)
    y = np.array(labels)
    subs = np.array(subject_ids)

    print(f"\nCreated {len(windows)} windows.")
    print(f"Window data shape (X): {X.shape}")
    print(f"Labels shape (y): {y.shape}")

    return X, y, subs


# --- TCN Model Definition ---

def residual_block(x, dilation_rate, nb_filters, kernel_size, padding, dropout_rate=0.0, l2_reg=0.0):
    """
    Build one residual block: two dilated Conv1D stages plus a skip connection.

    Each stage is Conv1D -> BatchNormalization -> ReLU -> SpatialDropout1D, and
    both stages share the same filters, kernel size, dilation rate, padding and
    L2 kernel regularisation. The block input is added to the output of the
    second stage; when their channel counts differ, the input is first projected
    with a 1x1 convolution so the two tensors can be summed.

    Args:
        x: Input tensor (a Conv1D-compatible sequence tensor).
        dilation_rate: Dilation rate used by both convolutions.
        nb_filters: Number of filters of both convolutions (and of the projection).
        kernel_size: Kernel size of both convolutions.
        padding: Padding mode passed to both convolutions.
        dropout_rate: Rate of the SpatialDropout1D layer after each stage.
        l2_reg: L2 factor applied to every convolution kernel in the block.

    Returns:
        The tensor produced by adding the (possibly projected) input to the
        output of the second stage.
    """
    shortcut = x
    out = x

    # Two identical stages, created in order: conv, batch-norm, relu, dropout.
    for _ in range(2):
        out = Conv1D(
            filters=nb_filters,
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            padding=padding,
            kernel_regularizer=l2(l2_reg),
        )(out)
        out = BatchNormalization()(out)
        out = Activation('relu')(out)
        out = SpatialDropout1D(dropout_rate)(out)

    # Match channel counts before the element-wise sum.
    if shortcut.shape[-1] != out.shape[-1]:
        shortcut = Conv1D(nb_filters, 1, padding='same', kernel_regularizer=l2(l2_reg))(shortcut)

    return Add()([shortcut, out])

def build_tcn_model(input_shape, num_classes, kernel_size, num_filters, dilation_rates, spatial_dropout, final_dropout, l2_reg=0.0):
    """
    Assemble the TCN classifier as a Keras functional Model.

    The network stacks one residual block per entry of `dilation_rates` (all
    with 'same' padding), averages over the time axis, applies dropout and ends
    in a softmax Dense layer with `num_classes` units.

    Args:
        input_shape: Shape of one input sample, passed to Input(shape=...).
        num_classes: Number of output units of the softmax layer.
        kernel_size: Kernel size used by every residual block.
        num_filters: Number of filters used by every residual block.
        dilation_rates: Iterable of dilation rates, one residual block each.
        spatial_dropout: SpatialDropout1D rate inside the residual blocks.
        final_dropout: Dropout rate applied after global average pooling.
        l2_reg: L2 factor for the convolution and output kernels.

    Returns:
        An uncompiled tensorflow.keras Model.
    """
    inputs = Input(shape=input_shape)

    features = inputs
    for rate in dilation_rates:
        features = residual_block(
            features,
            rate,
            num_filters,
            kernel_size,
            padding='same',
            dropout_rate=spatial_dropout,
            l2_reg=l2_reg,
        )

    pooled = GlobalAveragePooling1D()(features)
    pooled = Dropout(final_dropout)(pooled)
    class_probabilities = Dense(
        num_classes,
        activation='softmax',
        kernel_regularizer=l2(l2_reg),
    )(pooled)

    return Model(inputs=inputs, outputs=class_probabilities)


# --- Plotting and Evaluation Functions ---
# (Every output filename is prefixed with the module-level FILE_PREFIX.)
def plot_confusion_matrix(y_true, y_pred_classes, class_names, output_dir, filename_suffix="confusion_matrix.png"):
    """
    Draw the confusion matrix as an annotated heatmap and save it as an image.

    The figure grows with the number of classes and is written to
    `output_dir` under the name FILE_PREFIX + `filename_suffix`; the figure is
    closed afterwards and the destination path is printed.

    Args:
        y_true: True class labels.
        y_pred_classes: Predicted class labels.
        class_names: Tick labels for both axes of the heatmap.
        output_dir: Directory the image is written to.
        filename_suffix: File name appended to FILE_PREFIX.
    """
    matrix = confusion_matrix(y_true, y_pred_classes)

    n_classes = len(class_names)
    fig_width = max(6, n_classes + 1)
    fig_height = max(5, n_classes * 0.8 + 1)

    fig, ax = plt.subplots(figsize=(fig_width, fig_height))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title(f'Confusion Matrix ({FILE_PREFIX[:-1]})')  # prefix without its trailing '_'
    fig.tight_layout()

    filepath = os.path.join(output_dir, FILE_PREFIX + filename_suffix)
    fig.savefig(filepath)
    plt.close(fig)
    print(f"Confusion matrix saved to {filepath}")

def plot_training_history(history, output_dir, filename_suffix="training_history.png"):
    """
    Plot training (and, when recorded, validation) accuracy and loss per epoch.

    Two side-by-side panels are drawn from `history.history`: accuracy on the
    left, loss on the right. The figure is saved to `output_dir` as
    FILE_PREFIX + `filename_suffix`, closed, and the path is printed.

    Args:
        history: Object whose `.history` dict holds 'accuracy' and 'loss'
            series and optionally 'val_accuracy' / 'val_loss'.
        output_dir: Directory the image is written to.
        filename_suffix: File name appended to FILE_PREFIX.
    """
    # (metric key, title and y-label, train legend, validation legend, legend position)
    panels = [
        ('accuracy', 'Accuracy', 'Train Acc', 'Val Acc', 'lower right'),
        ('loss', 'Loss', 'Train Loss', 'Val Loss', 'upper right'),
    ]
    records = history.history

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    for ax, (metric, title, train_label, val_label, legend_loc) in zip(axes, panels):
        ax.plot(records[metric], label=train_label)
        val_key = 'val_' + metric
        if val_key in records:
            ax.plot(records[val_key], label=val_label)
        ax.set_title(title)
        ax.set_ylabel(title)
        ax.set_xlabel('Epoch')
        ax.legend(loc=legend_loc)

    fig.suptitle(f'Training History ({FILE_PREFIX[:-1]})')  # overall title
    fig.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave room for the suptitle

    filepath = os.path.join(output_dir, FILE_PREFIX + filename_suffix)
    fig.savefig(filepath)
    plt.close(fig)
    print(f"Training history plot saved to {filepath}")

def save_classification_report(y_true, y_pred_classes, class_names, output_dir, filename_suffix="classification_report.txt"):
    """
    Compute the per-class classification report, write it to a text file and return it.

    The report is generated with four decimal digits and zero_division=0. The
    file, named FILE_PREFIX + `filename_suffix` inside `output_dir`, starts
    with a two-line header followed by the report text.

    Args:
        y_true: True class labels.
        y_pred_classes: Predicted class labels.
        class_names: Display names passed as `target_names`.
        output_dir: Directory the report is written to.
        filename_suffix: File name appended to FILE_PREFIX.

    Returns:
        The report as a string.
    """
    report_text = classification_report(
        y_true,
        y_pred_classes,
        target_names=class_names,
        digits=4,
        zero_division=0,
    )

    destination = os.path.join(output_dir, FILE_PREFIX + filename_suffix)
    header_lines = [
        f"Classification Report ({FILE_PREFIX[:-1]}):\n",
        "=======================================\n",
    ]
    with open(destination, 'w') as handle:
        handle.writelines(header_lines)
        handle.write(report_text)

    print(f"Classification report saved to {destination}")
    return report_text

def save_predictions(y_true, y_pred_proba, y_pred_classes, subjects_test, label_encoder, output_dir, filename_suffix="test_predictions.csv"):
    """
    Write one CSV row per test sample with its labels and class probabilities.

    Columns, in order: 'subject', 'true_label_encoded', 'true_label_name',
    'predicted_label_encoded', 'predicted_label_name', then one
    'prob_<class>' column per entry of `label_encoder.classes_`. The file is
    saved without the index as FILE_PREFIX + `filename_suffix` in `output_dir`
    and its path is printed.

    Args:
        y_true: Encoded true labels.
        y_pred_proba: 2-D array indexed as [sample, class]; column i is
            reported under the name of `label_encoder.classes_[i]`.
        y_pred_classes: Encoded predicted labels.
        subjects_test: Subject id of each test sample.
        label_encoder: Object providing `inverse_transform` and `classes_`.
        output_dir: Directory the CSV is written to.
        filename_suffix: File name appended to FILE_PREFIX.
    """
    columns = {
        'subject': subjects_test,
        'true_label_encoded': y_true,
        'true_label_name': label_encoder.inverse_transform(y_true),
        'predicted_label_encoded': y_pred_classes,
        'predicted_label_name': label_encoder.inverse_transform(y_pred_classes),
    }
    # Probability columns follow the label columns, in encoder class order.
    columns.update({
        f'prob_{class_name}': y_pred_proba[:, class_idx]
        for class_idx, class_name in enumerate(label_encoder.classes_)
    })
    predictions_table = pd.DataFrame(columns)

    destination = os.path.join(output_dir, FILE_PREFIX + filename_suffix)
    predictions_table.to_csv(destination, index=False)
    print(f"Test set predictions saved to {destination}")


# --- Data Loading Helper ---
def _load_filtered_activities(csv_path, activities):
    """
    Read one CSV and keep only the rows whose 'activity' is in `activities`.

    Loading is best-effort: a missing file, an unreadable file or a file
    without matching rows is reported with print() and yields None, so the
    caller can still continue with the other data source.

    Args:
        csv_path: Path of the CSV file to read.
        activities: Activity labels to keep.

    Returns:
        A filtered copy of the data, or None when nothing usable was loaded.
    """
    print(f"  Loading {csv_path} for activities {activities}...")
    try:
        if not os.path.isfile(csv_path):
            raise FileNotFoundError(f"{csv_path} not found.")
        source_df = pd.read_csv(csv_path)
        filtered_df = source_df[source_df['activity'].isin(activities)].copy()
    except FileNotFoundError:
        print(f"  ERROR: Input file {csv_path} not found. Skipping.")
        return None
    except Exception as e:
        # Deliberately broad: any parse/column problem skips this source only.
        print(f"  ERROR: Could not load or filter {csv_path}: {e}. Skipping.")
        return None

    if filtered_df.empty:
        print(f"    Warning: No data found for activities {activities} in {csv_path}.")
        return None

    print(f"    Kept {len(filtered_df)} rows from {csv_path}.")
    return filtered_df


# --- Main Execution ---
# Pipeline: load + filter both CSVs -> combine -> window -> encode labels ->
# train/test split -> scale -> build + train the TCN -> evaluate + save results.
# Fatal conditions raise SystemExit instead of calling exit(): inside a notebook
# kernel exit() only schedules a shutdown and the rest of the cell keeps running.
if __name__ == "__main__":
    main_start_time = time.time()

    # One seed for the data split and for TensorFlow/NumPy/Python RNGs, so weight
    # initialisation, dropout and batch shuffling are repeatable between runs
    # (some GPU kernels may still be nondeterministic).
    RANDOM_SEED = 42
    TEST_SIZE = 0.25  # fraction of windows held out for the final test set
    tf.keras.utils.set_random_seed(RANDOM_SEED)

    # 1. Load and Filter Data from Both Sources
    print("\n[1/8] Loading and filtering data...")
    all_data_frames = []
    for csv_path, activities in ((INPUT_CSV_1, ACTIVITIES_FROM_FILE1),
                                 (INPUT_CSV_2, ACTIVITIES_FROM_FILE2)):
        loaded_df = _load_filtered_activities(csv_path, activities)
        if loaded_df is not None:
            all_data_frames.append(loaded_df)

    # 2. Combine Data
    print("\n[2/8] Combining filtered data...")
    if not all_data_frames:
        print("FATAL ERROR: No data loaded from any source. Check file paths and activity lists.")
        raise SystemExit(1)

    combined_df = pd.concat(all_data_frames, ignore_index=True)
    print(f"Combined dataset created with {len(combined_df)} rows.")

    # Verify final activities
    final_activities = combined_df['activity'].unique()
    print(f"Activities present in combined data: {sorted(final_activities)}")
    if set(final_activities) != set(ALL_ACTIVITIES_TO_KEEP):
        print("Warning: The activities in the final combined data do not exactly match the expected set.")
        print(f"  Expected: {ALL_ACTIVITIES_TO_KEEP}")
        print(f"  Found: {sorted(final_activities)}")
        # Not treated as fatal: training continues on the activities that are present.

    # Check for necessary columns before windowing.
    # 'timestamp' is optional here; the windowing function handles its absence.
    required_cols_for_windowing = ['subject', 'activity', 'x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro']
    missing_cols = [col for col in required_cols_for_windowing if col not in combined_df.columns]
    if missing_cols:
        print(f"FATAL ERROR: Combined DataFrame is missing essential columns for windowing: {missing_cols}")
        raise SystemExit(1)

    # 3. Create Windows
    print(f"\n[3/8] Creating sliding windows (size={WINDOW_SIZE}, stride={STRIDE})...")
    try:
        X, y_raw_labels, subjects = create_subject_activity_windows(combined_df, WINDOW_SIZE, STRIDE)
        # X shape: (num_windows, window_size, num_features)
        # y_raw_labels shape: (num_windows,)
        # subjects shape: (num_windows,)
    except Exception as e:
        print(f"FATAL ERROR: Failed to create windows: {e}")
        raise SystemExit(1)

    # 4. Encode Labels
    print("\n[4/8] Encoding activity labels...")
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y_raw_labels)
    num_classes = len(label_encoder.classes_)
    # Ensure the encoder classes match the intended activities
    print(f"Encoded {num_classes} unique classes: {label_encoder.classes_}")
    if set(label_encoder.classes_) != set(ALL_ACTIVITIES_TO_KEEP):
        print("Warning: Label encoder classes do not match the full set of intended activities. This might happen if some activities had no data after windowing.")
        print(f"  Intended: {ALL_ACTIVITIES_TO_KEEP}")
        print(f"  Encoded: {list(label_encoder.classes_)}")
    # Save label encoder so predictions can be decoded outside this notebook
    le_path = os.path.join(OUTPUT_DIR, FILE_PREFIX + 'label_encoder.joblib')
    joblib.dump(label_encoder, le_path)
    print(f"Label encoder saved to {le_path}")

    # 5. Split Data (Train/Test)
    # NOTE(review): the split is random per window and stratified by class only.
    # Windows overlap whenever STRIDE < WINDOW_SIZE, so near-identical windows of
    # the same recording (and the same subject) can land in both train and test,
    # which tends to inflate test metrics. A subject-wise split would avoid this;
    # it is left unchanged here because it would alter the reported results.
    print("\n[5/8] Splitting data into training and testing sets...")
    X_train, X_test, y_train, y_test, subjects_train, subjects_test = train_test_split(
        X, y, subjects,
        test_size=TEST_SIZE,
        random_state=RANDOM_SEED,
        stratify=y
    )
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Testing set shape: X={X_test.shape}, y={y_test.shape}")

    # 6. Scale Features
    print("\n[6/8] Scaling features using StandardScaler...")
    if X_train.ndim != 3:
        print(f"FATAL ERROR: X_train does not have 3 dimensions. Shape: {X_train.shape}")
        raise SystemExit(1)
    num_samples_train, timesteps, num_features = X_train.shape
    num_samples_test = X_test.shape[0]

    # StandardScaler works on 2-D data: flatten (windows, timesteps) into rows so
    # each sensor channel is standardised with one mean/std, then restore the shape.
    X_train_reshaped = X_train.reshape(-1, num_features)
    X_test_reshaped = X_test.reshape(-1, num_features)

    scaler = StandardScaler()
    X_train_scaled_reshaped = scaler.fit_transform(X_train_reshaped)  # statistics from training data only
    X_test_scaled_reshaped = scaler.transform(X_test_reshaped)

    X_train_scaled = X_train_scaled_reshaped.reshape(num_samples_train, timesteps, num_features)
    X_test_scaled = X_test_scaled_reshaped.reshape(num_samples_test, timesteps, num_features)
    print("Scaling complete.")
    scaler_path = os.path.join(OUTPUT_DIR, FILE_PREFIX + 'scaler.joblib')
    joblib.dump(scaler, scaler_path)
    print(f"Scaler saved to {scaler_path}")

    # 7. Build and Train Model
    print("\n[7/8] Building and training TCN model...")
    input_shape = (WINDOW_SIZE, num_features)
    model = build_tcn_model(input_shape, num_classes, KERNEL_SIZE, NUM_FILTERS,
                            DILATION_RATES, SPATIAL_DROPOUT_RATE, FINAL_DROPOUT_RATE, L2_REG)

    # Labels are integer-encoded, hence the sparse variant of the loss.
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary(print_fn=lambda x: print(x))

    # Calculate Class Weights
    print("Calculating class weights...")
    unique_classes_train, counts_train = np.unique(y_train, return_counts=True)
    print(f"Training data class distribution: {dict(zip(label_encoder.inverse_transform(unique_classes_train), counts_train))}")

    if not np.all(unique_classes_train >= 0) or not np.all(unique_classes_train < num_classes):
        print(f"Warning: Class indices in y_train seem invalid. Unique values: {unique_classes_train}.")
        class_weights_dict = None  # train unweighted rather than with bogus weights
    else:
        class_weights_array = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=unique_classes_train,
            y=y_train
        )
        class_weights_dict = dict(zip(unique_classes_train, class_weights_array))
        print(f"Class weights computed: {class_weights_dict}")

    # Define Callbacks
    # The checkpoint keeps the epoch with the best val_accuracy, while LR
    # reduction and early stopping react to val_loss.
    checkpoint_path = os.path.join(OUTPUT_DIR, FILE_PREFIX + 'har_model.keras')
    model_checkpoint = ModelCheckpoint(filepath=checkpoint_path, monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=LR_FACTOR, patience=LR_PATIENCE, min_lr=1e-6, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', patience=EARLY_STOPPING_PATIENCE, restore_best_weights=True, verbose=1)
    callbacks_list = [model_checkpoint, reduce_lr, early_stopping]

    # Train the Model
    history = model.fit(
        X_train_scaled, y_train,
        epochs=EPOCHS, batch_size=BATCH_SIZE,
        validation_split=VALIDATION_SPLIT,
        callbacks=callbacks_list,
        class_weight=class_weights_dict,
        verbose=1
    )

    print("Training finished.")
    plot_training_history(history, OUTPUT_DIR)

    # Load Best Model
    print("\nLoading best model weights saved during training...")
    try:
        best_model = tf.keras.models.load_model(checkpoint_path)
        print("Best model loaded successfully.")
    except Exception as e:
        # Fall back to the in-memory model so evaluation can still run.
        print(f"Warning: Could not load best model from checkpoint ({e}). Using model state from end of training.")
        best_model = model

    # 8. Evaluate Model and Save Results
    print("\n[8/8] Evaluating model and saving results...")
    test_loss, test_acc = best_model.evaluate(X_test_scaled, y_test, verbose=0)
    print(f"\nTest Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")

    y_pred_proba = best_model.predict(X_test_scaled)
    y_pred_classes = np.argmax(y_pred_proba, axis=1)  # most probable class per window

    # Get actual class names for reporting
    report_class_names = label_encoder.classes_

    report_str = save_classification_report(y_test, y_pred_classes, report_class_names, OUTPUT_DIR)
    print("\n" + report_str)
    plot_confusion_matrix(y_test, y_pred_classes, report_class_names, OUTPUT_DIR)
    save_predictions(y_test, y_pred_proba, y_pred_classes, subjects_test, label_encoder, OUTPUT_DIR)

    main_end_time = time.time()
    print(f"\n--- Script finished in {main_end_time - main_start_time:.2f} seconds ---")

--- GPU Check ---
TensorFlow Version: 2.18.0
Num GPUs Available: 1
GPU(s) found:
  PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
Attempted to enable memory growth for GPU(s).
-----------------

--- Configuration ---
Input CSV 1 (Activities ['B', 'D', 'E']): /content/drive/MyDrive/Colab_HAR_Project/data/resampled_normalized_phone_data.csv
Input CSV 2 (Activities ['A', 'C']): /content/drive/MyDrive/Colab_HAR_Project/data/combined_collected_data.csv
All Activities Kept: ['A', 'B', 'C', 'D', 'E']
Output Directory: /content/drive/MyDrive/Colab_HAR_Project/results/TCN_Results_Merged_Sources_Colab
Output File Prefix: tcn_ABCDE_
Window Size: 60 samples
Stride: 15 samples
TCN Kernel Size: 7
---------------------


[1/7] Loading and filtering data...
  Loading /content/drive/MyDrive/Colab_HAR_Project/data/resampled_normalized_phone_data.csv for activities ['B', 'D', 'E']...
    Kept 542962 rows from /content/drive/MyDrive/Colab_HAR_Project/data/resampled_normalized_phone_data.

Model: "functional_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Layer (type)              ┃ Output Shape           ┃        Param # ┃ Connected to           ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩
│ input_layer_1             │ (None, 60, 6)          │              0 │ -                      │
│ (InputLayer)              │                        │                │                        │
├───────────────────────────┼────────────────────────┼────────────────┼────────────────────────┤
│ conv1d_11 (Conv1D)        │ (None, 60, 64)         │          2,752 │ input_layer_1[0][0]    │
├───────────────────────────┼────────────────────────┼────────────────┼────────────────────────┤
│ batch_normalization_10    │ (None, 60, 64)         │            256 │ conv1d_11[0][0]        │
│ (BatchNormalization)      │                        │                │                        │
├───────