In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Scientific computing
from scipy import signal
from scipy.interpolate import interp1d
from scipy.fft import fft, fftfreq
from scipy.stats import entropy

# Machine Learning
from sklearn.model_selection import train_test_split, LeaveOneGroupOut, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report,
                             silhouette_score, davies_bouldin_score)


In [None]:
class DatasetExplorer:
    """Explore and summarize the dataset structure.

    The dataset root (``base_path``) is expected to contain up to three
    stage folders (see ``self.folders``). Each stage folder holds one
    sub-directory per participant, and each participant directory holds
    CSV files.
    """

    def __init__(self, base_path='content'):
        """Store the dataset root and the stage-key -> folder-name mapping.

        Args:
            base_path: Path (str or ``Path``) of the directory that contains
                the stage folders.
        """
        self.base_path = Path(base_path)
        # Folder names are used verbatim as on-disk paths; do not "fix" the
        # spelling here without renaming the directories as well.
        self.folders = {
            'raw': '1.Raw_time_domian_data',
            'trimmed': '2.Trimmed_interpolated_data',
            'subsamples': '3.Time_domain_subsamples'
        }

    def explore_structure(self):
        """Explore folder structure and file counts.

        Returns:
            dict: One entry per stage folder that exists on disk, keyed by the
            stage key ('raw', 'trimmed', 'subsamples'). Each value is a dict
            with ``n_participants``, ``total_files``,
            ``avg_days_per_participant`` and ``files_per_participant``.
            Stage folders that are missing are reported with a warning and
            left out of the result.
        """
        summary = {}

        for key, folder_name in self.folders.items():
            folder_path = self.base_path / folder_name
            # is_dir() also rejects a plain file with the expected name, which
            # would otherwise make iterdir() raise.
            if not folder_path.is_dir():
                print(f"Warning: {folder_path} not found!")
                continue

            # Sort so that files_per_participant lines up with the
            # participant_idx ordering used by load_sample_data(); iterdir()
            # yields entries in arbitrary order.
            participants = sorted(d for d in folder_path.iterdir() if d.is_dir())

            # Count CSV files per participant
            files_per_participant = [
                sum(1 for _ in participant.glob('*.csv'))
                for participant in participants
            ]

            summary[key] = {
                'n_participants': len(participants),
                'total_files': sum(files_per_participant),
                # Guard against np.mean([]) which yields nan plus a warning.
                'avg_days_per_participant': np.mean(files_per_participant) if files_per_participant else 0,
                'files_per_participant': files_per_participant
            }

        return summary

    def create_summary_table(self, summary):
        """Create summary table for Task 1.

        Prints a per-stage overview for every entry in ``summary`` and builds
        a table comparing the raw and trimmed stages.

        Args:
            summary: Mapping as returned by ``explore_structure``.

        Returns:
            pandas.DataFrame: Columns ``Metric``, ``Raw Data`` and
            ``Trimmed Data``. A stage that is absent from ``summary`` (its
            folder was not found) is filled with ``'N/A'`` instead of raising
            ``KeyError``.
        """
        print("\n" + "="*80)
        print("TASK 1: DATASET EXPLORATION - SUMMARY TABLE")
        print("="*80)

        for key, data in summary.items():
            print(f"\n{key.upper()} DATA:")
            print(f"  Number of Participants: {data['n_participants']}")
            print(f"  Total CSV Files: {data['total_files']}")
            print(f"  Avg Days/Files per Participant: {data['avg_days_per_participant']:.2f}")

        # Create DataFrame for export
        metrics = ['Participants', 'Total Files', 'Avg Files/Participant']
        columns = {'Metric': metrics}
        for key, label in (('raw', 'Raw Data'), ('trimmed', 'Trimmed Data')):
            data = summary.get(key)
            if data is None:
                # explore_structure() omits stages whose folder is missing.
                columns[label] = ['N/A'] * len(metrics)
            else:
                columns[label] = [
                    data['n_participants'],
                    data['total_files'],
                    f"{data['avg_days_per_participant']:.2f}",
                ]

        return pd.DataFrame(columns)

    def load_sample_data(self, folder_type='raw', participant_idx=0, file_idx=0):
        """Load a sample CSV file for exploration.

        Participants and their CSV files are addressed by position in sorted
        order. Negative indices count from the end, as with list indexing.

        Args:
            folder_type: Stage key, one of the keys of ``self.folders``.
            participant_idx: Position of the participant directory.
            file_idx: Position of the CSV file within that participant.

        Returns:
            pandas.DataFrame or None: The loaded file, or ``None`` (after
            printing a message) when the stage key, stage folder, participant
            or file cannot be found.
        """
        folder_name = self.folders.get(folder_type)
        if folder_name is None:
            print(f"Unknown folder type: {folder_type!r}")
            return None

        folder_path = self.base_path / folder_name
        if not folder_path.is_dir():
            # Same wording as explore_structure() for a missing stage folder.
            print(f"Warning: {folder_path} not found!")
            return None

        participants = sorted(d for d in folder_path.iterdir() if d.is_dir())

        # Two-sided bound so an out-of-range negative index is reported the
        # same way as a too-large one instead of raising IndexError.
        if not -len(participants) <= participant_idx < len(participants):
            print(f"Participant {participant_idx} not found!")
            return None

        participant_path = participants[participant_idx]
        csv_files = sorted(participant_path.glob('*.csv'))

        if not -len(csv_files) <= file_idx < len(csv_files):
            print(f"File {file_idx} not found!")
            return None

        csv_path = csv_files[file_idx]
        df = pd.read_csv(csv_path)
        print(f"\nLoaded: {csv_path.name}")
        print(f"Shape: {df.shape}")
        print(f"Columns: {df.columns.tolist()}")
        print("\nFirst few rows:")
        print(df.head())

        return df