In [None]:
import zipfile
import pandas as pd
import os

# Unpack the zipped vital-signs CSV and print a short preview of its contents.
# Every failure is reported with a printed message rather than an exception so
# the cell always runs to completion.
zip_file_path = '/content/human_vital_signs_dataset_2024.csv.zip'  # Updated path

# Check if the zip file exists
if not os.path.exists(zip_file_path):
    print(f"Error: zip file not found at {zip_file_path}")
else:
    # Create a directory to extract the contents
    extraction_path = '/extracted_data'
    os.makedirs(extraction_path, exist_ok=True)

    # Extract the contents of the zip file
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extraction_path)
            extracted_files = zip_ref.namelist()  # Names of the archive members
        print(f"Extracted {len(extracted_files)} files to {extraction_path}")
    except zipfile.BadZipFile:
        print(f"Error: {zip_file_path} is not a valid zip file.")
    except Exception as e:
        print(f"Error during extraction: {e}")

    # After extraction, list files in the directory to find CSV files.
    # This still runs when extraction failed, so files left by an earlier run
    # can be previewed.
    if os.path.exists(extraction_path):
        all_files = os.listdir(extraction_path)
        # os.listdir returns entries in arbitrary order; sort so that the file
        # picked below is the same on every run.
        csv_files = sorted(f for f in all_files if f.lower().endswith('.csv'))

        if not csv_files:
            print(f"Error: No CSV files found in the extracted directory {extraction_path}")
        else:
            csv_file_path = os.path.join(extraction_path, csv_files[0])
            # Only the read itself is guarded, so a display problem is never
            # misreported as a CSV read error.
            try:
                vital_signs_df = pd.read_csv(csv_file_path)
            except Exception as e:
                print(f"Error reading CSV file ({csv_files[0]}): {e}")
            else:
                # Display head of the vital signs dataset
                print(f"\n--- Head of Human Vital Signs Dataset --- (Path: {csv_file_path})")
                try:
                    head_text = vital_signs_df.head().to_markdown(
                        index=False, numalign="left", stralign="left"
                    )
                except ImportError:
                    # to_markdown needs the optional 'tabulate' package; fall
                    # back to the plain-text rendering when it is missing.
                    head_text = vital_signs_df.head().to_string(index=False)
                print(head_text)

                # Display basic info about the dataset
                print("\n--- Dataset Info ---")
                print(f"Number of rows: {vital_signs_df.shape[0]}")
                print(f"Number of columns: {vital_signs_df.shape[1]}")
                # str() keeps the join working even if a column label is not a string
                print(f"Columns: {', '.join(map(str, vital_signs_df.columns))}")
    else:
        print("Extraction directory not found. Please check the extraction process.")

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os

# Load the dataset
input_path = "/extracted_data/human_vital_signs_dataset_2024.csv"
df = pd.read_csv(input_path)

# Number of synthetic patients to build; each one consumes 10 consecutive source rows
test_patients = 10


def _shift_year(moment, years):
    """Return ``moment`` moved forward by ``years`` calendar years.

    ``datetime.replace(year=...)`` raises ValueError when ``moment`` falls on
    29 February and the target year is not a leap year; in that case the result
    is clamped to 28 February of the target year.
    """
    target_year = moment.year + years
    try:
        return moment.replace(year=target_year)
    except ValueError:
        return moment.replace(year=target_year, day=28)


# Process patients: every block of 10 consecutive rows is relabelled as one
# patient observed once per year. Patient ID and Gender come from the block's
# first row; Timestamp and Age advance by one year per row.
# Rows are collected in a list and turned into a DataFrame once, which avoids
# re-copying the frame on every iteration and lets pandas infer real column
# dtypes instead of leaving every column as object.
transformed_records = []
for patient_idx in range(test_patients):
    base_idx = patient_idx * 10
    if base_idx >= len(df):
        break

    first_row = df.iloc[base_idx].copy()
    gender = first_row['Gender']
    base_timestamp = datetime.strptime(first_row['Timestamp'], '%Y-%m-%d %H:%M:%S.%f')
    base_age = first_row['Age']

    # The last block may hold fewer than 10 rows when the file runs out
    last_idx = min(base_idx + 10, len(df))
    for year_offset, idx in enumerate(range(base_idx, last_idx)):
        row = df.iloc[idx].to_dict()

        row['Patient ID'] = first_row['Patient ID']
        row['Gender'] = gender

        new_timestamp = _shift_year(base_timestamp, year_offset)
        row['Timestamp'] = new_timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')

        row['Age'] = base_age + year_offset

        transformed_records.append(row)

# columns=df.columns keeps the source column order and guarantees the columns
# exist even when no rows were produced
transformed_df = pd.DataFrame(transformed_records, columns=df.columns)

# Remove the Height (m) column
transformed_df = transformed_df.drop(columns=['Height (m)'])

# Condense the long table into one row per patient: Patient ID and Gender are
# kept once, and every other column except Timestamp is spread into
# "<column>_Year_<k>" columns, where k is the 1-based position of the row after
# sorting that patient's rows by Timestamp.

# Identify unique patients (kept as a variable for any later cell that uses it)
unique_patients = transformed_df['Patient ID'].unique()

# Columns that are not spread across years
id_columns = ['Patient ID', 'Gender', 'Timestamp']

# Collect one dict per patient and build the DataFrame once at the end instead
# of concatenating onto a growing frame inside the loop.
condensed_records = []

# sort=False keeps patients in order of first appearance, the same order
# .unique() reports. Rows whose Patient ID is missing are skipped by groupby.
for patient_id, patient_data in transformed_df.groupby('Patient ID', sort=False):
    # Sort by timestamp to ensure consistent ordering
    patient_data = patient_data.sort_values('Timestamp')

    # Create a new row for this patient
    new_row = {
        'Patient ID': patient_id,
        'Gender': patient_data['Gender'].iloc[0]
    }

    # Get the measures that we want to condense (all except Patient ID, Gender, Timestamp)
    measures = [col for col in patient_data.columns if col not in id_columns]

    # For each measure, create one column per row of this patient
    for measure in measures:
        for i, value in enumerate(patient_data[measure].values):
            new_row[f'{measure}_Year_{i+1}'] = value

    condensed_records.append(new_row)

condensed_df = pd.DataFrame(condensed_records)

# Output the condensed data
output_dir = "/transformed_data"
# exist_ok=True replaces the separate existence check and is safe to re-run
os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "condensed_vital_signs_dataset.csv")
condensed_df.to_csv(output_path, index=False)

# Also persist the long-format table: a later cell reads
# transformed_vital_signs_dataset.csv from this directory, and without this
# write the notebook cannot run top to bottom on a fresh kernel.
transformed_output_path = os.path.join(output_dir, "transformed_vital_signs_dataset.csv")
transformed_df.to_csv(transformed_output_path, index=False)

print(f"Condensed dataset created with {len(condensed_df)} rows.")
print("Sample of condensed data:")
print(condensed_df.head(5))
print(condensed_df.shape)

In [None]:
import pandas as pd

# Load the long-format transformed table; the CSV must already exist at this
# path, written by an earlier step.
full_df = pd.read_csv('/transformed_data/transformed_vital_signs_dataset.csv')

# Filter for only 'High Risk' rows
high_risk_df = full_df[full_df['Risk Category'] == 'High Risk']

# Print the first 10 rows of the filtered dataset
# (the original printed head(1), contradicting the header text below)
print("First 10 rows of the HIGH RISK patients:")
print(high_risk_df.head(10))

# Print dataset info for High Risk only
print(f"\nFiltered dataset shape: {high_risk_df.shape}")
print(f"Number of unique Patient IDs (High Risk only): {high_risk_df['Patient ID'].nunique()}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Read the one-row-per-patient CSV back from disk and report its dimensions
df = pd.read_csv(
    "/transformed_data/condensed_vital_signs_dataset.csv"
)
print(f"Original dataset shape: {df.shape}")

def is_numeric_column(df, column):
    """Report whether ``df[column]`` can be converted to a numeric dtype.

    Args:
        df: DataFrame holding the column.
        column: Label of the column to test.

    Returns:
        True if ``pd.to_numeric`` accepts the whole column, False if it raises
        ValueError or TypeError (for example on non-numeric strings).
    """
    try:
        pd.to_numeric(df[column])
    except (ValueError, TypeError):
        return False
    return True


def create_time_series_features(df):
    """Group the numeric ``<measure>_Year_<k>`` columns of ``df`` by year ``k``.

    Args:
        df: DataFrame whose per-year columns are named ``<measure>_Year_<k>``.

    Returns:
        A tuple ``(year_groups, numeric_feature_cols)``:
        ``year_groups`` maps each year 1..10 that has at least one numeric
        column to the list of those column names (in DataFrame order);
        ``numeric_feature_cols`` lists every numeric ``_Year_`` column.
    """
    # Identify feature columns (those carrying a _Year_ marker)
    feature_cols = [col for col in df.columns if '_Year_' in col]

    # Filter to keep only numeric columns
    numeric_feature_cols = [col for col in feature_cols if is_numeric_column(df, col)]

    # Group features by year. The suffix must match at the END of the name:
    # a substring test would let '_Year_1' also match '..._Year_10', putting
    # the year-10 columns into the year-1 group.
    year_groups = {}
    for year in range(1, 11):
        suffix = f'_Year_{year}'
        year_cols = [col for col in numeric_feature_cols if col.endswith(suffix)]
        if year_cols:
            year_groups[year] = year_cols

    return year_groups, numeric_feature_cols

# Bucket the numeric per-year columns by their year number
year_groups, all_feature_cols = create_time_series_features(df)
print(f"Found {len(all_feature_cols)} numeric feature columns")

# Inputs are the columns of years 1-9; targets are the columns of year 10
X_cols = [col for year in range(1, 10) for col in year_groups.get(year, [])]
y_cols = year_groups.get(10, [])  # empty list when year 10 has no numeric column

if y_cols:
    print(f"Using {len(X_cols)} input features to predict {len(y_cols)} target features")

    X = df[X_cols]
    y = df[y_cols]

    # Hold out 20% of the rows for testing, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

    # Fit the scaler on the training rows only, then apply it to the test rows
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    print("Scaling completed successfully")

    # Model building can follow from here, for example:
    # model = RandomForestRegressor()
    # model.fit(X_train_scaled, y_train)
else:
    print("Warning: No numeric target columns found for year 10")