In [1]:
#lets connect to my google drive

from google.colab import drive
drive.mount('/content/drive')

print("✓ Google Drive connected successfully!")
print("Your files are now accessible in: /content/drive/MyDrive/")

Mounted at /content/drive
✓ Google Drive connected successfully!
Your files are now accessible in: /content/drive/MyDrive/


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/Enhanced_anxiety_dataset.csv')

# Normalize headers once
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)


In [3]:
# Basic inspection
print(df.shape)
print("")
print(df.info())
print("")
print(df.head())
print("")
print(df.describe())
print("")
print(df.isnull().sum())
print("")

(11000, 19)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000 entries, 0 to 10999
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                11000 non-null  int64  
 1   Gender                             11000 non-null  object 
 2   Occupation                         11000 non-null  object 
 3   Sleep Hours                        11000 non-null  float64
 4   Physical Activity (hrs/week)       11000 non-null  float64
 5   Caffeine Intake (mg/day)           11000 non-null  int64  
 6   Alcohol Consumption (drinks/week)  11000 non-null  int64  
 7   Smoking                            11000 non-null  object 
 8   Family History of Anxiety          11000 non-null  object 
 9   Stress Level (1-10)                11000 non-null  int64  
 10  Heart Rate (bpm)                   11000 non-null  int64  
 11  Breathing Rate (breaths/min)       11000 

In [4]:
# Missing percentage
missing_percent = (df.isnull().sum() / len(df)) * 100
print(missing_percent[missing_percent > 0])

from sklearn.impute import SimpleImputer
import pandas as pd

# Selecting columns
numerical_cols = df.select_dtypes(include=['number']).columns
categorical_cols = df.select_dtypes(include=['object','category']).columns


# Impute numerical
if len(numerical_cols) > 0:
    num_imputer = SimpleImputer(strategy='median')
    df[numerical_cols] = pd.DataFrame(
        num_imputer.fit_transform(df[numerical_cols]),
        columns=numerical_cols
    )

# Impute categorical
if len(categorical_cols) > 0:
    cat_imputer = SimpleImputer(strategy='most_frequent')
    df[categorical_cols] = pd.DataFrame(
        cat_imputer.fit_transform(df[categorical_cols]),
        columns=categorical_cols
    )
## no missing values in my dataset

Series([], dtype: float64)


In [5]:
# Check for duplicates
print(f"Duplicate rows: {df.duplicated().sum()}")

# Remove duplicates
df = df.drop_duplicates()

Duplicate rows: 0


In [6]:
# Focus on physiological features relevant to emotion detection
# CORRECTED column names to match your actual dataset
key_features = [
    'Heart Rate (bpm)',
    'Breathing Rate (breaths/min)',
    'Stress Level (1-10)',
    'Sleep Hours',
    'Sweating Level (1-5)',
    'Anxiety Level (1-10)'
]

# Use IQR method for outlier detection
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

print("="*60)
print("REMOVING OUTLIERS USING IQR METHOD")
print("="*60)
print(f"Starting samples: {len(df)}")

# Apply to each key feature
for col in key_features:
    if col in df.columns:
        before = len(df)
        df = remove_outliers_iqr(df, col)
        removed = before - len(df)
        print(f"✓ {col}: Removed {removed} outliers")
    else:
        print(f"  {col}: Column not found in dataset")

print(f"\nFinal samples after outlier removal: {len(df)}")
print(f"Total outliers removed: {11000 - len(df)}")

REMOVING OUTLIERS USING IQR METHOD
Starting samples: 11000
✓ Heart Rate (bpm): Removed 0 outliers
✓ Breathing Rate (breaths/min): Removed 0 outliers
✓ Stress Level (1-10): Removed 0 outliers
✓ Sleep Hours: Removed 110 outliers
✓ Sweating Level (1-5): Removed 0 outliers
✓ Anxiety Level (1-10): Removed 302 outliers

Final samples after outlier removal: 10588
Total outliers removed: 412


In [7]:
# Convert to lowercase and strip whitespace
for col in categorical_cols:
    df[col] = df[col].astype(str).str.lower().str.strip()
    df[col] = df[col].replace(["", "nan", "none", "null", "?"], np.nan)

# Standardize Gender
gender_map = {
    'm': 'male','male': 'male','man': 'male','boy': 'male',
    'f': 'female','female': 'female','woman': 'female','girl': 'female',
    'other': 'other'
}
if 'Gender' in df.columns:
    df['Gender'] = df['Gender'].map(gender_map).fillna('unknown')

# Standardize Yes/No columns
yes_no_cols_to_convert = [
    'Smoking','Dizziness','Medication',
    'Recent Major Life Event','Family History of Anxiety'
]
yes_no_normalizer = {
    'yes': 1,'y': 1,'true': 1,'t': 1,'1': 1,
    'no': 0,'n': 0,'false': 0,'f': 0,'0': 0
}
for col in yes_no_cols_to_convert:
    if col in df.columns:
        df[col] = df[col].astype(str).str.lower().map(yes_no_normalizer)
        df[col] = df[col].fillna(0).astype(int)

# Fill remaining categorical NaNs with 'unknown'
final_categorical_cols = df.select_dtypes(include=['object','category']).columns
for col in final_categorical_cols:
    df[col] = df[col].fillna('unknown')


In [8]:
# List of actual physiological columns
phys_cols = ['Heart Rate (bpm)', 'Breathing Rate (breaths/min)', 'Stress Level (1-10)']

# Check if all columns exist
missing_cols = [col for col in phys_cols if col not in df.columns]
if missing_cols:
    print(f" Missing columns: {missing_cols}. Cannot calculate Physiological_Score.")
else:
    # Standardize and average
    df['Physiological_Score'] = df[phys_cols].apply(
        lambda x: (x - x.mean()) / x.std(), axis=0
    ).mean(axis=1)

    print("✔ Physiological_Score column created successfully.")
    print(df[['Physiological_Score']].head())


✔ Physiological_Score column created successfully.
   Physiological_Score
0             0.500656
1            -0.949806
2            -0.071310
3            -0.531257
4            -0.515767


In [9]:
df.columns

Index(['Age', 'Gender', 'Occupation', 'Sleep Hours',
       'Physical Activity (hrs/week)', 'Caffeine Intake (mg/day)',
       'Alcohol Consumption (drinks/week)', 'Smoking',
       'Family History of Anxiety', 'Stress Level (1-10)', 'Heart Rate (bpm)',
       'Breathing Rate (breaths/min)', 'Sweating Level (1-5)', 'Dizziness',
       'Medication', 'Therapy Sessions (per month)', 'Recent Major Life Event',
       'Diet Quality (1-10)', 'Anxiety Level (1-10)', 'Physiological_Score'],
      dtype='object')

In [10]:
# Convert 'Smoking' column to numerical (0 or 1) if it contains 'yes'/'no' strings.
if 'Smoking' in df.columns and df['Smoking'].dtype == 'object':
    df['Smoking'] = df['Smoking'].astype(str).str.lower().map({'yes': 1, 'no': 0}).fillna(0)
df['Smoking'] = pd.to_numeric(df['Smoking'], errors='coerce').fillna(0)

df['Lifestyle_Risk'] = (
    (7 - df['Sleep Hours']).clip(0, 7) +                    # Sleep deficit (0-7)
    (10 - df['Diet Quality (1-10)']).clip(0, 10) +          # Poor diet (0-10)
    df['Alcohol Consumption (drinks/week)'] / 19 * 10 +     # Normalize to 0-10
    df['Caffeine Intake (mg/day)'] / 600 * 10 +             # Normalize to 0-10
    df['Smoking'] * 10                                       # Smoking penalty (0 or 10)
) / 5  # Average to 0-10 scale

print("✓ Lifestyle_Risk column created successfully.")
print("\nSample values:")
print(df[['Lifestyle_Risk']].head())
print("\nDescriptive statistics:")
print(df[['Lifestyle_Risk']].describe())

✓ Lifestyle_Risk column created successfully.

Sample values:
   Lifestyle_Risk
0        4.455965
1        4.068772
2        3.011053
3        5.871579
4        4.644386

Descriptive statistics:
       Lifestyle_Risk
count    10588.000000
mean         4.053029
std          1.453099
min          0.258596
25%          2.961623
50%          4.033333
75%          5.084298
max          8.079474


In [11]:
def categorize_anxiety(level):
    if pd.isna(level):
        return 'unknown'
    elif level <= 3:
        return 'mild'
    elif level <= 6:
        return 'moderate'
    else:
        return 'severe'

df['Anxiety_Category'] = df['Anxiety Level (1-10)'].apply(categorize_anxiety)

# Create ordinal mapping
anxiety_ordinal_map = {'mild': 0, 'moderate': 1, 'severe': 2, 'unknown': -1}
df['Anxiety_Category_Ordinal'] = df['Anxiety_Category'].map(anxiety_ordinal_map)

# Remove any 'unknown' categories
df = df[df['Anxiety_Category'] != 'unknown']

print("="*60)
print("ANXIETY CATEGORIZATION")
print("="*60)
print("Anxiety Category Distribution:")
print(df['Anxiety_Category'].value_counts())
print(f"\nRows after removing unknown categories: {len(df)}")

ANXIETY CATEGORIZATION
Anxiety Category Distribution:
Anxiety_Category
mild        5177
moderate    4651
severe       760
Name: count, dtype: int64

Rows after removing unknown categories: 10588


In [12]:
# Select features most relevant to real-time emotion detection
emotion_detection_features = [
    'Heart Rate (bpm)',
    'Breathing Rate (breaths/min)',
    'Stress Level (1-10)',
    'Sweating Level (1-5)',
    'Dizziness',
    'Physiological_Score'
]

# Ensure Anxiety columns match your dataset
anxiety_cols = []
for col in df.columns:
    if 'anxiety' in col.lower():
        anxiety_cols.append(col)

# Combine features for emotion detection
emotion_df = df[emotion_detection_features + anxiety_cols]

# Preview the selected features
print("✔ Selected features for emotion detection:")
print(emotion_df.head())


✔ Selected features for emotion detection:
   Heart Rate (bpm)  Breathing Rate (breaths/min)  Stress Level (1-10)  \
0             114.0                          14.0                 10.0   
1              62.0                          23.0                  1.0   
2              91.0                          28.0                  1.0   
3              86.0                          17.0                  4.0   
4              98.0                          19.0                  1.0   

   Sweating Level (1-5)  Dizziness  Physiological_Score  \
0                   4.0          0             0.500656   
1                   2.0          1            -0.949806   
2                   3.0          0            -0.071310   
3                   3.0          0            -0.531257   
4                   4.0          1            -0.515767   

   Family History of Anxiety  Anxiety Level (1-10) Anxiety_Category  \
0                          0                   5.0         moderate   
1              

In [13]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd # Ensure pandas is imported if not already

# Label Encoding for ordinal categories
le = LabelEncoder()
# Only apply Label Encoding if 'Anxiety_Category' exists and 'Anxiety_Category_Encoded' does not
if 'Anxiety_Category' in df.columns and 'Anxiety_Category_Encoded' not in df.columns:
    df['Anxiety_Category_Encoded'] = le.fit_transform(df['Anxiety_Category'])
elif 'Anxiety_Category_Encoded' in df.columns:
    print("Anxiety_Category_Encoded already exists. Skipping Label Encoding.")
else:
    print("Anxiety_Category column not found for Label Encoding.")

# One-Hot Encoding for nominal categories
cols_to_onehot = []
if 'Gender' in df.columns:
    cols_to_onehot.append('Gender')
if 'Occupation' in df.columns:
    cols_to_onehot.append('Occupation')

if cols_to_onehot:
    df = pd.get_dummies(df, columns=cols_to_onehot, drop_first=True)
    print(f"One-Hot Encoding applied to: {cols_to_onehot}")
else:
    print("Gender and/or Occupation columns not found for One-Hot Encoding (already processed or not present).")


One-Hot Encoding applied to: ['Gender', 'Occupation']


| Split          | Percentage | Purpose               |
| -------------- | ---------- | --------------------- |
| **Train**      | 70%        | Model learns patterns |
| **Validation** | 15%        | Tune hyperparameters  |
| **Test**       | 15%        | Final evaluation      |


In [14]:
# Check final data quality BEFORE splitting
print("="*60)
print("DATA QUALITY REPORT")
print("="*60)
print(f"Total samples: {len(df):,}")
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Duplicates: {df.duplicated().sum()}")
print(f"Total columns in dataframe: {df.shape[1]}")
print(f"Training features (excluding 3 targets): {df.shape[1] - 3}")

print(f"\nAnxiety Level Distribution:")
print(df['Anxiety_Category'].value_counts())

print(f"\nTop 10 Feature Correlations with Anxiety Level:")
correlations = df.corr(numeric_only=True)['Anxiety Level (1-10)'].sort_values(ascending=False)
print(correlations.head(10))

print("\n" + "="*60)

DATA QUALITY REPORT
Total samples: 10,588
Missing values: 0
Duplicates: 0
Total columns in dataframe: 36
Training features (excluding 3 targets): 33

Anxiety Level Distribution:
Anxiety_Category
mild        5177
moderate    4651
severe       760
Name: count, dtype: int64

Top 10 Feature Correlations with Anxiety Level:
Anxiety Level (1-10)            1.000000
Anxiety_Category_Ordinal        0.905719
Anxiety_Category_Encoded        0.905719
Stress Level (1-10)             0.673264
Physiological_Score             0.506140
Therapy Sessions (per month)    0.423671
Caffeine Intake (mg/day)        0.288727
Lifestyle_Risk                  0.279318
Family History of Anxiety       0.171609
Heart Rate (bpm)                0.133035
Name: Anxiety Level (1-10), dtype: float64



In [15]:
from sklearn.model_selection import train_test_split

print("="*60)
print("SPLITTING DATA INTO TRAIN/VAL/TEST")
print("="*60)

# Define features and targets
# Remove duplicate/unnecessary columns
columns_to_drop = ['Anxiety Level (1-10)', 'Anxiety_Category']

# Also drop Anxiety_Category_Encoded if it exists (it's a duplicate)
if 'Anxiety_Category_Encoded' in df.columns:
    columns_to_drop.append('Anxiety_Category_Encoded')

X = df.drop(columns_to_drop, axis=1)
y_regression = df['Anxiety Level (1-10)']
y_classification = df['Anxiety_Category']

# First split: 70% train, 30% temp
X_train, X_temp, y_train_reg, y_temp_reg, y_train_class, y_temp_class = train_test_split(
    X, y_regression, y_classification,
    test_size=0.3,
    stratify=y_classification,
    random_state=42
)

# Second split: 50-50 of temp (15% val, 15% test)
X_val, X_test, y_val_reg, y_test_reg, y_val_class, y_test_class = train_test_split(
    X_temp, y_temp_reg, y_temp_class,
    test_size=0.5,
    stratify=y_temp_class,
    random_state=42
)

print(f"Training set:   {X_train.shape[0]:,} samples ({X_train.shape[0]/len(df)*100:.1f}%)")
print(f"Validation set: {X_val.shape[0]:,} samples ({X_val.shape[0]/len(df)*100:.1f}%)")
print(f"Test set:       {X_test.shape[0]:,} samples ({X_test.shape[0]/len(df)*100:.1f}%)")
print(f"\nTotal features: {X_train.shape[1]}")
print("\nClass distribution in splits:")
print(f"Train:      {dict(y_train_class.value_counts())}")
print(f"Validation: {dict(y_val_class.value_counts())}")
print(f"Test:       {dict(y_test_class.value_counts())}")

SPLITTING DATA INTO TRAIN/VAL/TEST
Training set:   7,411 samples (70.0%)
Validation set: 1,588 samples (15.0%)
Test set:       1,589 samples (15.0%)

Total features: 33

Class distribution in splits:
Train:      {'mild': np.int64(3624), 'moderate': np.int64(3255), 'severe': np.int64(532)}
Validation: {'mild': np.int64(776), 'moderate': np.int64(698), 'severe': np.int64(114)}
Test:       {'mild': np.int64(777), 'moderate': np.int64(698), 'severe': np.int64(114)}


In [16]:
from sklearn.preprocessing import StandardScaler
import joblib

print("="*60)
print("SCALING NUMERICAL FEATURES (NO DATA LEAKAGE)")
print("="*60)

# Define numerical features to scale
numerical_features = [
    'Age',
    'Sleep Hours',
    'Physical Activity (hrs/week)',
    'Caffeine Intake (mg/day)',
    'Alcohol Consumption (drinks/week)',
    'Stress Level (1-10)',
    'Heart Rate (bpm)',
    'Breathing Rate (breaths/min)',
    'Sweating Level (1-5)',
    'Therapy Sessions (per month)',
    'Diet Quality (1-10)',
    'Physiological_Score',
    'Lifestyle_Risk'
]

# Verify all features exist in the data
numerical_features = [f for f in numerical_features if f in X_train.columns]

print(f"Scaling {len(numerical_features)} features:")
for f in numerical_features:
    print(f"  • {f}")

# Initialize scaler
scaler = StandardScaler()

# ✓ FIT ONLY on training data
scaler.fit(X_train[numerical_features])

# Transform all splits using training scaler
X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numerical_features] = scaler.transform(X_train[numerical_features])
X_val_scaled[numerical_features] = scaler.transform(X_val[numerical_features])
X_test_scaled[numerical_features] = scaler.transform(X_test[numerical_features])

# Save scaler
joblib.dump(scaler, 'scaler.pkl')

print("\n✓ Scaling complete - NO data leakage")
print(f"✓ Scaler saved to 'scaler.pkl'")
print("\nScaler statistics (from training data ONLY):")
print(f"  Mean (first 3): {scaler.mean_[:3]}")
print(f"  Std (first 3):  {scaler.scale_[:3]}")

SCALING NUMERICAL FEATURES (NO DATA LEAKAGE)
Scaling 13 features:
  • Age
  • Sleep Hours
  • Physical Activity (hrs/week)
  • Caffeine Intake (mg/day)
  • Alcohol Consumption (drinks/week)
  • Stress Level (1-10)
  • Heart Rate (bpm)
  • Breathing Rate (breaths/min)
  • Sweating Level (1-5)
  • Therapy Sessions (per month)
  • Diet Quality (1-10)
  • Physiological_Score
  • Lifestyle_Risk

✓ Scaling complete - NO data leakage
✓ Scaler saved to 'scaler.pkl'

Scaler statistics (from training data ONLY):
  Mean (first 3): [40.26312239  6.72555661  2.99603292]
  Std (first 3):  [13.26104081  1.14972729  1.81173268]


In [17]:
print("="*60)
print("CREATING TWO FEATURE SETS")
print("="*60)

# FULL ASSESSMENT FEATURES (all features)
full_features = X_train_scaled.columns.tolist()
print(f"\n1. FULL ASSESSMENT: {len(full_features)} features")

# REAL-TIME EMOTION DETECTION (only 6 features)
realtime_features = [
    'Stress Level (1-10)',
    'Heart Rate (bpm)',
    'Breathing Rate (breaths/min)',
    'Sweating Level (1-5)',
    'Dizziness',
    'Physiological_Score'
]

# Filter only existing features
realtime_features = [f for f in realtime_features if f in X_train_scaled.columns]
print(f"2. REAL-TIME EMOTION: {len(realtime_features)} features")

# Create real-time datasets
X_train_realtime = X_train_scaled[realtime_features]
X_val_realtime = X_val_scaled[realtime_features]
X_test_realtime = X_test_scaled[realtime_features]

print(f"\n✓ Full model shape: {X_train_scaled.shape}")
print(f"✓ Real-time model shape: {X_train_realtime.shape}")

CREATING TWO FEATURE SETS

1. FULL ASSESSMENT: 33 features
2. REAL-TIME EMOTION: 6 features

✓ Full model shape: (7411, 33)
✓ Real-time model shape: (7411, 6)


In [18]:
import pandas as pd
import joblib
import shutil
import os

print("="*70)
print("SAVING PREPROCESSED DATA TO GOOGLE DRIVE")
print("="*70)

# Create folders in Google Drive
base_path = '/content/drive/MyDrive/SentiCare_Project'
data_path = f'{base_path}/01_Preprocessed_Data'
os.makedirs(data_path, exist_ok=True)

# 1. Full model (SCALED)
X_train_scaled.to_csv(f'{data_path}/X_train_full_scaled.csv', index=False)
X_val_scaled.to_csv(f'{data_path}/X_val_full_scaled.csv', index=False)
X_test_scaled.to_csv(f'{data_path}/X_test_full_scaled.csv', index=False)
print("✓ Full model: X_train/val/test_full_scaled.csv")

# 2. Real-time model (SCALED)
X_train_realtime.to_csv(f'{data_path}/X_train_realtime_scaled.csv', index=False)
X_val_realtime.to_csv(f'{data_path}/X_val_realtime_scaled.csv', index=False)
X_test_realtime.to_csv(f'{data_path}/X_test_realtime_scaled.csv', index=False)
print("✓ Real-time model: X_train/val/test_realtime_scaled.csv")

# 3. Targets
pd.DataFrame({
    'Anxiety_Level': y_train_reg,
    'Anxiety_Category': y_train_class
}).to_csv(f'{data_path}/y_train.csv', index=False)

pd.DataFrame({
    'Anxiety_Level': y_val_reg,
    'Anxiety_Category': y_val_class
}).to_csv(f'{data_path}/y_val.csv', index=False)

pd.DataFrame({
    'Anxiety_Level': y_test_reg,
    'Anxiety_Category': y_test_class
}).to_csv(f'{data_path}/y_test.csv', index=False)
print("✓ Targets: y_train/val/test.csv")

# 4. Scaler (CRITICAL for production)
joblib.dump(scaler, f'{data_path}/scaler.pkl')
print("✓ Scaler: scaler.pkl")

# 5. CBT mapping
pd.DataFrame({
    'Anxiety_Level': range(1, 11),
    'Anxiety_Category': ['mild']*3 + ['moderate']*3 + ['severe']*4
}).to_csv(f'{data_path}/anxiety_mapping.csv', index=False)
print("✓ CBT mapping: anxiety_mapping.csv")

# 6. Feature list
pd.DataFrame({
    'realtime_features': realtime_features + ['']*(len(full_features)-len(realtime_features)),
    'all_features': full_features
}).to_csv(f'{data_path}/feature_list.csv', index=False)
print("✓ Feature list: feature_list.csv")

print("\n" + "="*70)
print("ALL FILES SAVED TO GOOGLE DRIVE (PERMANENT)")
print("="*70)
print(f"Location: {data_path}")
print("\nFiles created (10 total):")
print("  • X_train/val/test_full_scaled.csv (3 files)")
print("  • X_train/val/test_realtime_scaled.csv (3 files)")
print("  • y_train/val/test.csv (3 files)")
print("  • scaler.pkl (1 file)")

SAVING PREPROCESSED DATA TO GOOGLE DRIVE
✓ Full model: X_train/val/test_full_scaled.csv
✓ Real-time model: X_train/val/test_realtime_scaled.csv
✓ Targets: y_train/val/test.csv
✓ Scaler: scaler.pkl
✓ CBT mapping: anxiety_mapping.csv
✓ Feature list: feature_list.csv

ALL FILES SAVED TO GOOGLE DRIVE (PERMANENT)
Location: /content/drive/MyDrive/SentiCare_Project/01_Preprocessed_Data

Files created (10 total):
  • X_train/val/test_full_scaled.csv (3 files)
  • X_train/val/test_realtime_scaled.csv (3 files)
  • y_train/val/test.csv (3 files)
  • scaler.pkl (1 file)


In [19]:
print("="*60)
print("VALIDATION CHECK")
print("="*60)

# Check files were created
import os
files_to_check = [
    'X_train_full_scaled.csv',
    'X_val_full_scaled.csv',
    'X_test_full_scaled.csv',
    'X_train_realtime_scaled.csv',
    'X_val_realtime_scaled.csv',
    'X_test_realtime_scaled.csv',
    'y_train.csv',
    'y_val.csv',
    'y_test.csv',
    'scaler.pkl'
]

print("\nFile existence check:")
for f in files_to_check:
    exists = os.path.exists(f)
    symbol = "✓" if exists else "✗"
    print(f"{symbol} {f}")

# Check shapes match
print(f"\nShape validation:")
print(f"Full train: {X_train_scaled.shape}")
print(f"Realtime train: {X_train_realtime.shape}")
print(f"Targets train: {len(y_train_reg)}")

assert X_train_scaled.shape[0] == X_train_realtime.shape[0] == len(y_train_reg), "Shape mismatch in training data"
print("✓ All shapes match")

VALIDATION CHECK

File existence check:
✗ X_train_full_scaled.csv
✗ X_val_full_scaled.csv
✗ X_test_full_scaled.csv
✗ X_train_realtime_scaled.csv
✗ X_val_realtime_scaled.csv
✗ X_test_realtime_scaled.csv
✗ y_train.csv
✗ y_val.csv
✗ y_test.csv
✓ scaler.pkl

Shape validation:
Full train: (7411, 33)
Realtime train: (7411, 6)
Targets train: 7411
✓ All shapes match
