In [2]:
# 02_train_test_split.py

import pandas as pd
import os
from google.colab import drive

# === 1. PROJECT PATHS ON GOOGLE DRIVE ===
# Every input and output of this script lives under this Drive folder.
root_path = '/content/drive/My Drive/load_type_prediction/'
cleaned_path = os.path.join(root_path, 'processed_data/cleaned_full_data.csv')

# === 2. MOUNT DRIVE AND LOAD CLEANED DATA ===
# The Drive must be mounted before anything under /content/drive is readable.
drive.mount('/content/drive')
df = pd.read_csv(cleaned_path)

# === 3. PARSE Date_Time INTO REAL TIMESTAMPS ===
# Default parsing is used here (no dayfirst flag).
df['Date_Time'] = pd.to_datetime(df['Date_Time'])

# === 4. FIND THE MOST RECENT CALENDAR MONTH ===
# Monthly periods (e.g. 2018-12) give every row a comparable year-month key.
df['YearMonth'] = df['Date_Time'].dt.to_period('M')
last_month = df['YearMonth'].max()

print(f"📅 Last month to use for test: {last_month}")

# === 5. SPLIT: LAST MONTH -> TEST, EVERYTHING EARLIER -> TRAIN ===
is_test_row = df['YearMonth'] == last_month
is_train_row = df['YearMonth'] < last_month

# === 6. BUILD THE TWO FRAMES WITHOUT THE HELPER COLUMN ===
# drop() returns a new frame, so neither split shares data with df and the
# YearMonth helper column stays only on df itself.
test_df = df.loc[is_test_row].drop(columns=['YearMonth'])
train_df = df.loc[is_train_row].drop(columns=['YearMonth'])

# === 7. SAVE SPLIT FILES ===
# Train and test files are written to two different folders under root_path.
train_path = os.path.join(root_path, 'data/train/power_train.csv')
test_path = os.path.join(root_path, 'data/test/power_test.csv')

# to_csv does not create missing parent folders, so make sure BOTH output
# directories exist (previously only the train folder was created, which
# made the test write fail on a fresh Drive tree).
for out_path in (train_path, test_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# index=False keeps the pandas row index out of the saved files.
train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print(f"\n✅ Train data saved to: {train_path}")
print(f"✅ Test data saved to: {test_path}")
print(f"Train shape: {train_df.shape} | Test shape: {test_df.shape}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📅 Last month to use for test: 2018-12

✅ Train data saved to: /content/drive/My Drive/load_type_prediction/data/train/power_train.csv
✅ Test data saved to: /content/drive/My Drive/load_type_prediction/data/test/power_test.csv
Train shape: (32064, 9) | Test shape: (2977, 9)
