In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

### Data Collection

In [None]:
import requests

# URLs of the files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the bytes are written to (opened in ``'wb'``
            mode, so an existing file is overwritten).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the notebook.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail loudly instead of saving an error page
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
# Both CSVs are saved in the current working directory under the exact names
# that the pd.read_csv calls later in the notebook open.
download_file(train_data_url, 'module5_exercise_train.csv')
download_file(test_data_url, 'module5_exercise_test.csv')

In [None]:
# Load the two downloaded CSVs; the files are comma-separated (sep=",").
df_train =  pd.read_csv("module5_exercise_train.csv", sep=",")
df_test =  pd.read_csv("module5_exercise_test.csv", sep=",")

### Data analysis

In [None]:
#### Make a complete analysis on data preprocessing

# Stack train and test rows so the checks below see every record.
# No ignore_index is passed, so each frame keeps its own row labels and the
# combined index may contain repeated labels.
data = pd.concat([df_train, df_test], axis=0)

# Inconsistencies
# The printed label reads "data types"; the next line lists every column's dtype.
print("数据类型：")
print(data.dtypes) 

# Duplicates (data.duplicated().sum())
# The printed label reads "number of duplicate rows".
print("重复行数量：", data.duplicated().sum())

# Missing values (data.isnull().sum())
# The printed label reads "number of missing values"; columns are listed from
# most to fewest missing entries.
print("缺失值数量：")
print(data.isnull().sum().sort_values(ascending=False))

# Categorical
# The printed label reads "weather variable distribution".
print("天气变量分布：")
print(data['weather_condition'].value_counts())

# Outliers
# Feature Engineering
# Feature Selection and/or Dimensionality Reduction

In [None]:
# (rows, columns) of the training frame
df_train.shape

In [None]:
# (rows, columns) of the test frame
df_test.shape

In [None]:
def plot_feature_over_time(df, feature, date_id_start, date_id_end):
    """Plot one column of ``df`` against its ``date`` column for a date window.

    Args:
        df: DataFrame holding a ``date`` column and the column to plot.
        feature: Name of the column drawn on the y-axis.
        date_id_start: Lower bound of the window (inclusive), compared
            directly against ``df['date']``.
        date_id_end: Upper bound of the window (inclusive).

    Returns:
        None. A message is printed (and nothing is drawn) when ``feature`` is
        not a column of ``df`` or when no row falls inside the window.
    """
    # Validate the column first so a bad name is reported before any filtering.
    if feature not in df.columns:
        print(f"Feature '{feature}' not found in the DataFrame.")
        return

    in_window = (df['date'] >= date_id_start) & (df['date'] <= date_id_end)
    df_filtered = df.loc[in_window]

    # An empty selection would only render a blank figure; say so instead.
    if df_filtered.empty:
        print(f"No rows between {date_id_start} and {date_id_end}; nothing to plot.")
        return

    # Plotting (explicit Figure/Axes rather than the pyplot state machine)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df_filtered['date'], df_filtered[feature], label=feature, linestyle='-')
    ax.set_xlabel('Date')
    ax.set_ylabel(feature)
    ax.set_title(f'{feature} from {date_id_start} to {date_id_end}')
    ax.tick_params(axis='x', labelrotation=45)
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()



In [None]:
# Convert the date column to datetime64 so the date-range comparisons in
# plot_feature_over_time operate on timestamps.
data['date'] = pd.to_datetime(data['date'])

In [None]:
# Display the combined train + test frame
data

In [None]:
# Inspect the wind_speed column; handle_inconsistencies below parses values
# carrying "km/h" or "m/s" suffixes out of this column.
data['wind_speed']

In [None]:
# Target series between 2017-01-01 and 2019-09-07 (both bounds inclusive)
plot_feature_over_time(data, 'electricity_demand', '2017-01-01', '2019-09-07')

In [None]:
# Humidity between 2016-06-01 and 2016-12-01 (both bounds inclusive)
plot_feature_over_time(data, 'humidity', '2016-06-01', '2016-12-01')

### Data Preprocessing Evaluation Strategy

In [None]:
# Provide a complete data preprocessing transformations

In [None]:

# 1. Handle Inconsistencies
def handle_inconsistencies(X_train, y_train, X_val=None):
    """Normalise ``wind_speed`` to a float expressed in km/h.

    Values may be bare numbers, ``"<n> km/h"`` or ``"<n> m/s"``; metres per
    second are multiplied by 3.6. Entries that cannot be parsed become NaN so
    a later imputation step can deal with them.

    Args:
        X_train: Training features containing a ``wind_speed`` column.
        y_train: Training target, passed through unchanged.
        X_val: Optional validation/test features treated the same way.

    Returns:
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)`` when ``X_val``
        is given. The frames are copies; the caller's inputs are not modified.
    """
    def handle_windspeed(windspeed):
        text = str(windspeed).strip()
        try:
            if "km/h" in text:
                return float(text.replace("km/h", "").strip())
            if "m/s" in text:
                return float(text.replace("m/s", "").strip()) * 3.6
            return float(text)
        except ValueError:
            # Unparseable entry: leave a gap instead of aborting the pipeline.
            return np.nan

    # Work on copies so re-running a cell never sees half-converted input.
    X_train = X_train.copy()
    X_train['wind_speed'] = X_train['wind_speed'].apply(handle_windspeed)
    if X_val is not None:
        X_val = X_val.copy()
        X_val['wind_speed'] = X_val['wind_speed'].apply(handle_windspeed)
        return X_train, y_train, X_val
    return X_train, y_train


# 2. Handling Duplicates
def handle_duplicates(X_train, y_train, X_val=None):
    """Drop fully duplicated feature rows from the training data.

    The first occurrence of each duplicated row is kept. Rows are selected by
    position, so features and target stay aligned even when index labels
    repeat (label-based ``y_train.loc[index]`` would return every row sharing
    a label and break the alignment).

    Args:
        X_train: Training features.
        y_train: Training target with one entry per row of ``X_train``, in the
            same order.
        X_val: Optional validation/test features; returned untouched.

    Returns:
        ``(X_train, y_train)`` without the duplicated rows, plus ``X_val``
        when it was supplied.
    """
    before_len = len(X_train)
    keep = ~X_train.duplicated().to_numpy()  # positional boolean mask
    X_train_no_duplicates = X_train.loc[keep]
    y_train_no_duplicates = y_train.loc[keep]
    after_len = len(X_train_no_duplicates)

    if before_len != after_len:
        print(f"Removed {before_len - after_len} duplicate rows.")

    if X_val is not None:
        return X_train_no_duplicates, y_train_no_duplicates, X_val
    return X_train_no_duplicates, y_train_no_duplicates


# 3. Handling Missing Values
def handle_missing_values(X_train, y_train, X_val=None):
    """Fill gaps in the numeric weather columns and in ``weather_condition``.

    Numeric columns are forward-filled and then backward-filled, so a gap at
    the very start of a frame is closed as well. Missing ``weather_condition``
    values become the literal ``'Unknown'`` in both the training and the
    validation frame, so the two are treated the same way.

    Args:
        X_train: Training features.
        y_train: Training target, passed through unchanged.
        X_val: Optional validation/test features.

    Returns:
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)``. The frames
        are copies; the caller's inputs are not modified.
    """
    features = [
        'humidity', 'wind_speed',
        'temperature_station1', 'temperature_station2', 'temperature_station3',
        'temperature_station4', 'temperature_station5', 'temperature_station6',
        'temperature_station7', 'temperature_station8', 'temperature_station9',
        'temperature_station10'
    ]

    def _fill(frame):
        frame = frame.copy()
        present = [c for c in features if c in frame.columns]
        if present:
            # ffill alone leaves leading NaNs; bfill closes those.
            frame[present] = frame[present].ffill().bfill()
        if 'weather_condition' in frame.columns:
            frame['weather_condition'] = frame['weather_condition'].fillna('Unknown')
        return frame

    X_train = _fill(X_train)
    if X_val is not None:
        return X_train, y_train, _fill(X_val)
    return X_train, y_train


# 4. Handling Categorical Values
def handle_categorical(X_train, y_train, X_val=None):
    """Encode the two categorical columns.

    ``weather_condition`` is one-hot encoded with ``pd.get_dummies`` and
    ``oil_brent_price_indicator`` is mapped onto the ordinal scale
    ``Very Low`` (0) .. ``Very High`` (4); labels outside the scale map to NaN.
    When ``X_val`` is given it is encoded the same way and then reindexed to
    the training columns, with absent columns filled with 0.

    Returns:
        ``(X_train_encoded, y_train)`` or
        ``(X_train_encoded, y_train, X_val_encoded)``.
    """
    price_levels = {'Very Low': 0, 'Low': 1, 'Moderate': 2, 'High': 3, 'Very High': 4}

    def _encode(frame):
        encoded = pd.get_dummies(frame, columns=['weather_condition'])
        encoded['oil_brent_price_indicator'] = (
            encoded['oil_brent_price_indicator'].map(price_levels)
        )
        return encoded

    train_encoded = _encode(X_train)
    if X_val is None:
        return train_encoded, y_train

    # Align the validation columns with the training columns
    val_encoded = _encode(X_val).reindex(columns=train_encoded.columns, fill_value=0)
    return train_encoded, y_train, val_encoded


# 5. Handling Outliers
def handle_outliers(X_train, y_train, X_val=None):
    """Clip ``humidity`` and the target to Tukey fences learned on the training data.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. They are
    computed from ``X_train`` / ``y_train`` only and the same humidity fences
    are then applied to ``X_val``, so validation rows are transformed exactly
    like training rows and their own distribution never influences the bounds.

    Args:
        X_train: Training features.
        y_train: Training target (a pandas Series).
        X_val: Optional validation/test features.

    Returns:
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)``. The frames
        are copies; the caller's inputs are not modified.
    """
    clipped_columns = ['humidity']

    def _iqr_bounds(values):
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr

    # Learn the bounds once, from the training frame only.
    bounds = {
        column: _iqr_bounds(X_train[column])
        for column in clipped_columns
        if column in X_train.columns
    }

    def _clip_frame(frame):
        frame = frame.copy()
        for column, (lower, upper) in bounds.items():
            if column in frame.columns:
                frame[column] = frame[column].clip(lower, upper)
        return frame

    X_train = _clip_frame(X_train)
    y_lower, y_upper = _iqr_bounds(y_train)
    y_train = y_train.clip(y_lower, y_upper)

    if X_val is not None:
        return X_train, y_train, _clip_frame(X_val)
    return X_train, y_train


# 6. Feature Engineering
def feature_engineering(X_train, y_train, X_val=None):
    """Add ``year``, ``month`` and ``day`` columns derived from ``date``.

    The ``date`` column is converted to datetime and the three calendar parts
    are written onto the frame that was passed in (the frame is modified in
    place and returned).

    Returns:
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)``.
    """
    def add_datetime_features(df):
        df['date'] = pd.to_datetime(df['date'])
        for part in ('year', 'month', 'day'):
            df[part] = getattr(df['date'].dt, part)
        return df

    X_train = add_datetime_features(X_train)
    if X_val is None:
        return X_train, y_train
    return X_train, y_train, add_datetime_features(X_val)


# 7. Feature Selection
def feature_selection(X_train, X_val=None):
    """Keep only the model input columns, in a fixed order.

    The selection is humidity, the ten temperature stations, the calendar
    parts, the oil price indicator and the four weather dummies. Every one of
    them must already exist in the frame(s) passed in.

    Returns:
        The reduced training frame, or ``(train, val)`` when ``X_val`` is given.
    """
    station_columns = [f'temperature_station{number}' for number in range(1, 11)]
    weather_columns = [
        f'weather_condition_{label}'
        for label in ('Cloudy', 'Sunny', 'Rainy', 'Snowy')
    ]
    selected_columns = (
        ['humidity']
        + station_columns
        + ['year', 'month', 'day', 'oil_brent_price_indicator']
        + weather_columns
    )

    train_subset = X_train[selected_columns]
    if X_val is None:
        return train_subset
    return train_subset, X_val[selected_columns]


In [None]:
from xgboost import XGBRegressor
def evaluate_pipeline(X, y, n_splits=5):
    """Score the preprocessing + XGBoost pipeline with expanding-window CV.

    Steps that do not learn anything from the data distribution (unit
    parsing, de-duplication, gap filling, encoding, calendar features and
    column selection) run once on the full frame. Outlier clipping derives
    its bounds from quantiles, so it runs inside each fold after the split:
    the bounds then never see the validation rows, and the validation target
    is scored unclipped.

    Args:
        X: Raw feature frame, ordered in time.
        y: Target series aligned with ``X``.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Returns:
        Mean validation MSE over the folds.
    """
    # One-off preprocessing shared by every fold
    X, y = handle_inconsistencies(X, y)
    X, y = handle_duplicates(X, y)
    X, y = handle_missing_values(X, y)
    X, y = handle_categorical(X, y)
    X, y = feature_engineering(X, y)
    X = feature_selection(X)

    tscv = TimeSeriesSplit(n_splits=n_splits)

    train_scores, val_scores = [], []

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        print(f"Processing fold {fold + 1}/{n_splits}...")

        # Copies keep the per-fold clipping from writing into the shared frame.
        X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_train, y_val = y.iloc[train_idx].copy(), y.iloc[val_idx].copy()

        # Data-dependent step: fit on this fold's training part only.
        X_train, y_train, X_val = handle_outliers(X_train, y_train, X_val)

        model = XGBRegressor()
        model.fit(X_train, y_train)

        train_mse = mean_squared_error(y_train, model.predict(X_train))
        val_mse = mean_squared_error(y_val, model.predict(X_val))

        train_scores.append(train_mse)
        val_scores.append(val_mse)

        print(f"Fold {fold + 1} Train MSE: {train_mse:.4f}, Validation MSE: {val_mse:.4f}")

    # Summary across folds
    print("\nTrain MSE:")
    print(f"Mean: {np.mean(train_scores):.4f}, Max: {np.max(train_scores):.4f}, Min: {np.min(train_scores):.4f}")

    print("\nValidation MSE:")
    print(f"Mean: {np.mean(val_scores):.4f}, Max: {np.max(val_scores):.4f}, Min: {np.min(val_scores):.4f}")

    return np.mean(val_scores)

In [None]:
# Prepare X and y
# Both are built from fresh copies, so df_train itself keeps the target column.
X = df_train.copy().drop(columns=['electricity_demand'], axis=1)
y = df_train.copy().pop('electricity_demand')

# Run the evaluation
# As the last expression of the cell, the returned mean validation MSE is displayed.
evaluate_pipeline(X, y)

### Generating Submission File

In [None]:
# Train and submit your results

In [None]:
# ============================================================
# FULL PIPELINE (XGB ONLY) — PREPROCESS, AUDIT, CV, SUBMISSION
# ============================================================

# -----------------
# Imports
# -----------------
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# -----------------
# Safety helpers
# -----------------
def _assert_no_nans(name, arr):
    if isinstance(arr, pd.Series):
        nn = int(arr.isna().sum())
    else:
        nn = int(pd.isna(arr).sum().sum())
    if nn > 0:
        raise ValueError(f"{name} still has {nn} NaNs after preprocessing.")

def _ensure_all_numeric(df: pd.DataFrame, name: str) -> pd.DataFrame:
    non_num_cols = df.columns[~df.dtypes.apply(lambda t: np.issubdtype(t, np.number))]
    if len(non_num_cols) > 0:
        df = df.copy()
        for c in non_num_cols:
            df[c] = pd.to_numeric(df[c], errors="coerce")
    return df

# -----------------
# 1) Handle Inconsistencies
# -----------------
def handle_inconsistencies(X_train, y_train, X_val=None):
    """Convert ``wind_speed`` entries to floats expressed in km/h.

    Accepts bare numbers, ``"<n> km/h"`` and ``"<n> m/s"`` (the latter is
    multiplied by 3.6). Anything that fails to parse becomes NaN so it can be
    imputed later. Frames without a ``wind_speed`` column are only copied.

    Returns:
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)``; the frames
        are copies of the inputs.
    """
    def handle_windspeed(windspeed):
        text = str(windspeed).strip()
        try:
            if "km/h" in text:
                parsed = float(text.replace("km/h", "").strip())
            elif "m/s" in text:
                parsed = float(text.replace("m/s", "").strip()) * 3.6
            else:
                parsed = float(text)
        except Exception:
            parsed = np.nan  # leave NaN, impute later
        return parsed

    def _with_parsed_wind(frame):
        frame = frame.copy()
        if 'wind_speed' in frame.columns:
            frame['wind_speed'] = frame['wind_speed'].apply(handle_windspeed)
        return frame

    X_train = _with_parsed_wind(X_train)
    if X_val is None:
        return X_train, y_train
    return X_train, y_train, _with_parsed_wind(X_val)

# -----------------
# 2) Handling Duplicates
# -----------------
def handle_duplicates(X_train, y_train, X_val=None):
    """Drop fully duplicated feature rows from the training data.

    The first occurrence of each duplicated row is kept. Rows are selected by
    position, so features and target stay aligned even when index labels
    repeat (label-based ``y_train.loc[index]`` would return every row sharing
    a label and break the alignment).

    Args:
        X_train: Training features.
        y_train: Training target with one entry per row of ``X_train``, in the
            same order.
        X_val: Optional validation/test features; returned untouched.

    Returns:
        ``(X_train, y_train)`` without the duplicated rows, plus ``X_val``
        when it was supplied.
    """
    before_len = len(X_train)
    keep = ~X_train.duplicated().to_numpy()  # positional boolean mask
    X_train_no_duplicates = X_train.loc[keep]
    y_train_no_duplicates = y_train.loc[keep]
    removed = before_len - len(X_train_no_duplicates)
    if removed > 0:
        print(f"Removed {removed} duplicate rows.")
    if X_val is not None:
        return X_train_no_duplicates, y_train_no_duplicates, X_val
    return X_train_no_duplicates, y_train_no_duplicates

# -----------------
# 3) Handling Missing Values
# -----------------
def handle_missing_values(X_train, y_train, X_val=None):
    """Close gaps in the numeric weather columns and in ``weather_condition``.

    Whichever of the listed numeric columns exist are forward-filled and then
    backward-filled; missing ``weather_condition`` values become ``'Unknown'``.
    Each frame is filled from its own values only.

    Returns:
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)``; the frames
        are copies of the inputs.
    """
    station_columns = [f'temperature_station{number}' for number in range(1, 11)]
    numeric_features = ['humidity', 'wind_speed'] + station_columns

    def _filled(frame):
        result = frame.copy()
        available = [name for name in numeric_features if name in result.columns]
        if available:
            result[available] = result[available].ffill().bfill()
        if 'weather_condition' in result.columns:
            result['weather_condition'] = result['weather_condition'].fillna('Unknown')
        return result

    X_train = _filled(X_train)
    if X_val is None:
        return X_train, y_train
    return X_train, y_train, _filled(X_val)

# -----------------
# 4) Handling Categorical Values
# -----------------
def handle_categorical(X_train, y_train, X_val=None):
    """Encode ``weather_condition`` (one-hot) and ``oil_brent_price_indicator`` (ordinal).

    * ``weather_condition`` goes through ``pd.get_dummies`` when present.
    * ``oil_brent_price_indicator`` is mapped onto 0..4; labels outside the
      scale (and missing values) fall back to 2, the code for ``'Moderate'``.
    * The four expected weather dummy columns are added to the training frame
      as all-zero columns when the data did not produce them.
    * ``X_val`` is encoded the same way and then reindexed to the training
      columns, with absent columns filled with 0.

    Returns:
        ``(X_train_encoded, y_train)`` or
        ``(X_train_encoded, y_train, X_val_encoded)``.
    """
    price_levels = {'Very Low': 0, 'Low': 1, 'Moderate': 2, 'High': 3, 'Very High': 4}
    expected_weather_cols = [
        'weather_condition_Cloudy', 'weather_condition_Sunny',
        'weather_condition_Rainy', 'weather_condition_Snowy'
    ]

    def _encode(frame):
        if 'weather_condition' in frame.columns:
            encoded = pd.get_dummies(frame, columns=['weather_condition'])
        else:
            encoded = frame.copy()
        if 'oil_brent_price_indicator' in encoded.columns:
            as_codes = encoded['oil_brent_price_indicator'].map(price_levels)
            encoded['oil_brent_price_indicator'] = as_codes.fillna(2).astype(int)
        return encoded

    train_encoded = _encode(X_train)
    absent = [col for col in expected_weather_cols if col not in train_encoded.columns]
    for col in absent:
        train_encoded[col] = 0

    if X_val is None:
        return train_encoded, y_train

    # Same columns, same order as the training frame
    val_encoded = _encode(X_val).reindex(columns=train_encoded.columns, fill_value=0)
    return train_encoded, y_train, val_encoded

# -----------------
# 5) Handling Outliers
# -----------------
def handle_outliers(X_train, y_train, X_val=None):
    """Clip ``humidity`` and the target to Tukey fences learned on the training data.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. They are
    computed from ``X_train`` / ``y_train`` only and the same humidity fences
    are then applied to ``X_val``, so validation rows are transformed exactly
    like training rows and their own distribution never influences the bounds.
    Frames lacking a ``humidity`` column are copied without clipping.

    Args:
        X_train: Training features.
        y_train: Training target (a pandas Series).
        X_val: Optional validation/test features.

    Returns:
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)``. The frames
        are copies; the caller's inputs are not modified.
    """
    clipped_columns = ['humidity']

    def _iqr_bounds(values):
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr

    # Learn the bounds once, from the training frame only.
    bounds = {
        column: _iqr_bounds(X_train[column])
        for column in clipped_columns
        if column in X_train.columns
    }

    def _clip_frame(frame):
        frame = frame.copy()
        for column, (lower, upper) in bounds.items():
            if column in frame.columns:
                frame[column] = frame[column].clip(lower, upper)
        return frame

    X_train = _clip_frame(X_train)
    y_lower, y_upper = _iqr_bounds(y_train)
    y_train = y_train.clip(y_lower, y_upper)

    if X_val is not None:
        return X_train, y_train, _clip_frame(X_val)
    return X_train, y_train

# -----------------
# 6) Feature Engineering
# -----------------
def feature_engineering(X_train, y_train, X_val=None):
    """Derive ``year``, ``month`` and ``day`` columns from ``date``.

    Dates are parsed with ``errors='coerce'``, so an unparseable entry yields
    missing calendar parts instead of an exception. The ``date`` column itself
    is left as it was, and frames without a ``date`` column are only copied.

    Returns:
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)``; the frames
        are copies of the inputs.
    """
    def add_datetime_features(df):
        enriched = df.copy()
        if 'date' not in enriched.columns:
            return enriched
        stamps = pd.to_datetime(enriched['date'], errors='coerce')
        for part in ('year', 'month', 'day'):
            enriched[part] = getattr(stamps.dt, part)
        return enriched

    X_train = add_datetime_features(X_train)
    if X_val is None:
        return X_train, y_train
    return X_train, y_train, add_datetime_features(X_val)

# -----------------
# 7) Feature Selection (column subset)
# -----------------
def feature_selection(X_train, X_val=None):
    """Reduce the frame(s) to the model input columns.

    Only the wanted columns that actually exist in ``X_train`` are kept, in
    the order of the wanted list. ``X_val`` is reindexed to exactly those
    columns; a column it lacks is created and filled with 0.

    Returns:
        The reduced training frame, or ``(train, val)`` when ``X_val`` is given.
    """
    station_columns = [f'temperature_station{number}' for number in range(1, 11)]
    weather_columns = [
        f'weather_condition_{label}'
        for label in ('Cloudy', 'Sunny', 'Rainy', 'Snowy')
    ]
    selected_columns = (
        ['humidity']
        + station_columns
        + ['year', 'month', 'day', 'oil_brent_price_indicator']
        + weather_columns
    )

    kept = [name for name in selected_columns if name in X_train.columns]
    train_subset = X_train[kept]
    if X_val is None:
        return train_subset
    return train_subset, X_val.reindex(columns=train_subset.columns, fill_value=0)

# -----------------
# Preview & Audit utilities
# -----------------
def preprocess_for_preview(X_train, y_train, X_test=None):
    """Run the full preprocessing chain and return the frames for inspection.

    The steps and their order match ``train_and_predict_to_submit``:
    inconsistencies, duplicates, missing values, categorical encoding,
    outliers, feature engineering, feature selection, then a final imputation
    of numeric columns with the training medians and a numeric coercion.

    Args:
        X_train: Raw training features.
        y_train: Training target.
        X_test: Optional raw test features, processed alongside the training
            frame when given.

    Returns:
        ``(X_train, y_train, X_test)``; ``X_test`` is ``None`` when none was
        passed in.
    """
    X_train = X_train.copy()
    X_test = X_test.copy() if X_test is not None else None

    # Steps sharing the (X_train, y_train[, X_val]) calling convention
    steps = (
        handle_inconsistencies,
        handle_duplicates,
        handle_missing_values,
        handle_categorical,
        handle_outliers,
        feature_engineering,
    )

    if X_test is None:
        for step in steps:
            X_train, y_train = step(X_train, y_train)
        X_train = feature_selection(X_train)
    else:
        for step in steps:
            X_train, y_train, X_test = step(X_train, y_train, X_test)
        X_train, X_test = feature_selection(X_train, X_test)

    # explicit final impute (train medians)
    numeric_columns = X_train.select_dtypes(include='number').columns
    train_medians = X_train[numeric_columns].median()
    X_train[numeric_columns] = X_train[numeric_columns].fillna(train_medians)
    if X_test is not None:
        X_test[numeric_columns] = X_test[numeric_columns].fillna(train_medians)

    # force numeric
    X_train = _ensure_all_numeric(X_train, "X_train")
    if X_test is not None:
        X_test = _ensure_all_numeric(X_test, "X_test")

    return X_train, y_train, X_test

def audit_frame(df: pd.DataFrame, name: str):
    """Print and display a quick health report for a preprocessed frame.

    Reports the shape, the first ten column names, non-numeric columns,
    missing-value counts, the head and a ``describe`` summary, and a few
    column-specific checks (oil price indicator counts, weather dummy columns,
    wind speed range) for whichever of those columns are present.

    Column types are checked with ``pd.api.types.is_numeric_dtype`` because
    ``np.issubdtype`` raises ``TypeError`` on pandas extension dtypes, which
    would abort the audit on exactly the columns it should be reporting.

    Args:
        df: Frame to inspect.
        name: Title printed at the top of the report.
    """
    print(f"\n=== {name} ===")
    print(f"shape: {df.shape}")
    print(f"columns ({len(df.columns)}): {list(df.columns)[:10]}{' ...' if df.shape[1] > 10 else ''}")

    non_num = [
        column
        for column, dtype in zip(df.columns, df.dtypes)
        if not pd.api.types.is_numeric_dtype(dtype)
    ]
    if len(non_num):
        print("⚠️ Non-numeric columns:", list(non_num))
    else:
        print("All columns are numeric ✅")

    na_cols = df.isna().sum()
    na_total = int(na_cols.sum())
    if na_total == 0:
        print("No missing values ✅")
    else:
        na_cols = na_cols[na_cols > 0].sort_values(ascending=False)
        print(f"⚠️ Missing values total = {na_total}")
        print(na_cols.head(10))

    display(df.head(5))
    display(df.describe(include='all').T.head(12))

    if 'oil_brent_price_indicator' in df.columns:
        vc = df['oil_brent_price_indicator'].value_counts(dropna=False).sort_index()
        print("oil_brent_price_indicator value counts:\n", vc)
    wc_cols = [c for c in df.columns if c.startswith('weather_condition_')]
    if wc_cols:
        print("weather_condition dummies present:", wc_cols)
    if 'wind_speed' in df.columns:
        ws = df['wind_speed']
        print(f"wind_speed range: min={ws.min():.3f}, max={ws.max():.3f}")

def compare_train_test(X_tr: pd.DataFrame, X_te: pd.DataFrame):
    """Report column alignment and basic statistics for two frames.

    First prints whether both frames carry the same columns in the same
    order, listing the columns unique to either side otherwise. Then displays
    the mean and standard deviation of the first eight training columns for
    both frames side by side.

    Args:
        X_tr: Preprocessed training features.
        X_te: Preprocessed test features; expected to contain the first eight
            training columns.
    """
    print("\n=== Train/Test Column Alignment ===")
    train_columns, test_columns = list(X_tr.columns), list(X_te.columns)

    if train_columns == test_columns:
        # Equal lists imply neither side has a column the other lacks.
        print("Columns identical and in the same order ✅")
    else:
        only_in_tr = [c for c in X_tr.columns if c not in X_te.columns]
        only_in_te = [c for c in X_te.columns if c not in X_tr.columns]
        if only_in_tr:
            print("⚠️ Columns only in TRAIN:", only_in_tr)
        if only_in_te:
            print("⚠️ Columns only in TEST:", only_in_te)
        # This branch is reached only when the two column lists differ.
        print("⚠️ Column order differs.")

    print("\nSample mean/std comparison (first 8 cols):")
    leading = X_tr.columns[:8]
    train_part, test_part = X_tr[leading], X_te[leading]
    stats = pd.DataFrame({
        'train_mean': train_part.mean(),
        'test_mean':  test_part.mean(),
        'train_std':  train_part.std(),
        'test_std':   test_part.std()
    })
    display(stats)

# -----------------
# Cross-validated evaluation (XGB)
# -----------------
def evaluate_pipeline(X, y, n_splits=5):
    """Score the preprocessing + XGBoost pipeline with expanding-window CV.

    Steps that do not learn anything from the data distribution (unit
    parsing, de-duplication, gap filling, encoding, calendar features and
    column selection) run once on the full frame. Outlier clipping derives
    its bounds from quantiles, so it runs inside each fold after the split:
    the bounds then never see the validation rows, and the validation target
    is scored unclipped.

    ``early_stopping_rounds`` is passed to the ``XGBRegressor`` constructor;
    recent xgboost releases no longer accept it as a ``fit()`` argument.
    NOTE(review): early stopping monitors the same fold that is scored, so the
    validation MSE is slightly optimistic.

    Args:
        X: Raw feature frame, ordered in time.
        y: Target series aligned with ``X``.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Returns:
        Mean validation MSE over the folds, as a Python float.
    """
    X, y = handle_inconsistencies(X, y)
    X, y = handle_duplicates(X, y)
    X, y = handle_missing_values(X, y)
    X, y = handle_categorical(X, y)
    X, y = feature_engineering(X, y)
    X = feature_selection(X)

    base_params = dict(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        reg_alpha=0.0,
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50
    )

    tscv = TimeSeriesSplit(n_splits=n_splits)
    train_scores, val_scores = [], []

    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        print(f"Processing fold {fold + 1}/{n_splits}...")
        X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_train, y_val = y.iloc[train_idx].copy(), y.iloc[val_idx].copy()

        # Data-dependent step: fit on this fold's training part only.
        X_train, y_train, X_val = handle_outliers(X_train, y_train, X_val)

        model = XGBRegressor(**base_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=False
        )

        y_tr_pred = model.predict(X_train)
        y_va_pred = model.predict(X_val)
        train_mse = mean_squared_error(y_train, y_tr_pred)
        val_mse   = mean_squared_error(y_val,   y_va_pred)
        train_scores.append(train_mse)
        val_scores.append(val_mse)

        print(f"Fold {fold + 1} Train MSE: {train_mse:.4f}, Validation MSE: {val_mse:.4f}")

    print("\nTrain MSE:")
    print(f"Mean: {np.mean(train_scores):.4f}, Max: {np.max(train_scores):.4f}, Min: {np.min(train_scores):.4f}")
    print("\nValidation MSE:")
    print(f"Mean: {np.mean(val_scores):.4f}, Max: {np.max(val_scores):.4f}, Min: {np.min(val_scores):.4f}")

    return float(np.mean(val_scores))

# -----------------
# Final train & predict for submission (XGB)
# -----------------
def train_and_predict_to_submit(X_train, y_train, X_test):
    """Preprocess, fit an ``XGBRegressor`` on all training rows and predict the test set.

    The frames go through the complete preprocessing chain (test data via the
    ``X_val`` path so its columns stay aligned with the training columns),
    remaining numeric gaps are filled with training medians, and several
    sanity checks run before the model is fitted.

    Args:
        X_train: Raw training features.
        y_train: Training target.
        X_test: Raw test features.

    Returns:
        The model's predictions for ``X_test``.

    Raises:
        ValueError: If NaNs remain after preprocessing, or if the train and
            test columns differ in content or order.
    """
    X_train, X_test, y_train = X_train.copy(), X_test.copy(), y_train.copy()

    # unified preprocessing with X_val path (keeps alignment)
    X_train, y_train, X_test = handle_inconsistencies(X_train, y_train, X_test)
    X_train, y_train, X_test = handle_duplicates(X_train, y_train, X_test)
    _assert_no_nans("y_train after duplicates", y_train)

    remaining_steps = (
        handle_missing_values,
        handle_categorical,
        handle_outliers,
        feature_engineering,
    )
    for step in remaining_steps:
        X_train, y_train, X_test = step(X_train, y_train, X_test)
    X_train, X_test = feature_selection(X_train, X_test)

    # final impute based on TRAIN medians only
    numeric_columns = X_train.select_dtypes(include="number").columns
    train_medians = X_train[numeric_columns].median()
    X_train[numeric_columns] = X_train[numeric_columns].fillna(train_medians)
    X_test[numeric_columns] = X_test[numeric_columns].fillna(train_medians)

    # enforce numeric
    X_train = _ensure_all_numeric(X_train, "X_train")
    X_test = _ensure_all_numeric(X_test, "X_test")

    # sanity checks
    for label, values in (("X_train", X_train), ("X_test", X_test), ("y_train", y_train)):
        _assert_no_nans(label, values)

    if list(X_train.columns) != list(X_test.columns):
        missing_in_test = [c for c in X_train.columns if c not in X_test.columns]
        extra_in_test = [c for c in X_test.columns if c not in X_train.columns]
        raise ValueError(
            "Train/Test columns misaligned.\n"
            f"Only in TRAIN: {missing_in_test}\n"
            f"Only in TEST:  {extra_in_test}"
        )

    model_params = dict(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        reg_alpha=0.0,
        random_state=42,
        n_jobs=-1,
    )
    final_model = XGBRegressor(**model_params)

    print(f"Training XGBRegressor on dataset: {X_train.shape}")
    final_model.fit(X_train, y_train, verbose=False)

    print(f"Predicting on test dataset: {X_test.shape}")
    return final_model.predict(X_test)

# -----------------
# Example Usage
# -----------------
# Load data
# Re-read both CSVs so this cell does not depend on frames changed by earlier cells.
df_train = pd.read_csv("module5_exercise_train.csv")
X_train = df_train.drop(columns=['electricity_demand'])
y_train = df_train['electricity_demand']
X_test  = pd.read_csv("module5_exercise_test.csv")

# (Optional) Preview & checks
# Runs the same preprocessing as the submission path and prints the audits.
X_pp, y_pp, Xtest_pp = preprocess_for_preview(X_train, y_train, X_test)
audit_frame(X_pp, "X_train_preprocessed")
audit_frame(Xtest_pp, "X_test_preprocessed")
compare_train_test(X_pp, Xtest_pp)

# (Optional) Cross-validated evaluation
# _ = evaluate_pipeline(X_train, y_train, n_splits=5)

# Final train & predict, then save
# The CSV holds a single electricity_demand column, one row per test row.
y_test_pred = train_and_predict_to_submit(X_train, y_train, X_test)
pd.DataFrame({'electricity_demand': y_test_pred}).to_csv("submission2.csv", index=False)
print("✅ Saved predictions to 'submission2.csv'")


In [None]:
import pandas as pd
import numpy as np

def view_and_validate_submission(submission_path: str,
                                 x_test_path: str = "module5_exercise_test.csv",
                                 target_col: str = "electricity_demand"):
    """Load a submission CSV, display it and run basic validity checks.

    Checks performed: the target column exists, the row count equals that of
    the test file, there are no NaNs, and the target column is numeric. The
    outcome of each check is printed; nothing is raised on failure.

    The numeric check uses ``pd.api.types.is_numeric_dtype`` because
    ``np.issubdtype`` raises ``TypeError`` on pandas extension dtypes, which
    would crash the check on exactly the non-numeric column it should report.

    Args:
        submission_path: Path of the submission CSV to inspect.
        x_test_path: Path of the test CSV used for the row-count comparison.
        target_col: Name of the prediction column expected in the submission.

    Returns:
        ``(sub, X_test)``: the loaded submission and test frames.
    """
    # Load files
    sub = pd.read_csv(submission_path)
    X_test = pd.read_csv(x_test_path)

    print("=== Basic Info ===")
    print(f"Submission shape: {sub.shape}")
    print(f"Submission columns: {list(sub.columns)}")
    print(f"Test shape: {X_test.shape}")
    print()

    # Preview
    print("=== Head (first 10 rows) ===")
    display(sub.head(10))

    print("\n=== Dtypes ===")
    print(sub.dtypes)

    # Checks
    print("\n=== Validation Checks ===")
    has_target = target_col in sub.columns

    # 1) Target column present
    if not has_target:
        print(f"❌ Missing required column: '{target_col}'")
    else:
        print(f"✅ Found target column: '{target_col}'")

    # 2) Row count matches test
    n_test = len(X_test)
    n_sub = len(sub)
    if n_sub == n_test:
        print(f"✅ Row count OK: submission rows = test rows = {n_sub}")
    else:
        print(f"❌ Row count mismatch: submission {n_sub} vs test {n_test}")

    # 3) No NaNs (per-column counts are computed once and reused)
    nan_per_column = sub.isna().sum()
    nan_total = int(nan_per_column.sum())
    if nan_total == 0:
        print("✅ No NaNs in submission")
    else:
        print(f"❌ Found {nan_total} NaNs in submission")
        print(nan_per_column[nan_per_column > 0])

    # 4) Numeric target
    if has_target:
        if pd.api.types.is_numeric_dtype(sub[target_col].dtype):
            print(f"✅ '{target_col}' is numeric")
        else:
            print(f"❌ '{target_col}' is not numeric (dtype={sub[target_col].dtype})")

    # 5) Quick stats
    if has_target:
        print("\n=== Target Summary ===")
        display(sub[target_col].describe())

    return sub, X_test

# Inspect your file
# Validates submission2.csv against the default test CSV path and keeps both
# loaded frames for further inspection.
sub, X_test_check = view_and_validate_submission("submission2.csv")


In [None]:
def train_and_predict_to_submit(X_train, y_train, X_test):
    """Preprocess both frames, fit a ``LinearRegression`` and predict the test set.

    Every preprocessing step is called through its three-argument form, so the
    test frame is transformed alongside the training frame.

    Args:
        X_train: Raw training features.
        y_train: Training target.
        X_test: Raw test features.

    Returns:
        The model's predictions for ``X_test``.
    """
    # Defensive copies (avoid SettingWithCopy warnings downstream)
    X_train, X_test = X_train.copy(), X_test.copy()

    # === Preprocess: each step takes and returns (X_train, y_train, X_test) ===
    preprocessing_steps = (
        handle_inconsistencies,
        handle_duplicates,
        handle_missing_values,
        handle_categorical,
        handle_outliers,
        feature_engineering,
    )
    for step in preprocessing_steps:
        X_train, y_train, X_test = step(X_train, y_train, X_test)

    # Feature selection returns 2 values when X_val is provided
    X_train, X_test = feature_selection(X_train, X_test)

    # === Train & predict ===
    model = LinearRegression()
    print(f"Training model on entire dataset of shape: {X_train.shape}")
    model.fit(X_train, y_train)

    print(f"Predicting on test dataset of shape: {X_test.shape}")
    return model.predict(X_test)


In [None]:
# Reload the raw CSVs and split the training frame into features and target.
df_train = pd.read_csv("module5_exercise_train.csv")
X_train = df_train.drop(columns=['electricity_demand'])
y_train = df_train['electricity_demand']

X_test = pd.read_csv("module5_exercise_test.csv")

# Uses whichever train_and_predict_to_submit definition was executed last.
y_test_pred = train_and_predict_to_submit(X_train, y_train, X_test)

In [None]:
# Call train_and_predict_to_submit to train and predict.
# NOTE(review): this repeats the call made in the previous cell with the same
# arguments, so the model is fitted a second time.
y_test_pred = train_and_predict_to_submit(X_train, y_train, X_test)

In [None]:
# Generating Submission File
# Pairs each raw test date with its prediction; y_test_pred must hold one
# value per row of X_test.
submission = pd.DataFrame({
    'date': X_test['date'],
    'electricity_demand': y_test_pred
})

# Save the submission file
# Unlike submission2.csv written earlier, this file also carries the date column.
submission.to_csv('submission.csv', index=False, sep=',')
print("Submission file saved as 'submission.csv'.")

In [None]:
import pandas as pd

In [None]:
# Display the predictions file written by the XGBoost cell
pd.read_csv('submission2.csv')

In [None]:
# Display the raw test frame
df_test

In [None]:
# Missing values per column in the raw training frame
df_train.isnull().sum()

In [None]:
# pop removes the target column from df_train in place and returns it, so
# running this cell a second time raises KeyError until df_train is reloaded.
y = df_train.pop('electricity_demand')

In [None]:
# Display the training frame (the target column was popped in the cell above)
df_train

In [None]:
# Display the target series popped from df_train
y

In [None]:
def handle_missing_values(X_train, y_train, X_val=None):
    """Fill gaps in the numeric weather columns and in ``weather_condition``.

    Whichever of the listed numeric columns exist are forward-filled and then
    backward-filled (the backward pass closes gaps at the start of a frame);
    missing ``weather_condition`` values become ``'Unknown'``.

    The frames are copied first. Writing through ``.loc`` on the arguments
    would change the caller's DataFrame as a side effect, so the raw data
    could no longer be inspected after the call.

    Args:
        X_train: Training features.
        y_train: Training target, passed through unchanged.
        X_val: Optional validation/test features.

    Returns:
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)``; the frames
        are copies of the inputs.
    """
    numeric_features = [
        'humidity', 'wind_speed',
        'temperature_station1', 'temperature_station2', 'temperature_station3',
        'temperature_station4', 'temperature_station5', 'temperature_station6',
        'temperature_station7', 'temperature_station8', 'temperature_station9',
        'temperature_station10'
    ]

    def _fill(frame):
        frame = frame.copy()
        present = [c for c in numeric_features if c in frame.columns]
        if present:
            # Forward then backward fill to remove leading/trailing NaNs
            frame[present] = frame[present].ffill().bfill()
        if 'weather_condition' in frame.columns:
            frame['weather_condition'] = frame['weather_condition'].fillna('Unknown')
        return frame

    X_train = _fill(X_train)
    if X_val is not None:
        return X_train, y_train, _fill(X_val)

    return X_train, y_train



In [None]:
# Try the missing-value step on the training frame; y is passed through unchanged.
x,y = handle_missing_values(df_train, y)

In [None]:
# Missing values left per column after the fill
x.isnull().sum()