In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

### Data Collection

In [None]:
import requests

# URLs of the train and test CSV files for this exercise
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download `url` and write the response body to `file_name`.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the downloaded bytes are written to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block indefinitely on a silent host.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
for source_url, target_name in (
    (train_data_url, 'module5_exercise_train.csv'),
    (test_data_url, 'module5_exercise_test.csv'),
):
    download_file(source_url, target_name)

In [None]:
# Load the downloaded CSVs (comma is pandas' default separator).
df_train = pd.read_csv("module5_exercise_train.csv")
df_test = pd.read_csv("module5_exercise_test.csv")

### Data analysis

In [None]:
# Stack train and test rows so the exploration below covers both files.
data = pd.concat([df_train, df_test], axis=0)

# Only the last expression of a cell is rendered, so show both shapes together.
df_train.shape, df_test.shape

In [None]:
def plot_feature_over_time(df, feature, date_id_start, date_id_end):
    """Plot one column of `df` against its `date` column over a date window.

    Rows are kept when `date_id_start <= date <= date_id_end` (both ends
    inclusive). If `feature` is not a column of `df`, a message is printed
    and nothing is drawn.
    """
    in_window = df['date'].between(date_id_start, date_id_end)
    window = df[in_window]

    if feature not in window.columns:
        print(f"Feature '{feature}' not found in the DataFrame.")
        return

    plt.figure(figsize=(10, 6))
    plt.plot(window['date'], window[feature], label=feature, linestyle='-')
    plt.title(f'{feature} from {date_id_start} to {date_id_end}')
    plt.xlabel('Date')
    plt.ylabel(feature)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()



##### Exploring Inconsistencies

##### Mixed Type Columns

In [None]:
# List the Python types found in each column; more than one type means cleaning is needed.
for name, values in data.items():
    print(name, values.apply(type).unique())

print("\nWeather Condition Values\n")
print(data["weather_condition"].unique())

print("\nWind Speed Non str Values\n")
wind_speed_raw = data["wind_speed"]
is_float_entry = wind_speed_raw.apply(type) == float
print(wind_speed_raw[is_float_entry].unique())

- `weather_condition` must be cast into 4 0-1 columns for `['Cloudy' 'Sunny' 'Rainy' 'Snowy' nan]`
- `wind_speed` missing values must be handled, must be cast to float

##### Missing Values

In [None]:

# Count nulls per column, largest first, keeping only columns that have any.
missing_counts = data.isnull().sum().sort_values(ascending=False)
missing_counts = missing_counts[missing_counts > 0]

fig, ax = plt.subplots(figsize=(8, 4))
missing_counts.plot(kind='bar', ax=ax)
ax.set_title('Bar Chart of Missing Values Count')
ax.set_xlabel('Columns')
ax.set_ylabel('Number of Missing Values')
plt.show()

print("Percentage of missing values per column:")
missing_percentages = data.isnull().sum() / len(data) * 100
print(missing_percentages.round(2))

In [None]:
# Same helper, three (feature, start, end) windows.
for feature_name, window_start, window_end in (
    ('humidity', '2000-01-01', '2020-09-01'),
    ('humidity', '2018-01-01', '2018-09-01'),
    ('temperature_station1', '2017-01-01', '2017-09-01'),
):
    plot_feature_over_time(data, feature_name, window_start, window_end)

- `temperature_station` missing values can be handled with a 3-nearest average
- `weather_condition` should not be too much of a problem if we split it into four 0-1 columns anyway
- `humidity` by linear interpolation, aberrant values must be trimmed beforehand
- `wind_speed` must be cast

In [None]:
# Display the combined train + test frame built above.
data

- `humidity` missing values : interpolation
- `wind_speed` unify units and cast to float
- `oil_brent_price_indicator` to ordered category
- `temperature_station*` interpolate missing values ?
- `date` must be split to reflect seasonal changes

In [None]:
# Inspect the raw wind_speed column before any parsing.
data['wind_speed']

- `wind_speed` unify units and cast to float

In [None]:
# Plot the electricity_demand column between 2017-01-01 and 2019-09-07.
plot_feature_over_time(data, 'electricity_demand', '2017-01-01', '2019-09-07')

- `electricity_demand` : trim (far) outliers, interpolate missing or trimmed values 

**TODO** : Compare `weather_condition` with `humidity` ?

#### Summary

- `date` must be split to reflect seasonal changes
- `weather_condition` must be cast into 4 0-1 columns for `['Cloudy' 'Sunny' 'Rainy' 'Snowy' nan]`, missing values should then not be a major problem
- `wind_speed` missing values must be handled, must be cast to float, units unified
- `electricity_demand` : trim (far) outliers, interpolate missing or trimmed values 
- `humidity` by linear interpolation, aberrant values must be trimmed beforehand
- `oil_brent_price_indicator` to ordered category `['Moderate', 'High', 'Low', 'Very Low', 'Very High']`
- `temperature_station` missing values can be handled with a 3-nearest average

In [None]:
# List the distinct labels found in the oil_brent_price_indicator column.
data["oil_brent_price_indicator"].unique()

### Data Preprocessing Evaluation Strategy

In [None]:
from typing import (
    Optional
)

import re
from scipy import stats
import datetime

# 1. Handle Inconsistencies


def _handle_inconsistencies(data: pd.DataFrame) -> pd.DataFrame:
    """Parse and convert wind_speed."""
    data = data.copy()

    wind_speed_regex = r"^\s*(\d+\.\d+)\s*(km\/h|m\/s)\s*$"

    def parse_wind_speed(wind_speed: str | float) -> float:
        if not isinstance(wind_speed, str):
            return np.nan
        result = re.search(wind_speed_regex, wind_speed)
        if not result:
            return np.nan
        value, unit = result.groups()

        try:
            value = float(value)
        except Exception:
            return np.nan

        match unit:
            case "km/h":
                temporal=value/3
                return temporal
            case "m/s":
                return value
            case _:
                raise Exception("Bad regex stupid")

    data["wind_speed"] = data["wind_speed"].apply(parse_wind_speed)
    data["date"] = pd.to_datetime(data["date"])

    return data


def handle_inconsistencies(X_train: pd.DataFrame, y_train: pd.DataFrame, X_val: pd.DataFrame | None = None):
    """Apply `_handle_inconsistencies` to the feature frames.

    Returns `(X_train, y_train)` when `X_val` is None, otherwise
    `(X_train, y_train, X_val)`. `y_train` is passed through unchanged.
    """
    cleaned_train = _handle_inconsistencies(X_train)
    if X_val is None:
        return cleaned_train, y_train
    return cleaned_train, y_train, _handle_inconsistencies(X_val)


# 2. Handling Duplicates
def handle_duplicates(X_train, y_train, X_val=None):
    """Duplicate-handling step; currently copies the frames without dropping rows.

    Without `X_val`, returns `(X_train copy, y_train re-selected by that copy's
    index)`. With `X_val`, returns `(X_train copy, y_train, X_val copy)` and
    `y_train` is passed through as-is.
    """
    train_copy = X_train.copy()
    if X_val is None:
        # Keep y aligned with the rows retained in X.
        return train_copy, y_train.loc[train_copy.index]
    return train_copy, y_train, X_val.copy()


def _handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
    """Interpolate all numeric columns."""
    data = data.copy()
    for column in data.select_dtypes(include="number").columns.tolist():
        data[column] = data[column].interpolate(limit_direction="both")

    return data

# 3. Handling Missing Values


def handle_missing_values(X_train, y_train, X_val=None):
    """Interpolate numeric gaps in the feature frames.

    Returns the filled `X_train` alone when `X_val` is None, otherwise
    `(X_train, X_val)`. `y_train` is accepted but not used or returned.
    """
    # Linear interpolation should do it
    # weather condition is not handled, we'll do it later
    filled_train = _handle_missing_values(X_train)
    if X_val is None:
        return filled_train
    return filled_train, _handle_missing_values(X_val)

# 4. Handling Categorical Values


def _handle_categorical(data: pd.DataFrame) -> pd.DataFrame:
    """Make weather_condition into 4 boolean columns then drop it. Cast oil_brent_price_indicator to ordered categories."""
    data = data.copy()

    data["oil_brent_price_indicator"] = data["oil_brent_price_indicator"].map({
        "Very Low": 1,
        "Low": 2,
        "Moderate": 3,
        "High": 4,
        "Very High": 5
    })

    data["sunny"] = (data["weather_condition"] == "Sunny").astype(int)
    data["cloudy"] = (data["weather_condition"] == "Cloudy").astype(int)
    data["rainy"] = (data["weather_condition"] == "Rainy").astype(int)
    data["snowy"] = (data["weather_condition"] == "Snowy").astype(int)
    data = data.drop("weather_condition", axis=1)

    return data


def handle_categorical(X_train, y_train, X_val=None):
    """Encode categorical columns of the feature frames.

    Returns the encoded `X_train` alone when `X_val` is None, otherwise
    `(X_train, X_val)`. `y_train` is accepted but not used or returned.
    """
    encoded_train = _handle_categorical(X_train)
    if X_val is None:
        return encoded_train
    return encoded_train, _handle_categorical(X_val)

# 5. Handling Outliers


def _handle_outliers(data: pd.DataFrame | pd.Series) -> pd.DataFrame | pd.Series:
    """Replace numeric values 5 sigmas from their mean."""
    data = data.copy()

    if isinstance(data, pd.Series):
        z_scores = np.abs(stats.zscore(data))
        data[z_scores > 5] = data.median()
    else:
        for column in data.select_dtypes(include="number").columns.tolist():
            z_scores = np.abs(stats.zscore(data[column]))
            data.loc[z_scores > 5, column] = data[column].median()

    return data


def handle_outliers(X_train, y_train, X_val=None):
    """Run outlier replacement on the features and on the target.

    Returns `(X_train, y_train)` when `X_val` is None, otherwise
    `(X_train, y_train, X_val)`; each item is processed independently.
    """
    cleaned_X = _handle_outliers(X_train)
    cleaned_y = _handle_outliers(y_train)
    if X_val is None:
        return cleaned_X, cleaned_y
    return cleaned_X, cleaned_y, _handle_outliers(X_val)

# 6. Feature Engineering


def _feature_engineering(data: pd.DataFrame) -> pd.DataFrame:
    """Encode year's day into a rotating representation. Drop various temperature stations but add their average."""
    data = data.copy()

    data["day_sin"] = np.sin(
        2 * np.pi * data["date"].apply(lambda date: date.timetuple().tm_yday) / 366)
    data["day_cos"] = np.cos(
        2 * np.pi * data["date"].apply(lambda date: date.timetuple().tm_yday) / 366)
    data = data.drop("date", axis=1)

    data["temperature_avg"] = np.mean(
        data[[column for column in data.columns if "temperature_station" in column]], axis=1)
    data = data.drop(
        [column for column in data.columns if "temperature_station" in column], axis=1)

    return data


def feature_engineering(X_train, y_train, X_val=None):
    """Apply `_feature_engineering` to the feature frames.

    Returns `(X_train, y_train)` when `X_val` is None, otherwise
    `(X_train, y_train, X_val)`. `y_train` is passed through unchanged.
    """
    engineered_train = _feature_engineering(X_train)
    if X_val is None:
        return engineered_train, y_train
    return engineered_train, y_train, _feature_engineering(X_val)

# 7. Feature Selection and Dimensionality Reduction


def feature_selection(X_train, y_train, X_val=None):
    """Feature-selection step; currently keeps every feature.

    Returns `X_train` alone when `X_val` is None, otherwise
    `(X_train, X_val)`. `y_train` is accepted but not used or returned.
    """
    if X_val is None:
        return X_train
    return X_train, X_val

In [None]:
# Run every preprocessing step on the combined frame to preview the result.
_data = (
    data
    .pipe(_handle_inconsistencies)
    .pipe(_handle_missing_values)
    .pipe(_handle_categorical)
    .pipe(_handle_outliers)
    .pipe(_feature_engineering)
)
_data.describe()



In [None]:
def evaluate_pipeline(X, y, n_splits=5):
    """Preprocess the data once, then score a linear regression with time-series CV.

    Args:
        X: Raw feature frame.
        y: Target series, row-aligned with `X`.
        n_splits: Number of `TimeSeriesSplit` folds.

    Returns:
        The mean validation MSE over all folds.
    """
    ### call transformations here, if there is no learning and no need to be crossval
    # NOTE(review): these steps run on all rows before splitting, so the
    # statistics they compute include rows later used for validation.
    X, y = handle_inconsistencies(X, y)
    X, y = handle_duplicates(X, y)
    X = handle_missing_values(X, y)
    X = handle_categorical(X, y)
    X, y = handle_outliers(X, y)
    X, y = feature_engineering(X, y)
    X = feature_selection(X, y)

    model = LinearRegression()
    splitter = TimeSeriesSplit(n_splits=n_splits)

    train_scores, val_scores = [], []

    for fold, (train_index, val_index) in enumerate(splitter.split(X)):
        print(f"Processing fold {fold + 1}/{n_splits}...")

        X_train, y_train = X.iloc[train_index].copy(), y.iloc[train_index].copy()
        X_val, y_val = X.iloc[val_index].copy(), y.iloc[val_index].copy()
        model.fit(X_train, y_train)

        # Score the fitted model on the rows it saw and on the held-out rows.
        train_mse = mean_squared_error(y_train, model.predict(X_train))
        val_mse = mean_squared_error(y_val, model.predict(X_val))
        train_scores.append(train_mse)
        val_scores.append(val_mse)

        print(f"Fold {fold + 1} Train MSE: {train_mse:.4f}, Validation MSE: {val_mse:.4f}")

    # Summarise the per-fold scores.
    mean_train_mse, max_train_mse, min_train_mse = (
        np.mean(train_scores), np.max(train_scores), np.min(train_scores)
    )
    mean_val_mse, max_val_mse, min_val_mse = (
        np.mean(val_scores), np.max(val_scores), np.min(val_scores)
    )

    print("\nTrain MSE:")
    print(f"Mean: {mean_train_mse:.4f}, Max: {max_train_mse:.4f}, Min: {min_train_mse:.4f}")

    print("\nValidation MSE:")
    print(f"Mean: {mean_val_mse:.4f}, Max: {max_val_mse:.4f}, Min: {min_val_mse:.4f}")

    return mean_val_mse  # Return mean validation MSE as the overall score

In [None]:
# Prepare X and y
X = df_train.drop(columns=['electricity_demand'])
y = df_train['electricity_demand'].copy()

# Run the evaluation
evaluate_pipeline(X, y)

### Generating Submission File

In [None]:

# Reload the raw files so the submission starts from untouched data.
df_train = pd.read_csv("module5_exercise_train.csv")

X_train = df_train.drop(columns=['electricity_demand'])
y_train = df_train['electricity_demand']

X_test = pd.read_csv("module5_exercise_test.csv")

In [None]:
def train_and_predict_to_submit(X_train, y_train, X_test):
    """Preprocess train and test, fit a linear regression, and predict the test set.

    Args:
        X_train: Raw training features.
        y_train: Training target.
        X_test: Raw test features.

    Returns:
        The model's predictions for `X_test`.
    """
    X_train, y_train, X_test = handle_inconsistencies(X_train, y_train, X_test)
    X_train, y_train, X_test = handle_duplicates(X_train, y_train, X_test)
    X_train, X_test = handle_missing_values(X_train, y_train, X_test)
    X_train, X_test = handle_categorical(X_train, y_train, X_test)
    X_train, y_train, X_test = handle_outliers(X_train, y_train, X_test)
    X_train, y_train, X_test = feature_engineering(X_train, y_train, X_test)
    X_train, X_test = feature_selection(X_train, y_train, X_test)

    regressor = LinearRegression()

    # Train the model on the entire training set
    print(f"Training model on entire dataset of shape: {X_train.shape}")
    regressor.fit(X_train, y_train)

    # Predict on the test set
    print(f"Predicting on test dataset of shape: {X_test.shape}")
    return regressor.predict(X_test)

In [None]:

# Fit on the full training set and predict the test set.
y_test_pred = train_and_predict_to_submit(X_train, y_train, X_test)

In [None]:
# Generating Submission File
# Pair each raw test date with its predicted demand.
submission = X_test[['date']].assign(electricity_demand=y_test_pred)

# Save the submission file
submission.to_csv('submission.csv', index=False, sep=',')
print("Submission file saved as 'submission.csv'.")