In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

### Data Collection

In [None]:
import requests

# Remote locations of the exercise datasets (both files live in the same folder)
_EXERCISE_BASE_URL = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise'
train_data_url = f'{_EXERCISE_BASE_URL}/module5_exercise_train.csv'
test_data_url = f'{_EXERCISE_BASE_URL}/module5_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the downloaded bytes are written to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail here, before anything is written to disk
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both datasets into the working directory
for source_url, local_name in (
    (train_data_url, 'module5_exercise_train.csv'),
    (test_data_url, 'module5_exercise_test.csv'),
):
    download_file(source_url, local_name)

In [None]:
# Load the two downloaded comma-separated files into DataFrames
df_train, df_test = (
    pd.read_csv(csv_path, sep=",")
    for csv_path in ("module5_exercise_train.csv", "module5_exercise_test.csv")
)

### Data analysis

In [None]:
#### Make a complete analysis on data preprocessing
# Inconsistencies
# Duplicates (data.duplicated().sum())
# Missing values (data.isnull().sum())
# Categorical
# Outliers
# Feature Engineering
# Feature Selection and/or Dimensionality Reduction

In [None]:
# Stack train and test so the exploration below sees every row.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of df_train and df_test repeat, which makes any later
# label-based alignment on `data` ambiguous.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)
data.shape

In [None]:
# Print the shape of the training set, then show the frame as the cell output
print(df_train.shape)
df_train

In [None]:
print(df_test.shape)
df_test
# One column fewer than the training set: electricity demand is the value to predict

In [None]:
# Print the shape of the combined frame, then show the number of distinct values per column
print(data.shape)
data.nunique()

In [None]:
duplicated_date_count = data['date'].duplicated().sum()
print(f"Dates dupliquées : {duplicated_date_count}")
# Drop the rows whose date is repeated, keeping the last occurrence of each date
data = data.drop_duplicates(subset=['date'], keep='last')

In [None]:
def plot_feature_over_time(df, feature, date_id_start, date_id_end):
    """Plot one column of ``df`` against its ``date`` column over a date window.

    Args:
        df: DataFrame with a ``date`` column and the column named ``feature``.
        feature: Name of the column drawn on the y-axis.
        date_id_start: First date of the window (inclusive).
        date_id_end: Last date of the window (inclusive).

    Returns:
        None. When ``feature`` is not a column of ``df`` a message is printed
        and nothing is plotted.
    """
    # Check the column first so nothing is filtered for an unknown feature
    if feature not in df.columns:
        print(f"Feature '{feature}' not found in the DataFrame.")
        return

    in_window = (df['date'] >= date_id_start) & (df['date'] <= date_id_end)
    # Sort chronologically: the line joins the points in row order, so rows
    # that are not stored in date order would otherwise be drawn as a zigzag.
    df_filtered = df.loc[in_window].sort_values('date')

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df_filtered['date'], df_filtered[feature], label=feature, linestyle='-')
    ax.set_xlabel('Date')
    ax.set_ylabel(feature)
    ax.set_title(f'{feature} from {date_id_start} to {date_id_end}')
    ax.tick_params(axis='x', labelrotation=45)
    ax.grid(True)
    ax.legend()
    fig.tight_layout()
    plt.show()



In [None]:
# Parse the date column into datetime values (needed later for the .dt accessor)
data['date'] = pd.to_datetime(data['date'])

In [None]:
# Show the combined frame after the date conversion
data

In [None]:
# Show the wind_speed column as loaded, before the unit conversion done in the next cell
data['wind_speed']

In [None]:
# wind_speed is stored as text: a number followed by a unit. The conversion
# below keeps "m/s" readings and divides everything else by 3.6, i.e. it
# assumes the only other unit is km/h — TODO confirm against the raw data.
#
# Only convert while the column is still text, so that re-running this cell is
# a no-op instead of failing on the already numeric column.
if not pd.api.types.is_numeric_dtype(data["wind_speed"]):
    # Split every reading into its numeric value (column 0) and its unit (column 1)
    wind_parts = data["wind_speed"].str.extract(r'([\d.]+)\s*(\w+/?.*)')
    wind_value = wind_parts[0].astype(float)
    wind_unit = wind_parts[1]

    # Vectorised conversion to m/s; readings that could not be parsed stay NaN
    data["wind_speed"] = wind_value.where(wind_unit == "m/s", wind_value / 3.6)

data

In [None]:
# Plot the target over the 2017-01-01 .. 2019-09-07 window
plot_feature_over_time(data, 'electricity_demand', '2017-01-01', '2019-09-07')

In [None]:
# Count the negative demand values
is_negative = data["electricity_demand"] < 0
print(is_negative.sum())
# Only one shows up, so it is replaced by the mean. The mean is taken over the
# non-negative values only, so the bad reading does not bias its own replacement.
valid_mean = data["electricity_demand"].mask(is_negative).mean()
data["electricity_demand"] = data["electricity_demand"].mask(is_negative, valid_mean)

plot_feature_over_time(data, 'electricity_demand', '2017-01-01', '2019-09-07')


In [None]:
# Disabled step, kept for reference. Observation: the rows are not stored in
# chronological order. Sorting them by date would look like this:
#
# data = data.sort_values("date")
#
# plot_feature_over_time(data, 'electricity_demand', '2017-01-01', '2019-09-07')

In [None]:
# Plot humidity over the 2016-06-01 .. 2019-09-07 window
plot_feature_over_time(data, 'humidity', '2016-06-01', '2019-09-07')

In [None]:
# Fill the missing stretches of humidity by linear interpolation
humidity_interpolated = data["humidity"].interpolate(method="linear")
data["humidity"] = humidity_interpolated

plot_feature_over_time(data, 'humidity', '2016-06-01', '2016-12-01')


In [None]:
# Fill whatever is still missing: carry the previous row's value forward, then
# back-fill any gap left at the start. ffill()/bfill() replace
# fillna(method=...), which is deprecated since pandas 2.1.
data = data.ffill().bfill()

In [None]:
# List the distinct values of each categorical column
print("Unique values in each column:")
columns = ["weather_condition", "oil_brent_price_indicator"]
for column in columns:
    column_values = data[column]
    distinct_count = column_values.nunique()
    print(f"{column}: {distinct_count} - {column_values.unique()}")

In [None]:
# Replace the weather labels by numeric codes. The explicit categories list
# fixes the order used for the codes (Snowy first, Sunny last).
encoder = OrdinalEncoder(categories=[['Snowy','Rainy','Cloudy', 'Sunny']])
data['weather_condition'] = encoder.fit_transform(data[['weather_condition']])
data

In [None]:
# Same treatment for the oil price indicator, with the levels listed from
# 'Very Low' to 'Very High'.
encoder2 = OrdinalEncoder(categories=[['Very Low','Low','Moderate','High','Very High']])
data['oil_brent_price_indicator'] = encoder2.fit_transform(data[['oil_brent_price_indicator']])
data

In [None]:
def add_datetime_features(X):
    """Add ``year``, ``month`` and ``day`` columns taken from ``X['date']``.

    The columns are written into ``X`` itself, and that same frame is returned.
    """
    date_parts = X['date'].dt
    for part in ('year', 'month', 'day'):
        X[part] = getattr(date_parts, part)
    return X
# Add the year/month/day columns to the exploration frame and show the result
data = add_datetime_features(data)
data

### Data Preprocessing Evaluation Strategy

In [None]:
# Provide a complete set of data preprocessing transformations

In [None]:
# 1. Handle Inconsistencies
def _wind_speed_to_ms(wind_speed):
    """Convert text readings made of a number and a unit to floats in m/s."""
    # Column 0 holds the numeric part of each reading, column 1 its unit
    parts = wind_speed.str.extract(r'([\d.]+)\s*(\w+/?.*)')
    value = parts[0].astype(float)
    # Readings tagged "m/s" are kept as they are; any other unit is divided by
    # 3.6, i.e. assumed to be km/h. Readings that cannot be parsed become NaN.
    return value.where(parts[1] == "m/s", value / 3.6)


def _fix_inconsistencies(X):
    """Return a copy of ``X`` with a parsed ``date`` and a numeric ``wind_speed``."""
    X = X.copy()
    X['date'] = pd.to_datetime(X['date'])
    X["wind_speed"] = _wind_speed_to_ms(X["wind_speed"])
    return X


def handle_inconsistencies(X_train, y_train, X_val=None):
    """Normalise the ``date`` and ``wind_speed`` columns.

    ``date`` is parsed to datetime and ``wind_speed`` (text such as a number
    followed by a unit) is converted to a float in m/s. The frames passed in
    are not modified, so the function can be called again on the same raw
    frames without failing on an already converted ``wind_speed`` column.

    Args:
        X_train: Training features with text ``date`` and ``wind_speed`` columns.
        y_train: Training target, returned unchanged.
        X_val: Optional validation/test features, processed the same way.

    Returns:
        ``(X_train, y_train)`` when ``X_val`` is None, otherwise
        ``(X_train, y_train, X_val)``; the frames are new copies.
    """
    X_train = _fix_inconsistencies(X_train)
    if X_val is None:
        return X_train, y_train
    return X_train, y_train, _fix_inconsistencies(X_val)

# 2. Handling Duplicates
def handle_duplicates(X_train, y_train, X_val=None):
    """Drop fully duplicated training rows and keep the target aligned.

    Args:
        X_train: Training features; rows identical to an earlier row are dropped.
        y_train: Training target with one value per row of ``X_train``.
        X_val: Optional validation/test features.

    Returns:
        ``(X_train, y_train)`` when ``X_val`` is None, otherwise
        ``(X_train, y_train, X_val)``.

    ``X_val`` is returned as a copy with all of its rows: no target is passed
    in for it, so dropping rows here would leave the caller with fewer
    predictions than rows it asked about.
    """
    # Select by position with a boolean mask instead of by index label, so the
    # target stays aligned even when the index contains repeated labels.
    keep = ~X_train.duplicated().to_numpy()
    X_train_no_duplicates = X_train.loc[keep].copy()
    y_train_no_duplicates = y_train.loc[keep]
    if X_val is None:
        return X_train_no_duplicates, y_train_no_duplicates
    return X_train_no_duplicates, y_train_no_duplicates, X_val.copy()

# 3. Handling Missing Values
def _fill_missing(X):
    """Return a copy of ``X`` with its missing values filled."""
    X = X.copy()
    # Gaps in humidity are filled by linear interpolation between neighbouring rows
    X["humidity"] = X["humidity"].interpolate(method="linear")
    # Everything still missing: previous row's value, then next row's value for
    # leading gaps. ffill()/bfill() replace the deprecated fillna(method=...).
    return X.ffill().bfill()


def handle_missing_values(X_train, y_train, X_val=None):
    """Fill missing values in the feature frames.

    ``humidity`` is interpolated linearly; every remaining gap is forward-filled
    and then back-filled. Each frame is filled from its own rows only, and the
    frames passed in are not modified.

    Args:
        X_train: Training features, must contain a ``humidity`` column.
        y_train: Unused; kept so every pipeline step shares the same signature.
        X_val: Optional validation/test features, filled the same way.

    Returns:
        The filled ``X_train`` when ``X_val`` is None, otherwise
        ``(X_train, X_val)``.
    """
    X_train = _fill_missing(X_train)
    if X_val is None:
        return X_train
    return X_train, _fill_missing(X_val)

# 4. Handling Categorical Values
def handle_categorical(X_train, y_train, X_val=None):
    """Ordinal-encode ``weather_condition`` and ``oil_brent_price_indicator``.

    One encoder per column is fitted on ``X_train`` and then applied to
    ``X_val`` when it is given. The encoded values are written into the frames
    that were passed in; copies of those frames are returned.

    Args:
        X_train: Training features containing both categorical columns.
        y_train: Unused; kept so every pipeline step shares the same signature.
        X_val: Optional validation/test features.

    Returns:
        The encoded ``X_train`` when ``X_val`` is None, otherwise
        ``(X_train, X_val)``.
    """
    # Levels are listed in the order used for the numeric codes
    ordered_levels = {
        'weather_condition': ['Snowy','Rainy','Cloudy', 'Sunny'],
        'oil_brent_price_indicator': ['Very Low','Low','Moderate','High','Very High'],
    }

    fitted_encoders = {}
    for column_name, levels in ordered_levels.items():
        column_encoder = OrdinalEncoder(categories=[levels])
        X_train[column_name] = column_encoder.fit_transform(X_train[[column_name]])
        fitted_encoders[column_name] = column_encoder

    if X_val is None:
        return X_train.copy()

    # Reuse the encoders fitted on the training data
    for column_name, column_encoder in fitted_encoders.items():
        X_val[column_name] = column_encoder.transform(X_val[[column_name]])
    return X_train.copy(), X_val.copy()

# 5. Handling Outliers
def handle_outliers(X_train, y_train, X_val=None):
    """Clip ``humidity`` outliers and replace negative target values.

    ``humidity`` is clipped to ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. The bounds
    are computed on ``X_train`` and the same bounds are applied to ``X_val``,
    so both frames are treated identically and nothing is learned from the
    validation/test rows. Negative values of ``y_train`` are replaced by the
    mean of its non-negative values, so the invalid readings do not bias their
    own replacement. The inputs are not modified.

    Args:
        X_train: Training features containing a ``humidity`` column.
        y_train: Training target.
        X_val: Optional validation/test features.

    Returns:
        ``(X_train, y_train)`` when ``X_val`` is None, otherwise
        ``(X_train, y_train, X_val)``.
    """
    clipped_columns = ['humidity']

    X_train = X_train.copy()
    if X_val is not None:
        X_val = X_val.copy()

    for column in clipped_columns:
        # Bounds come from the training data, before it is clipped
        q1 = X_train[column].quantile(0.25)
        q3 = X_train[column].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        X_train[column] = X_train[column].clip(lower_bound, upper_bound)
        if X_val is not None:
            X_val[column] = X_val[column].clip(lower_bound, upper_bound)

    is_negative = y_train < 0
    # mask() blanks the negative values so that mean() skips them
    valid_mean = y_train.mask(is_negative).mean()
    y_train = y_train.mask(is_negative, valid_mean)

    if X_val is None:
        return X_train, y_train
    return X_train, y_train, X_val

# 6. Feature Engineering
def feature_engineering(X_train, y_train, X_val=None):
    """Add ``year``, ``month`` and ``day`` columns derived from ``date``.

    The frames passed in are not modified; new frames carrying the extra
    columns are returned.

    Args:
        X_train: Training features with a datetime ``date`` column.
        y_train: Training target, returned unchanged.
        X_val: Optional validation/test features with a datetime ``date`` column.

    Returns:
        ``(X_train, y_train)`` when ``X_val`` is None, otherwise
        ``(X_train, y_train, X_val)``.
    """
    def add_datetime_features(X):
        # assign() builds a new frame, so the caller's X is left untouched
        dates = X['date'].dt
        return X.assign(year=dates.year, month=dates.month, day=dates.day)

    X_train = add_datetime_features(X_train)
    if X_val is None:
        return X_train, y_train
    return X_train, y_train, add_datetime_features(X_val)

# 7. Feature Selection and Dimensionality Reduction
def feature_selection(X_train, y_train, X_val=None):
    """Keep only the columns used by the model, in a fixed order.

    Args:
        X_train: Training features containing every selected column.
        y_train: Unused; kept so every pipeline step shares the same signature.
        X_val: Optional validation/test features.

    Returns:
        The selected columns of ``X_train`` when ``X_val`` is None, otherwise
        the pair ``(X_train, X_val)`` restricted to the same columns.
    """
    temperature_columns = [f'temperature_station{station}' for station in range(1, 11)]
    selected_columns = (
        ['weather_condition', 'humidity', 'wind_speed', 'oil_brent_price_indicator']
        + temperature_columns
        + ['year', 'month', 'day']
    )

    train_selected = X_train[selected_columns]
    if X_val is None:
        return train_selected
    return train_selected, X_val[selected_columns]

In [None]:
def evaluate_pipeline(X, y, n_splits=5):
    """Score the preprocessing pipeline with a LinearRegression under TimeSeriesSplit.

    The preprocessing steps called before the loop run once on the full
    ``X``/``y``; ``handle_categorical`` runs inside every fold with the fold's
    training part as its first argument. Train and validation MSE are printed
    for each fold, followed by their mean, max and min.

    Args:
        X: Feature frame handed to the ``handle_*`` steps.
        y: Target values for the rows of ``X``.
        n_splits: Number of folds passed to ``TimeSeriesSplit``.

    Returns:
        Tuple ``(mean_val_mse, X_trains, X_vals, y_trains, y_vals)``: the mean
        validation MSE over the folds and the per-fold splits, in fold order.

    NOTE(review): nothing here sorts the rows by date, and TimeSeriesSplit cuts
    by row position — the folds are chronological only if ``X`` already is.
    """

    ### call transformations here, if there is no learning and no need to be crossval
    X, y = handle_inconsistencies(X, y)
    X, y = handle_duplicates(X, y)
    X = handle_missing_values(X, y)
    # X = handle_categorical(X, y)
    X, y = handle_outliers(X, y)
    X, y = feature_engineering(X, y)
    X = feature_selection(X, y)
    
    # The same estimator object is refitted in every fold
    model = LinearRegression()
    
    tscv = TimeSeriesSplit(n_splits=n_splits)
    
    train_scores = []
    val_scores = []
    
    # Per-fold splits, returned to the caller for inspection
    X_trains = []
    X_vals = []
    y_trains = []
    y_vals = []

    for fold, (train_index, val_index) in enumerate(tscv.split(X)):
        print(f"Processing fold {fold + 1}/{n_splits}...")
        
        # Split data into train and validation sets
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
        y_train, y_val = y.iloc[train_index].copy(), y.iloc[val_index].copy()

        # Stored before handle_categorical is called.
        # NOTE(review): that step assigns into the frames it receives, so the
        # stored frames appear to end up encoded as well — confirm if relied on.
        X_trains.append(X_train)
        X_vals.append(X_val)
        y_trains.append(y_train)
        y_vals.append(y_val)

        ### call transformations here, if there is learning
        # X_train, y_train, X_val = handle_inconsistencies(X_train, y_train, X_val)
        # X_train, y_train, X_val = handle_duplicates(X_train, y_train, X_val)
        # X_train, X_val = handle_missing_values(X_train, y_train, X_val)
        X_train, X_val = handle_categorical(X_train, y_train, X_val)
        # X_train, y_train, X_val = handle_outliers(X_train, y_train, X_val)
        # X_train, y_train, X_val = feature_engineering(X_train, y_train, X_val)
        # X_train, X_val = feature_selection(X_train, y_train, X_val)
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on training set
        y_train_pred = model.predict(X_train)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_scores.append(train_mse)
        
        # Predict on validation set
        y_val_pred = model.predict(X_val)
        val_mse = mean_squared_error(y_val, y_val_pred)
        val_scores.append(val_mse)
        
        print(f"Fold {fold + 1} Train MSE: {train_mse:.4f}, Validation MSE: {val_mse:.4f}")
    
    # Compute mean, max, and min values for train and validation MSE
    mean_train_mse = np.mean(train_scores)
    max_train_mse = np.max(train_scores)
    min_train_mse = np.min(train_scores)
    
    mean_val_mse = np.mean(val_scores)
    max_val_mse = np.max(val_scores)
    min_val_mse = np.min(val_scores)
    
    # Print results
    print("\nTrain MSE:")
    print(f"Mean: {mean_train_mse:.4f}, Max: {max_train_mse:.4f}, Min: {min_train_mse:.4f}")
    
    print("\nValidation MSE:")
    print(f"Mean: {mean_val_mse:.4f}, Max: {max_val_mse:.4f}, Min: {min_val_mse:.4f}")
    
    return mean_val_mse, X_trains, X_vals, y_trains, y_vals  # Return mean validation MSE as the overall score

In [None]:
# Separate the features from the target column (both are independent copies of df_train)
X = df_train.copy().drop(columns=['electricity_demand'])
y = df_train['electricity_demand'].copy()

# Score the preprocessing pipeline with cross-validation
mse, X_trains, X_vals, y_trains, y_vals = evaluate_pipeline(X, y)

### Generating Submission File

In [None]:
# Train and submit your results

In [None]:
# Reload the raw CSV files for the final fit
df_train = pd.read_csv("module5_exercise_train.csv", sep=",")
X_test = pd.read_csv("module5_exercise_test.csv", sep=",")

# Target column on one side, every other column as features on the other
y_train = df_train['electricity_demand']
X_train = df_train.drop(columns=['electricity_demand'])

In [None]:
def train_and_predict_to_submit(X_train, y_train, X_test):
    """Preprocess the data, fit a LinearRegression and predict the test set.

    Args:
        X_train: Raw training features.
        y_train: Training target.
        X_test: Raw test features.

    Returns:
        The model's predictions for the preprocessed ``X_test``.
    """
    # Every preprocessing step receives the training and the test features together
    X_train, y_train, X_test = handle_inconsistencies(X_train, y_train, X_test)
    X_train, y_train, X_test = handle_duplicates(X_train, y_train, X_test)
    X_train, X_test = handle_missing_values(X_train, y_train, X_test)
    X_train, X_test = handle_categorical(X_train, y_train, X_test)
    X_train, y_train, X_test = handle_outliers(X_train, y_train, X_test)
    X_train, y_train, X_test = feature_engineering(X_train, y_train, X_test)
    X_train, X_test = feature_selection(X_train, y_train, X_test)

    # Fit on the whole training set
    regressor = LinearRegression()
    print(f"Training model on entire dataset of shape: {X_train.shape}")
    regressor.fit(X_train, y_train)

    # Predict the test set
    print(f"Predicting on test dataset of shape: {X_test.shape}")
    return regressor.predict(X_test)

In [None]:
# Call train_and_predict_to_submit to preprocess, train and predict on the test set
y_test_pred = train_and_predict_to_submit(X_train, y_train, X_test)

In [None]:
# Pair every test date with its predicted demand
submission = pd.DataFrame({'date': X_test['date']})
submission['electricity_demand'] = y_test_pred

# Write the submission as a comma-separated file without the index column
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False, sep=',')
print("Submission file saved as 'submission.csv'.")