In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import seaborn as sns

### Data Collection

In [None]:
import requests

# Source URLs of the exercise's train and test CSV files
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and save the response body to ``file_name``.

    Args:
        url: Address of the file to fetch.
        file_name: Local path the bytes are written to (opened in 'wb' mode,
            so an existing file is overwritten).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail before the local file is created or overwritten
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Download the train and test CSV files under relative names (i.e. into the working directory)
download_file(train_data_url, 'module5_exercise_train.csv')
download_file(test_data_url, 'module5_exercise_test.csv')

In [None]:
# Load the downloaded CSV files into DataFrames
df_train =  pd.read_csv("module5_exercise_train.csv", sep=",")
df_test =  pd.read_csv("module5_exercise_test.csv", sep=",")

### Data analysis

In [None]:
#### Make a complete analysis on data preprocessing
# Inconsistencies
# Duplicates (data.duplicated().sum())
# Missing values (data.isnull().sum())
# Categorical
# Outliers
# Feature Engineering
# Feature Selection and/or Dimensionality Reduction

In [None]:
# Stack train and test rows for joint exploration. ignore_index=True gives every
# row a unique label; otherwise both frames keep their own 0..n-1 index and the
# combined frame ends up with duplicated labels.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# (rows, columns) of the training set
df_train.shape

In [None]:
# (rows, columns) of the test set
df_test.shape

In [None]:
def plot_feature_over_time(df, feature, date_id_start, date_id_end):
    """Plot one column of ``df`` against its ``date`` column over a date window.

    Args:
        df: DataFrame holding a ``date`` column and the column named ``feature``.
        feature: Name of the column drawn on the y-axis.
        date_id_start: Lower bound of the window (inclusive), compared with ``df['date']``.
        date_id_end: Upper bound of the window (inclusive), compared with ``df['date']``.

    Prints a message and returns without plotting when ``feature`` is not a
    column of ``df``.
    """
    # Validate the requested column before doing any filtering work
    if feature not in df.columns:
        print(f"Feature '{feature}' not found in the DataFrame.")
        return

    in_window = (df['date'] >= date_id_start) & (df['date'] <= date_id_end)
    # Sort chronologically so the line is drawn left to right even when the
    # rows are not stored in date order (e.g. train and test stacked together)
    df_filtered = df.loc[in_window].sort_values('date', kind='stable')

    # Plotting
    plt.figure(figsize=(10, 6))
    plt.plot(df_filtered['date'], df_filtered[feature], label=feature, linestyle='-')
    plt.xlabel('Date')
    plt.ylabel(feature)
    plt.title(f'{feature} from {date_id_start} to {date_id_end}')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()



In [None]:
# Parse the date column with pd.to_datetime so it can be filtered and plotted as dates
data['date'] = pd.to_datetime(data['date'])

In [None]:
# Display the combined train + test frame
data

In [None]:
# Inspect the raw wind_speed column
data['wind_speed']

In [None]:
# Electricity demand between 2017-01-01 and 2019-09-07
plot_feature_over_time(data, 'electricity_demand', '2017-01-01', '2019-09-07')

In [None]:
# Humidity between 2016-06-01 and 2016-12-01
plot_feature_over_time(data, 'humidity', '2016-06-01', '2016-12-01')

# Analysis

### Inconsistencies

In [None]:
# Work on a separate copy of the combined frame for the checks below
df= data.copy()

In [None]:
# Shape of the DataFrame
print("Forme du dataframe")
df.shape

In [None]:
# Show the first 10 rows
print("Visualisation des 10 premières valeurs du dataframe\n")
df.head(10)

In [None]:
# Show the last 10 rows (the printed heading used to say "premières", i.e. "first")
print("Visualisation des 10 dernières valeurs du dataframe\n")
df.tail(10)

In [None]:
# Show 10 randomly sampled rows (no random_state is set, so the rows differ between runs)
print("Visualisation de 10 lignes aléatoire\n")
df.sample(10)

In [None]:
# General information about the DataFrame (columns, non-null counts, dtypes)
print("Infos générales sur le data")
df.info()

In [None]:
# Descriptive statistics of the string (object) columns
# count = number of non-null values, unique = number of distinct values,
# top = most frequent value, freq = how many times `top` occurs
df.describe(include=['object'])

In [None]:
# Names of the string-typed columns
nom_cols_string = df.select_dtypes(include=['object', 'string']).columns
nom_cols_string

In [None]:
# Number of distinct values, and the values themselves, for each categorical column
for column in ['weather_condition', 'oil_brent_price_indicator']:
    print(f"{column}: {df[column].nunique()} - {df[column].unique()}")

In [None]:
# For every column, count how many values there are of each Python type
# (a column holding more than one type points to inconsistent encoding)
for col in df.columns:
    print(f"Colonne: {col}")
    print("Nombre d'éléments par type de données :")
    print(df[col].apply(type).value_counts())
    print("-" * 40)


In [None]:
# Show the first 3 rows whose wind_speed value is not a str
df[df['wind_speed'].apply(lambda x: not isinstance(x, str))].head(3)

### Handling Duplicates

In [None]:
# Count the rows that are exact copies of an earlier row (the first occurrence is not counted)
nb_doublons_copies = df.duplicated().sum()
print("Nombre de doublons (copies uniquement) :", nb_doublons_copies)

### Handling Missing Values

In [None]:
# Number of missing values in each column
print("Nombre de valeurs manquantes par colonne:")
print(df.isnull().sum())

In [None]:
# Number of rows that contain at least one missing value
print("Nombre de lignes avec au moins une valeur manquante")
df.isnull().any(axis=1).sum()

In [None]:
# Percentage of missing values per column (mean of the boolean null mask, times 100)
print("\nPercentage of missing values per column:")
percent_missing = df.isnull().mean() * 100
print(percent_missing)

# Heatmap of the null mask: one line per DataFrame row, one column per feature
plt.figure(figsize=(12, 6))
sns.heatmap(df.isnull(), cbar=False, cmap='viridis', yticklabels=False)
plt.title('Missing Value Heatmap')
plt.show()


In [None]:
# Helps decide whether KNN imputation is appropriate
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

# Pairwise scatter plots of the numeric columns, with histograms on the diagonal
scatter_matrix(df.select_dtypes(include='number'), figsize=(10, 10), diagonal='hist')
plt.suptitle("Scatter matrix")
plt.show()

### Handling Categorical Values

In [None]:
# List the distinct labels of each categorical column before encoding them
for column in ['weather_condition', 'oil_brent_price_indicator']:
    print(f"{column}: {df[column].nunique()} - {df[column].unique()}")

In [None]:
# Ordinal codes of the categorical labels
weather_to_num = {'Snowy': 0, 'Rainy': 1, 'Cloudy': 2, 'Sunny': 3}
oil_indicator_to_num = {'Very Low': 0, 'Low': 1, 'Moderate': 2, 'High': 3, 'Very High': 4}

# Strings that stand for a missing value once a column has been cast to str
_MISSING_LABELS = {'', 'nan', 'none'}


def _encode_ordinal(series, mapping):
    """Encode one categorical column with ``mapping``, ignoring case and padding.

    Missing entries are replaced by the most frequent label before encoding.
    Labels absent from ``mapping`` are encoded as -1, as are all entries when
    the column holds no label at all.
    """
    lookup = {label.strip().lower(): code for label, code in mapping.items()}

    def _normalize(value):
        if isinstance(value, str):
            text = value.strip().lower()
            return np.nan if text in _MISSING_LABELS else text
        return value

    normalized = series.map(_normalize)

    # mode() is empty when every entry is missing; indexing it would raise
    modes = normalized.mode(dropna=True)
    if not modes.empty:
        normalized = normalized.fillna(modes.iloc[0])

    return normalized.map(lambda label: lookup.get(label, -1)).astype(int)


def transfer_categorical(X):
    """Return a copy of ``X`` with the categorical columns encoded as integers.

    ``weather_condition`` is encoded with ``weather_to_num`` and
    ``oil_brent_price_indicator`` with ``oil_indicator_to_num``. Matching is
    insensitive to case and surrounding whitespace, so ``'sunny'``, ``'Sunny'``
    and ``' SUNNY '`` all receive the same code. Missing values (NaN/None, or
    the strings ``'nan'``, ``'none'``, ``''``) take the column's most frequent
    label; unknown labels become -1. Columns that are absent are skipped.
    """
    X_new = X.copy()

    if 'weather_condition' in X_new.columns:
        X_new['weather_condition'] = _encode_ordinal(X_new['weather_condition'], weather_to_num)

    if 'oil_brent_price_indicator' in X_new.columns:
        X_new['oil_brent_price_indicator'] = _encode_ordinal(
            X_new['oil_brent_price_indicator'], oil_indicator_to_num
        )

    return X_new


### Handling Outliers

In [None]:
# Count the outliers (TODO: no detection code has been written in this cell yet)

### Data Preprocessing Evaluation Strategy

In [None]:
# Provide the complete set of data preprocessing transformations

In [None]:
# 1. Handle Inconsistencies
def handle_inconsistencies(X_train, y_train, X_val=None):
    """Clean label spelling and convert ``wind_speed`` to numeric metres per second.

    * ``weather_condition`` and ``oil_brent_price_indicator``: surrounding
      whitespace is removed and labels are title-cased (``' very low'`` ->
      ``'Very Low'``), which is the spelling used by the keys of
      ``weather_to_num`` and ``oil_indicator_to_num``. Missing entries stay
      missing instead of turning into the string ``'nan'``.
    * ``wind_speed``: strings such as ``'36 km/h'`` or ``'5 m/s'`` become floats
      in m/s. Entries without a recognised unit become NaN. A column that is
      already numeric is left unchanged, so the step can be applied twice.

    The input frames are not modified; cleaned copies are returned.

    Returns:
        ``(X_train, y_train)``, or ``(X_train, y_train, X_val)`` when ``X_val``
        is given.
    """
    label_columns = ['weather_condition', 'oil_brent_price_indicator']

    def _clean_label(value):
        # Non-strings (NaN/None) pass through so later steps still see them as missing
        return value.strip().title() if isinstance(value, str) else value

    def _clean_frame(X):
        X = X.copy()  # never write into the caller's frame

        for column in label_columns:
            if column in X.columns:
                X[column] = X[column].map(_clean_label)

        if 'wind_speed' not in X.columns or pd.api.types.is_numeric_dtype(X['wind_speed']):
            return X

        # Split "<number> <unit>" into its numeric part and its unit
        text = X['wind_speed'].astype(str)
        value = pd.to_numeric(text.str.extract(r'([\d.]+)', expand=False), errors='coerce')
        unit = text.str.extract(r'([a-zA-Z/]+)', expand=False)

        # Vectorised conversion to m/s; anything without a known unit becomes NaN
        X['wind_speed'] = np.select(
            [unit.isin(['km/h']).to_numpy(), unit.isin(['m/s']).to_numpy()],
            [(value / 3.6).to_numpy(), value.to_numpy()],
            default=np.nan,
        )
        return X

    X_train_clean = _clean_frame(X_train)

    if X_val is not None:
        X_val_clean = _clean_frame(X_val)
        return X_train_clean, y_train.copy(), X_val_clean
    else:
        return X_train_clean, y_train.copy()


In [None]:
# 2. Handling Duplicates
def handle_duplicates(X_train, y_train, X_val=None):
    """Drop repeated feature rows from the training data.

    The first occurrence of each feature row is kept. ``y_train`` is filtered
    with the same positional mask, so features and target stay aligned even
    when the index holds repeated labels; ``y_train`` must therefore have one
    entry per row of ``X_train``, in the same order.

    ``X_val`` is returned as a copy with all of its rows: removing
    validation/test rows would leave fewer predictions than there are targets
    or submission rows.

    Returns:
        ``(X_train, y_train)``, or ``(X_train, y_train, X_val)`` when ``X_val``
        is given.
    """
    # Boolean array, applied by position rather than by index label
    keep = ~X_train.duplicated().to_numpy()
    X_train_no_duplicate = X_train.loc[keep]
    y_train_no_duplicate = y_train.loc[keep]

    if X_val is not None:
        return X_train_no_duplicate, y_train_no_duplicate, X_val.copy()
    else:
        return X_train_no_duplicate, y_train_no_duplicate


In [None]:
# 3. Handling Missing Values
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

def handle_missing_values(X_train, y_train, X_val=None):
    """Fill missing numeric values with a KNN imputer fitted on the training data.

    The numeric columns of ``X_train`` are standardised, imputed from their 3
    nearest neighbours and mapped back to the original scale. The scaler and
    the imputer are fitted on ``X_train`` only and then applied to ``X_val``,
    so validation/test rows are completed from training statistics and
    training neighbours. Non-numeric columns are passed through untouched and
    placed after the numeric ones.

    Returns:
        ``(X_train, y_train)``, or ``(X_train, y_train, X_val)`` when ``X_val``
        is given.
    """
    numeric_columns = X_train.select_dtypes(include=['number']).columns

    if len(numeric_columns) == 0:
        # Nothing to impute
        if X_val is not None:
            return X_train.copy(), y_train.copy(), X_val.copy()
        return X_train.copy(), y_train.copy()

    scaler = StandardScaler()
    imputer = KNNImputer(n_neighbors=3)

    def rebuild(X, numeric_values):
        # Re-attach the non-numeric columns next to the imputed numeric block
        X_num = pd.DataFrame(numeric_values, columns=numeric_columns, index=X.index)
        X_cat = X.drop(columns=numeric_columns)
        return pd.concat([X_num, X_cat], axis=1)

    # Fit on the training rows only
    train_scaled = imputer.fit_transform(scaler.fit_transform(X_train[numeric_columns]))
    X_train_new = rebuild(X_train, scaler.inverse_transform(train_scaled))

    if X_val is not None:
        # Re-use the fitted scaler and imputer; nothing is learned from X_val
        val_scaled = imputer.transform(scaler.transform(X_val[numeric_columns]))
        X_val_new = rebuild(X_val, scaler.inverse_transform(val_scaled))
        return X_train_new, y_train.copy(), X_val_new
    else:
        return X_train_new, y_train.copy()


In [None]:

# 4. Handling Categorical Values
def handle_categorical(X_train, y_train, X_val=None):
    """Encode the categorical columns of the given frames as integers.

    ``transfer_categorical`` is applied to a copy of ``X_train`` and, when
    given, to a copy of ``X_val``. ``y_train`` is accepted but not used.

    Returns:
        The encoded ``X_train`` alone when ``X_val`` is None, otherwise the
        pair ``(X_train, X_val)``.
    """
    encoded_train = transfer_categorical(X_train.copy())

    if X_val is None:
        return encoded_train

    encoded_val = transfer_categorical(X_val.copy())
    return encoded_train, encoded_val






In [None]:
# 5. Handling Outliers
def handle_outliers(X_train, y_train, X_val=None):
    """Remove training rows that contain an IQR outlier.

    A row is dropped when any of the selected weather columns, or the target,
    lies outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` for that column. Missing
    values are left out of the quartile computation and are never flagged.
    Only the training data are filtered; ``X_val`` is returned as a copy with
    all of its rows.

    Returns:
        ``(X_train, y_train)``, or ``(X_train, y_train, X_val)`` when ``X_val``
        is given.
    """
    def find_outliers_iqr(series, threshold=1.5):
        """Return the index labels (not positions) of the IQR outliers of ``series``."""
        series = pd.Series(series).dropna()
        if series.empty:
            # An all-missing column has no quartiles; np.percentile must not see an empty array
            return series.index
        q1, q3 = np.percentile(series, [25, 75])
        iqr = q3 - q1
        lower = q1 - threshold * iqr
        upper = q3 + threshold * iqr
        return series[(series < lower) | (series > upper)].index

    # Numeric columns to inspect
    selected_columns = ['humidity', 'wind_speed', 'temperature_station1',
        'temperature_station2', 'temperature_station3', 'temperature_station4',
        'temperature_station5', 'temperature_station6', 'temperature_station7',
        'temperature_station8', 'temperature_station9', 'temperature_station10']

    # Collect the labels of every row to remove
    outlier_indices = pd.Index([])

    for col in selected_columns:
        if col in X_train.columns:
            outlier_indices = outlier_indices.union(find_outliers_iqr(X_train[col]))

    # Rows whose target is an outlier are removed as well
    outlier_indices = outlier_indices.union(find_outliers_iqr(y_train))

    # errors='ignore' tolerates labels present in only one of the two objects
    X_train_cleaned = X_train.drop(index=outlier_indices, errors='ignore')
    y_train_cleaned = y_train.drop(index=outlier_indices, errors='ignore')

    if X_val is not None:
        return X_train_cleaned, y_train_cleaned, X_val.copy()
    else:
        return X_train_cleaned, y_train_cleaned


In [None]:
# 6. Feature Engineering

from sklearn.preprocessing import PolynomialFeatures


def feature_engineering(X_train, y_train, X_val=None):
    """Add calendar, temperature, polynomial and wind features.

    The same transformation is applied to ``X_train`` and, when given, to
    ``X_val``. The inputs are copied first; ``y_train`` is returned as a copy.

    Returns:
        ``(X_train, y_train)``, or ``(X_train, y_train, X_val)`` when ``X_val``
        is given.
    """

    def with_calendar_columns(frame):
        """Return a copy of ``frame`` with columns derived from its ``date`` column."""
        out = frame.copy()
        out['date'] = pd.to_datetime(out['date'], errors='coerce')  # unparseable dates become NaT
        stamps = out['date'].dt
        out['dayofyear'] = stamps.dayofyear
        out['month'] = stamps.month
        out['day'] = stamps.day
        out['hour'] = stamps.hour
        out['dayofweek'] = stamps.dayofweek
        out['is_weekend'] = out['dayofweek'].isin([5, 6]).astype(int)
        out['week_of_year'] = stamps.isocalendar().week.astype(int)
        out['month_progress'] = stamps.day / stamps.days_in_month
        return out

    def with_aggregate_columns(frame):
        """Return a copy of ``frame`` with the mean temperature, degree-2 terms and log wind speed."""
        out = frame.copy()

        station_columns = ['temperature_station' + str(k) for k in range(1, 11)]
        out['avg_temp'] = out[station_columns].mean(axis=1)

        # Degree-2 polynomial expansion of (avg_temp, humidity), without the bias column
        base_columns = ['avg_temp', 'humidity']
        expander = PolynomialFeatures(degree=2, include_bias=False)
        try:
            expanded = expander.fit_transform(out[base_columns])
            expanded_names = expander.get_feature_names_out(base_columns)
            out[expanded_names] = expanded
        except KeyError:
            print("⚠️ Colonne(s) manquante(s) pour les features polynomiales : 'avg_temp' ou 'humidity'")

        # log(1 + wind_speed), only when the column is present
        if 'wind_speed' in out.columns:
            out['log_windspeed'] = np.log1p(out['wind_speed'])
        return out

    def build(frame):
        return with_aggregate_columns(with_calendar_columns(frame.copy()))

    train_features = build(X_train)

    if X_val is None:
        return train_features, y_train.copy()

    val_features = build(X_val)
    return train_features, y_train.copy(), val_features


In [None]:
# 7. Feature Selection and Dimensionality Reduction
def feature_selection(X_train, y_train, X_val=None):
    """Keep only the model's input columns, in a fixed order.

    Columns of the expected list that are missing from ``X_train`` are skipped.
    When ``X_val`` is given, only columns present in both frames are kept so
    that the two results share the same columns. ``y_train`` is not used.

    Returns:
        A copy of the selected ``X_train`` columns when ``X_val`` is None,
        otherwise the pair ``(X_train, X_val)``.
    """
    expected_columns = (
        ['humidity', 'weather_condition', 'wind_speed', 'oil_brent_price_indicator']
        + [f'temperature_station{k}' for k in range(1, 11)]
        + ['dayofyear', 'month', 'day', 'hour', 'dayofweek', 'is_weekend',
           'week_of_year', 'month_progress', 'avg_temp', 'avg_temp^2',
           'avg_temp humidity', 'humidity^2', 'log_windspeed']
    )

    # Expected columns that the training frame actually has
    kept = [name for name in expected_columns if name in X_train.columns]

    if X_val is None:
        return X_train[kept].copy()

    # Restrict to the columns the validation frame has as well
    shared = [name for name in kept if name in X_val.columns]
    return X_train[shared].copy(), X_val[shared].copy()


In [None]:
def evaluate_pipeline(X, y, n_splits=5):
    """Estimate the pipeline's error with time-ordered cross-validation.

    Clean-up steps that learn nothing from the data (unit/label normalisation,
    duplicate removal) run once on the full input. Steps that estimate
    something from the data (KNN imputation, mode filling of categories, IQR
    outlier bounds) run inside every fold, using that fold's training rows for
    the training data, so the validation score is not computed on rows that
    already influenced the preprocessing of the training set.

    Args:
        X: Raw feature frame. Assumed to be in chronological order, which is
            what ``TimeSeriesSplit`` relies on -- TODO confirm for this data.
        y: Target values aligned with ``X``.
        n_splits: Number of ``TimeSeriesSplit`` folds.

    Returns:
        Mean validation MSE over the folds.
    """
    # Transformations without learning: applied once, before splitting
    X, y = handle_inconsistencies(X, y)
    X, y = handle_duplicates(X, y)

    tscv = TimeSeriesSplit(n_splits=n_splits)

    train_scores = []
    val_scores = []

    for fold, (train_index, val_index) in enumerate(tscv.split(X)):
        print(f"Processing fold {fold + 1}/{n_splits}...")

        # Split data into train and validation sets
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
        y_train, y_val = y.iloc[train_index].copy(), y.iloc[val_index].copy()

        # Transformations with learning: applied per fold.
        # None of them removes validation rows, so X_val stays aligned with y_val.
        X_train, y_train, X_val = handle_missing_values(X_train, y_train, X_val)
        X_train, X_val = handle_categorical(X_train, y_train, X_val)
        X_train, y_train, X_val = handle_outliers(X_train, y_train, X_val)
        X_train, y_train, X_val = feature_engineering(X_train, y_train, X_val)
        X_train, X_val = feature_selection(X_train, y_train, X_val)

        # Train a fresh model for this fold
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predict on training set
        y_train_pred = model.predict(X_train)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_scores.append(train_mse)

        # Predict on validation set
        y_val_pred = model.predict(X_val)
        val_mse = mean_squared_error(y_val, y_val_pred)
        val_scores.append(val_mse)

        print(f"Fold {fold + 1} Train MSE: {train_mse:.4f}, Validation MSE: {val_mse:.4f}")

    # Compute mean, max, and min values for train and validation MSE
    mean_train_mse = np.mean(train_scores)
    max_train_mse = np.max(train_scores)
    min_train_mse = np.min(train_scores)

    mean_val_mse = np.mean(val_scores)
    max_val_mse = np.max(val_scores)
    min_val_mse = np.min(val_scores)

    # Print results
    print("\nTrain MSE:")
    print(f"Mean: {mean_train_mse:.4f}, Max: {max_train_mse:.4f}, Min: {min_train_mse:.4f}")

    print("\nValidation MSE:")
    print(f"Mean: {mean_val_mse:.4f}, Max: {max_val_mse:.4f}, Min: {min_val_mse:.4f}")

    return mean_val_mse  # Return mean validation MSE as the overall score

In [None]:
# Split the training frame into features (X) and target (y)
X = df_train.drop(columns=['electricity_demand'])
y = df_train['electricity_demand'].copy()

# Cross-validate the whole pipeline; the returned mean validation MSE is the cell output
evaluate_pipeline(X, y)

### Generating Submission File

In [None]:
# Train and submit your results

In [None]:
# Reload the training file from disk and split it into features and target
df_train =  pd.read_csv("module5_exercise_train.csv", sep=",")

X_train = df_train.drop(columns=['electricity_demand'], axis=1)
y_train = df_train['electricity_demand']

# Test set as read from the test file
X_test =  pd.read_csv("module5_exercise_test.csv", sep=",")

In [None]:
def train_and_predict_to_submit(X_train, y_train, X_test):
    """Fit the preprocessing + linear-regression pipeline and predict ``X_test``.

    Args:
        X_train: Raw training features.
        y_train: Training target aligned with ``X_train``.
        X_test: Raw test features.

    Returns:
        Array of predictions holding exactly one value per row of ``X_test``,
        in the same order.

    Raises:
        ValueError: If preprocessing changed the number of test rows.
    """
    n_test_rows = len(X_test)

    # Work on copies so no preprocessing step can modify the caller's frames
    X_train, X_test = X_train.copy(), X_test.copy()

    X_train, y_train, X_test = handle_inconsistencies(X_train, y_train, X_test)
    # Only training rows are de-duplicated: every test row needs a prediction,
    # so the test frame is deliberately not passed to this step
    X_train, y_train = handle_duplicates(X_train, y_train)
    X_train, y_train, X_test = handle_missing_values(X_train, y_train, X_test)
    X_train, X_test = handle_categorical(X_train, y_train, X_test)
    X_train, y_train, X_test = handle_outliers(X_train, y_train, X_test)
    X_train, y_train, X_test = feature_engineering(X_train, y_train, X_test)
    X_train, X_test = feature_selection(X_train, y_train, X_test)

    if len(X_test) != n_test_rows:
        raise ValueError(
            f"Preprocessing changed the number of test rows from {n_test_rows} to {len(X_test)}"
        )

    # Train the model on the entire training set
    model = LinearRegression()
    print(f"Training model on entire dataset of shape: {X_train.shape}")
    model.fit(X_train, y_train)

    # Predict on the test set
    print(f"Predicting on test dataset of shape: {X_test.shape}")
    y_test_pred = model.predict(X_test)

    return y_test_pred

In [None]:
# Call train_and_predict_to_submit to fit on the training data and predict the test set
y_test_pred = train_and_predict_to_submit(X_train, y_train, X_test)

In [None]:
# Build the submission frame: the test dates next to their predicted demand
# (both columns must have the same number of rows)
submission = pd.DataFrame({
    'date': X_test['date'],
    'electricity_demand': y_test_pred
})

# Save the submission file as CSV without the index column
submission.to_csv('submission.csv', index=False, sep=',')
print("Submission file saved as 'submission.csv'.")