In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
import seaborn as sns

### Data Collection

In [None]:
import requests

# Remote locations of the module 5 exercise CSV files (train and test)
train_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/data-science-practice/module5/exercise/module5_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    file_name : str or path-like
        Local path the bytes are written to (overwritten if it exists).
    timeout : float or tuple, default 30
        Seconds forwarded to ``requests.get``. Without a timeout a stalled
        server would block the notebook indefinitely.

    Raises
    ------
    requests.HTTPError
        When the server answers with an error status.
    requests.Timeout
        When the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Fetch both CSV files into the working directory
for source_url, target_name in (
    (train_data_url, 'module5_exercise_train.csv'),
    (test_data_url, 'module5_exercise_test.csv'),
):
    download_file(source_url, target_name)

In [None]:
# Load the downloaded files (a comma is already pandas' default separator)
df_train = pd.read_csv("module5_exercise_train.csv")
df_test = pd.read_csv("module5_exercise_test.csv")
df_train

In [None]:
# Display the test frame loaded from module5_exercise_test.csv
df_test

In [None]:
# DataFrame.info() prints its own report and returns None, so it is called on
# its own line instead of being wrapped in print() or placed in a tuple.
df_train.info()
print(df_train.nunique())
df_test.info()
df_test.nunique()

### Data analysis

#### Make a complete analysis on data preprocessing
# Inconsistencies
# Duplicates (data.duplicated().sum())
# Missing values (data.isnull().sum())
# Categorical
# Outliers
# Feature Engineering
# Feature Selection and/or Dimensionality Reduction

In [None]:
# Stack train and test rows into one frame for the exploratory analysis.
# ignore_index=True gives every row a unique label; otherwise the labels of
# df_train and df_test are both kept and repeat in the combined frame.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [None]:
# (rows, columns) of the training frame
df_train.shape

In [None]:
# (rows, columns) of the test frame
df_test.shape

In [None]:
# Shape of the combined train + test frame, followed by the frame itself
print(data.shape)
data

In [None]:
#print(data.info())
# nunique must be *called*; printing the bare attribute only shows the bound method
print(data.nunique())

# Inconsistencies

In [None]:
# Lower-case and trim the two text columns so spelling variants collapse
for text_column in ('weather_condition', 'oil_brent_price_indicator'):
    data[text_column] = data[text_column].str.lower().str.strip()
data.head()

# Duplicates (data.duplicated().sum())

In [None]:
# Count rows whose 'date' already appeared earlier; the message talks about
# duplicated dates, so the check is restricted to that column rather than
# requiring the whole row to be identical.
print(f"Dates dupliquées : {data.duplicated(subset=['date']).sum()}")

In [None]:
# Keep only the last row recorded for each date
data = data.drop_duplicates(subset=['date'], keep='last')
# Verify with the same criterion used for the removal (the 'date' column)
print(f"Dates dupliquées : {data.duplicated(subset=['date']).sum()}")
data

In [None]:
def plot_feature_over_time(df, feature, date_id_start, date_id_end):
    """Plot ``feature`` against ``date`` for rows inside an inclusive date window.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'date' column comparable with the two bounds.
    feature : str
        Column to draw; a message is printed and nothing is plotted if absent.
    date_id_start, date_id_end
        Inclusive lower and upper bounds of the window.
    """
    in_window = df['date'].between(date_id_start, date_id_end)
    window = df[in_window]

    if feature not in window.columns:
        print(f"Feature '{feature}' not found in the DataFrame.")
        return

    # Single line chart of the selected window
    plt.figure(figsize=(10, 6))
    plt.plot(window['date'], window[feature], label=feature, linestyle='-')
    plt.title(f'{feature} from {date_id_start} to {date_id_end}')
    plt.xlabel('Date')
    plt.ylabel(feature)
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()



# Missing values (data.isnull().sum())

In [None]:
def analyze_missing_data(df, target_column='electricity_demand'):
    """Print and plot a missing-value report for ``df``.

    The report shows the head and info of the frame, the count and percentage
    of missing values per column, a heatmap of the missing cells, a bar chart
    of the percentages, and the correlation between each column's missingness
    indicator and ``target_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    target_column : str, default 'electricity_demand'
        Column the missingness indicators are correlated with.
    """
    null_mask = df.isnull()

    # Structure of the frame
    print("DataFrame head:")
    print(df.head())

    print("\nDataFrame info:")
    df.info()

    # Absolute and relative amount of missing data
    missing_data = null_mask.sum()
    print("\nMissing values per column:")
    print(missing_data)

    percent_missing = null_mask.mean() * 100
    print("\nPercentage of missing values per column:")
    print(percent_missing)

    # Where the gaps are located
    plt.figure(figsize=(12, 6))
    sns.heatmap(null_mask, cbar=False, cmap='viridis', yticklabels=False)
    plt.title('Missing Value Heatmap')
    plt.show()

    # How large the gaps are
    plt.figure(figsize=(10, 6))
    percent_missing.plot(kind='bar', color='dodgerblue')
    plt.title('Percentage of Missing Values Per Column')
    plt.ylabel('Percentage Missing')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

    # Only columns that actually have gaps (and are not the target) are correlated
    missingness_correlation = {
        column: null_mask[column].corr(df[target_column])
        for column in df.columns
        if column != target_column and null_mask[column].sum() > 0
    }

    print("\nCorrelation between missingness and target variable:")
    for column, correlation in missingness_correlation.items():
        print(f"{column}: {correlation:.4f}")

    plt.figure(figsize=(10, 6))
    plt.bar(missingness_correlation.keys(), missingness_correlation.values())
    plt.title(f'Correlation between Missingness and {target_column}')
    plt.ylabel('Correlation')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

# Run the missing-value report on the combined frame (default target: 'electricity_demand')
analyze_missing_data(data)

In [None]:
# Parse the 'date' column into pandas datetime values
data['date'] = pd.to_datetime(data['date'])

In [None]:
# Fill the gaps in 'humidity' by linear interpolation between neighbouring rows
data["humidity"] = data["humidity"].interpolate(method="linear")
# Every remaining gap, in every column, is forward-filled and then back-filled.
# NOTE(review): this applies to 'electricity_demand' too; if the test rows have
# no target, they receive copied values here - confirm this is intended.
data  = data.ffill().bfill() # time-series style fill: carry the previous observation forward
data

In [None]:
# Inspect the raw 'wind_speed' column
data['wind_speed']

# Categorical

In [None]:
# List how many distinct labels each categorical column has, and which ones
print("Unique values in each column:")
columns = ["weather_condition", "oil_brent_price_indicator"]
for column in columns:
    labels = data[column]
    print(f"{column}: {labels.nunique()} - {labels.unique()}")

In [None]:
# Disabled variant that would treat weather_condition as an ordered categorical
#data['weather_condition']= pd.Categorical(data['weather_condition'], categories=['snowy','rainy','cloudy', 'sunny'], ordered=True)
# Encode the oil price indicator as an ordered categorical, from 'very low' up to 'very high'
data['oil_brent_price_indicator'] = pd.Categorical(data['oil_brent_price_indicator'], categories=['very low','low','moderate','high','very high'], ordered=True)
data

In [None]:
# Outliers

# Box plots are only meaningful for numeric columns, so dates, free text and
# categoricals are left out. The grid height is derived from the number of
# plots instead of a fixed 6x4 layout, which would fail past 24 columns.
numeric_columns = data.select_dtypes(include='number').columns
n_grid_cols = 4
n_grid_rows = max(1, -(-len(numeric_columns) // n_grid_cols))  # ceiling division

plt.figure(figsize=(10, 10))

for i, numeric_column in enumerate(numeric_columns):
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.boxplot(x=data[numeric_column])
plt.tight_layout()
plt.show()

In [None]:
def detect_outliers_iqr(data):
    """Return the entries of ``data`` lying outside the 1.5 x IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, with Q1 and Q3
    the 25 % and 75 % quantiles of ``data`` itself.
    """
    first_quartile, third_quartile = data.quantile(0.25), data.quantile(0.75)
    spread = third_quartile - first_quartile
    too_low = data < first_quartile - 1.5 * spread
    too_high = data > third_quartile + 1.5 * spread
    return data[too_low | too_high]

# 'humidity' values outside the 1.5 x IQR fences
outliers = detect_outliers_iqr(data['humidity'])
print(outliers)
# Same check on the target column; note that `outliers` is rebound here
outliers = detect_outliers_iqr(data['electricity_demand'])
print("\n", outliers)

In [None]:
# Line chart of the target between 2017-01-01 and 2019-09-07 (bounds inclusive)
plot_feature_over_time(data, 'electricity_demand', '2017-01-01', '2019-09-07')

In [None]:
# Line chart of humidity between 2016-06-01 and 2016-12-01 (bounds inclusive)
plot_feature_over_time(data, 'humidity', '2016-06-01', '2016-12-01')

### Data Preprocessing Evaluation Strategy

In [None]:
# Provide a complete data preprocessing transformations

In [None]:
# 1. Handle Inconsistencies
def handle_inconsistencies(X_train, y_train, X_val=None):
    """Convert the mixed-unit ``wind_speed`` column to numeric km/h.

    Values suffixed with ``m/s`` are multiplied by 3.6; values suffixed with
    ``km/h``, with any other suffix, or with no suffix are taken as they are.
    Parsing ignores surrounding or repeated whitespace and the letter case of
    the unit, and accepts a unit glued to the number. Empty entries become NaN.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain 'wind_speed'. Not modified in place.
    y_train : array-like
        Returned unchanged.
    X_val : pandas.DataFrame, optional
        Validation/test features, converted the same way.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)`` when ``X_val``
        is given.

    Raises
    ------
    ValueError
        When the numeric part of an entry cannot be parsed as a float.
    """
    ms_to_kmh = 3.6

    def handle_wind(x):
        # str() also covers numeric and NaN entries ('nan' parses to float nan)
        text = str(x).strip().lower()
        factor = 1.0
        # 'km/h' is tested first; it does not end with 'm/s', so the order is safe
        if text.endswith('km/h'):
            text = text[:-len('km/h')]
        elif text.endswith('m/s'):
            text = text[:-len('m/s')]
            factor = ms_to_kmh
        parts = text.split()
        if not parts:
            # Nothing numeric left: treat as missing so imputation can fill it
            return float('nan')
        return float(parts[0]) * factor

    X_train = X_train.copy()
    X_train['wind_speed'] = X_train['wind_speed'].apply(handle_wind)

    if X_val is None:
        return X_train, y_train

    X_val = X_val.copy()
    X_val['wind_speed'] = X_val['wind_speed'].apply(handle_wind)
    return X_train, y_train, X_val


# 2. Handling Duplicates
def handle_duplicates(X_train, y_train, X_val=None):
    """Drop fully duplicated training rows and the matching targets.

    The first occurrence of each duplicated row is kept. Rows are selected by
    position through a boolean mask, so features and targets stay aligned even
    when the row labels are not unique.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training targets, in the same row order as ``X_train``.
    X_val : pandas.DataFrame, optional
        Validation/test features. Returned untouched: one prediction is
        produced per row, so no row may be removed from it.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)`` when ``X_val``
        is given.
    """
    keep = (~X_train.duplicated()).to_numpy()

    X_train_clean = X_train.loc[keep].copy()
    y_train_clean = y_train.loc[keep].copy()

    if X_val is not None:
        return X_train_clean, y_train_clean, X_val
    return X_train_clean, y_train_clean

# 3. Handling Missing Values
def handle_missing_values(X_train, y_train, X_val=None):
    """Fill gaps by carrying the previous row forward, then the next row backward.

    Each frame is filled using only its own rows. ``y_train`` is accepted for
    a uniform step signature and is not used.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        The filled training frame, or ``(train, val)`` when ``X_val`` is given.
    """
    def fill_gaps(frame):
        # Forward fill first; the backward fill only reaches leading gaps
        return frame.copy().ffill().bfill()

    train_filled = fill_gaps(X_train)
    if X_val is None:
        return train_filled
    return train_filled, fill_gaps(X_val)


# 4. Handling Categorical Values
def handle_categorical(X_train, y_train, X_val=None):
    """Encode the two categorical columns as numbers.

    ``oil_brent_price_indicator`` is mapped to an ordinal scale from 0
    ('very low') to 4 ('very high'); ``weather_condition`` is one-hot encoded.
    Labels are trimmed and lower-cased first, so spelling variants such as
    'Very Low', 'very low' or ' VERY LOW ' get the same code instead of
    silently turning into NaN or into separate dummy columns.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features containing both categorical columns.
    y_train : array-like
        Unused; kept for a uniform step signature.
    X_val : pandas.DataFrame, optional
        Validation/test features. After encoding, its columns are aligned to
        the training columns: dummy columns unseen in validation are filled
        with 0 and those unseen in training are dropped.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        The encoded training frame, or ``(train, val)`` when ``X_val`` is given.
    """
    mapping = {'very low': 0, 'low': 1, 'moderate': 2, 'high': 3, 'very high': 4}

    def normalise(labels):
        # Missing entries stay missing; everything else is compared as trimmed lower-case text
        as_object = labels.astype(object)
        cleaned = as_object.astype(str).str.strip().str.lower()
        return as_object.where(as_object.isna(), cleaned)

    def encode(frame):
        frame = frame.copy()
        frame['weather_condition'] = normalise(frame['weather_condition'])
        frame['oil_brent_price_indicator'] = normalise(frame['oil_brent_price_indicator']).map(mapping)
        return pd.get_dummies(frame, columns=['weather_condition'], dummy_na=False)

    X_train_encoded = encode(X_train)
    if X_val is None:
        return X_train_encoded

    X_val_encoded = encode(X_val).reindex(columns=X_train_encoded.columns, fill_value=0)
    return X_train_encoded, X_val_encoded

# 5. Handling Outliers
def handle_outliers(X_train, y_train, X_val=None):
    """Replace IQR outliers in ``humidity`` and in the training target by the median.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. The fences and the replacement median are computed on
    the training data only and reused for ``X_val``, so validation/test rows
    go through exactly the transformation the model was trained with.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; must contain 'humidity'. Not modified in place.
    y_train : pandas.Series
        Training target; its outliers are replaced by its own median.
    X_val : pandas.DataFrame, optional
        Validation/test features; 'humidity' is clipped with the training
        statistics.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)`` when ``X_val``
        is given.
    """
    def iqr_stats(series):
        # Lower fence, upper fence and replacement value learned from `series`
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr, series.median()

    def replace_outliers(series, lower_bound, upper_bound, impute_value):
        # mask() returns a new Series and upcasts integer data when needed
        return series.mask((series < lower_bound) | (series > upper_bound), impute_value)

    X_train = X_train.copy()
    humidity_stats = iqr_stats(X_train['humidity'])
    X_train['humidity'] = replace_outliers(X_train['humidity'], *humidity_stats)
    y_train = replace_outliers(y_train, *iqr_stats(y_train))

    if X_val is None:
        return X_train, y_train

    X_val = X_val.copy()
    X_val['humidity'] = replace_outliers(X_val['humidity'], *humidity_stats)
    return X_train, y_train, X_val


# 6. Feature Engineering
def feature_engineering(X_train, y_train, X_val=None):
    """Add calendar features derived from the ``date`` column.

    ``date`` is parsed to datetime and the columns ``year``, ``month``,
    ``day``, ``dayofweek``, ``dayofmonth`` (day divided by the number of days
    in that month) and ``isweekend`` (1 when dayofweek is 5 or 6) are appended.
    The input frames are copied, not modified.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` or ``(X_train, y_train, X_val)`` when ``X_val``
        is given; ``y_train`` is passed through unchanged.
    """
    def add_calendar_features(frame):
        enriched = frame.copy()
        enriched['date'] = pd.to_datetime(enriched['date'])
        stamps = enriched['date'].dt
        enriched['year'] = stamps.year
        enriched['month'] = stamps.month
        enriched['day'] = stamps.day
        enriched['dayofweek'] = stamps.dayofweek
        # Position inside the month, normalised by the month length
        enriched['dayofmonth'] = enriched['day'] / stamps.days_in_month
        enriched['isweekend'] = enriched['dayofweek'].isin([5, 6]).astype(int)
        return enriched

    train_features = add_calendar_features(X_train)
    if X_val is None:
        return train_features, y_train
    return train_features, y_train, add_calendar_features(X_val)

# 7. Feature Selection and Dimensionality Reduction
def feature_selection(X_train, y_train, X_val=None):
    """Keep every column except ``date``, ``day``, ``year`` and ``month``.

    The retained column list is taken from ``X_train`` and applied to
    ``X_val`` as well. ``y_train`` is unused.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        The reduced training frame, or ``(train, val)`` when ``X_val`` is given.
    """
    excluded = ['date', 'day', 'year', 'month']
    selected_columns = X_train.columns.drop(excluded)
    train_subset = X_train[selected_columns]
    if X_val is None:
        return train_subset
    return train_subset, X_val[selected_columns]


In [None]:
from xgboost import XGBRegressor
def evaluate_pipeline(X, y, n_splits=5):
    """Cross-validate preprocessing + linear regression with a time-series split.

    Row-wise clean-up that learns nothing from the data (unit parsing,
    duplicate removal) runs once on the full input. Every step that derives
    values from the data (gap filling, category encoding, outlier fences,
    feature construction and selection) runs inside each fold, fitted on the
    training part only. Validation rows therefore never influence training,
    and validation targets are scored exactly as observed rather than after
    outlier replacement.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw features. TimeSeriesSplit splits by position, so rows are expected
        to be in chronological order.
    y : pandas.Series
        Target aligned with ``X``.
    n_splits : int, default 5
        Number of folds.

    Returns
    -------
    float
        Mean validation MSE over the folds.
    """
    ### transformations without learning: applied once, before the split
    X, y = handle_inconsistencies(X, y)
    X, y = handle_duplicates(X, y)

    tscv = TimeSeriesSplit(n_splits=n_splits)

    train_scores = []
    val_scores = []

    for fold, (train_index, val_index) in enumerate(tscv.split(X)):
        print(f"Processing fold {fold + 1}/{n_splits}...")

        # Split data into train and validation sets
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[val_index].copy()
        y_train, y_val = y.iloc[train_index].copy(), y.iloc[val_index].copy()

        ### transformations with learning: fitted on the training part of the fold
        X_train, X_val = handle_missing_values(X_train, y_train, X_val)
        X_train, X_val = handle_categorical(X_train, y_train, X_val)
        X_train, y_train, X_val = handle_outliers(X_train, y_train, X_val)
        X_train, y_train, X_val = feature_engineering(X_train, y_train, X_val)
        X_train, X_val = feature_selection(X_train, y_train, X_val)

        # A fresh estimator per fold keeps the folds independent
        model = LinearRegression()
        #model = XGBRegressor()  # alternative model
        model.fit(X_train, y_train)

        # Predict on training set
        train_mse = mean_squared_error(y_train, model.predict(X_train))
        train_scores.append(train_mse)

        # Predict on validation set (targets left as observed)
        val_mse = mean_squared_error(y_val, model.predict(X_val))
        val_scores.append(val_mse)

        print(f"Fold {fold + 1} Train MSE: {train_mse:.4f}, Validation MSE: {val_mse:.4f}")

    # Compute mean, max, and min values for train and validation MSE
    mean_train_mse = np.mean(train_scores)
    max_train_mse = np.max(train_scores)
    min_train_mse = np.min(train_scores)

    mean_val_mse = np.mean(val_scores)
    max_val_mse = np.max(val_scores)
    min_val_mse = np.min(val_scores)

    # Print results
    print("\nTrain MSE:")
    print(f"Mean: {mean_train_mse:.4f}, Max: {max_train_mse:.4f}, Min: {min_train_mse:.4f}")

    print("\nValidation MSE:")
    print(f"Mean: {mean_val_mse:.4f}, Max: {max_val_mse:.4f}, Min: {min_val_mse:.4f}")

    return mean_val_mse  # Return mean validation MSE as the overall score

In [None]:
# Separate the target from the features (df_train itself is left untouched)
y = df_train['electricity_demand'].copy()
X = df_train.drop(columns=['electricity_demand'])
# Cross-validate the pipeline
evaluate_pipeline(X, y) #np.float64(1109.3930508876085)

### Generating Submission File

In [None]:
# Train and submit your results

In [None]:
# Reload the raw training data and separate the features from the target
df_train = pd.read_csv("module5_exercise_train.csv")

y_train = df_train['electricity_demand']
X_train = df_train.drop(columns=['electricity_demand'])

X_test = pd.read_csv("module5_exercise_test.csv")

In [None]:
def train_and_predict_to_submit(X_train, y_train, X_test):
    """Preprocess train and test data, fit a linear regression, predict the test set.

    The preprocessing steps are applied in this order: inconsistencies,
    duplicates, missing values, categorical encoding, outliers, feature
    engineering, feature selection.

    Returns
    -------
    The predictions returned by ``LinearRegression.predict`` for ``X_test``.
    """
    # Preprocessing chain, each step receiving the output of the previous one
    X_train, y_train, X_test = handle_inconsistencies(X_train, y_train, X_test)
    X_train, y_train, X_test = handle_duplicates(X_train, y_train, X_test)
    X_train, X_test = handle_missing_values(X_train, y_train, X_test)
    X_train, X_test = handle_categorical(X_train, y_train, X_test)
    X_train, y_train, X_test = handle_outliers(X_train, y_train, X_test)
    X_train, y_train, X_test = feature_engineering(X_train, y_train, X_test)
    X_train, X_test = feature_selection(X_train, y_train, X_test)

    # Fit on the whole training set
    regressor = LinearRegression()
    print(f"Training model on entire dataset of shape: {X_train.shape}")
    regressor.fit(X_train, y_train)

    # Score the test rows
    print(f"Predicting on test dataset of shape: {X_test.shape}")
    return regressor.predict(X_test)

In [None]:
# Call train_and_predict_to_submit to preprocess, train and predict on the test set
y_test_pred = train_and_predict_to_submit(X_train, y_train, X_test)

In [None]:
# Build the submission table: one prediction per test date
submission = X_test[['date']].assign(electricity_demand=y_test_pred)

# Write it as a comma-separated file without the index column
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")