In [6]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

### Collecting Data

### Dataset Metadata
- **Description**:	This dataset represents aggregated daily electricity consumption in a major office zone in the United States from January 2015 to December 2019.
- **Source**:	Electricity Consumption Records
- **Target Variable**:	Electricity Demand (MWh)
- **Date**: The specific day of observation
- **Temperature1, ..., Temperature10**: Daily average temperature at each of the 10 stations (°C)
- **Humidity**: Average daily humidity (%)
- **Wind Speed**: Average wind speed (recorded in either m/s or km/h)
- **Oil Brent Price Indicator**: Ordinal feature representing fluctuations in oil prices
- **Weather Condition**: Overall weather for each day (e.g., 'Sunny', 'Cloudy', 'Rainy', 'Snowy')
- **Electricity Demand**: Total daily electricity consumption in megawatt-hours (MWh)

In [7]:
import requests

# URLs of the training and test CSV files that are downloaded below
train_data_url = 'https://www.raphaelcousin.com/modules/module5/exercise/module5_exercise_train.csv'
test_data_url = 'https://www.raphaelcousin.com/modules/module5/exercise/module5_exercise_test.csv'

# Function to download a file
def download_file(url, file_name, timeout=30):
    """
    Download the resource at `url` and save it to the local path `file_name`.

    :param url: URL of the file to download
    :param file_name: Local path the response body is written to
        (opened in 'wb' mode, so an existing file is overwritten)
    :param timeout: Seconds to wait for the server before giving up
    :raises requests.HTTPError: If the server answers with an error status
    :raises requests.Timeout: If the server does not respond within `timeout`
    """
    # Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server, which would hang the notebook on "Run All".
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Ensure we notice bad responses
    with open(file_name, 'wb') as file:
        file.write(response.content)
    print(f'Downloaded {file_name} from {url}')

# Downloading the files
# Each call performs a network request and writes the CSV to a relative path
# (i.e. the current working directory), overwriting any file of the same name.
download_file(train_data_url, 'module5_exercise_train.csv')
download_file(test_data_url, 'module5_exercise_test.csv')

Downloaded module5_exercise_train.csv from https://www.raphaelcousin.com/modules/module5/exercise/module5_exercise_train.csv
Downloaded module5_exercise_test.csv from https://www.raphaelcousin.com/modules/module5/exercise/module5_exercise_test.csv


In [8]:
# Load the two CSV files from the working directory into DataFrames.
df_train =  pd.read_csv("module5_exercise_train.csv", sep=",")
df_test =  pd.read_csv("module5_exercise_test.csv", sep=",")

In [9]:
df_train.head(10)

Unnamed: 0,date,weather_condition,humidity,wind_speed,oil_brent_price_indicator,temperature_station1,temperature_station2,temperature_station3,temperature_station4,temperature_station5,temperature_station6,temperature_station7,temperature_station8,temperature_station9,temperature_station10,electricity_demand
0,2015-01-08,Cloudy,69.304377,27.74 km/h,Moderate,0.369411,,0.159977,0.118224,-0.444455,0.313675,0.106192,0.36943,1.27947,-0.384661,273.511353
1,2015-01-09,Sunny,55.955975,21.78 km/h,Moderate,2.009903,,1.617242,,2.160063,2.515627,1.867474,1.953165,1.878233,1.582365,258.827085
2,2015-01-10,Rainy,62.701614,6.83 m/s,Moderate,-2.603544,-2.422001,-3.685819,-2.392354,-1.936704,-2.950332,-3.074828,-2.69865,-2.35486,-2.770883,302.535033
3,2015-01-11,Snowy,60.375001,5.79 m/s,Moderate,-3.789836,-3.974054,-3.217545,-4.397143,-3.375188,-3.650454,-5.004991,-3.233724,-4.227899,,296.270989
4,2015-01-12,Snowy,,25.14 km/h,High,-2.405522,-2.161129,-2.880773,-2.587199,,,,,-2.790422,-2.033098,323.49809
5,2015-01-13,Cloudy,58.649834,28.75 km/h,High,-5.037719,-4.891658,-4.655657,-5.545906,,-5.323173,-5.771092,,-4.554718,-4.719306,303.623789
6,2015-01-13,Cloudy,58.649834,28.75 km/h,High,-5.037719,-4.891658,-4.655657,-5.545906,,-5.323173,-5.771092,,-4.554718,-4.719306,303.623789
7,2015-01-14,Rainy,69.235096,19.97 km/h,Moderate,-5.213824,-5.509778,,,,-5.366943,-4.717191,-5.678185,-5.101359,-4.618354,321.299389
8,2015-01-15,Cloudy,67.523432,4.27 m/s,Moderate,-4.61189,,-4.672053,-5.932923,-4.960255,-4.769734,-4.513605,,-4.471426,-4.794511,311.156081
9,2015-01-16,Cloudy,67.978062,23.63 km/h,Moderate,-4.172215,-3.95244,,,-4.745741,-4.617068,-3.987911,-4.068369,,-4.15888,311.350108


In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1909 entries, 0 to 1908
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   date                       1909 non-null   object 
 1   weather_condition          1885 non-null   object 
 2   humidity                   1813 non-null   float64
 3   wind_speed                 1819 non-null   object 
 4   oil_brent_price_indicator  1909 non-null   object 
 5   temperature_station1       1624 non-null   float64
 6   temperature_station2       1634 non-null   float64
 7   temperature_station3       1620 non-null   float64
 8   temperature_station4       1632 non-null   float64
 9   temperature_station5       1630 non-null   float64
 10  temperature_station6       1636 non-null   float64
 11  temperature_station7       1601 non-null   float64
 12  temperature_station8       1627 non-null   float64
 13  temperature_station9       1621 non-null   float

#### Data Analysis

In [10]:
# Stack the rows of the train and test frames into a single DataFrame.
# NOTE(review): ignore_index is not passed, so each frame keeps its own row
# labels and labels may repeat in `data` — confirm this is intended.
data = pd.concat([df_train, df_test])

In [16]:
def display_type_in_columns(df):
    """
    Print, for every column of a DataFrame, the distinct Python types
    found among its values.

    :param df: The DataFrame to inspect
    :return: Always 0
    """
    header_lines = (
        "\n Types of values of each column of the DataFrame",
        "__________________________________________________",
    )
    for header_line in header_lines:
        print(header_line)

    for name in df.columns:
        # type() is applied element-wise, then reduced to the distinct types.
        value_types = df[name].apply(type).unique()
        print(f"Types of the '{name}' column: {value_types}")

    return 0

In [17]:
display_type_in_columns(df_train)


 Types of values of each column of the DataFrame
__________________________________________________
Types of the 'date' column: [<class 'str'>]
Types of the 'weather_condition' column: [<class 'str'> <class 'float'>]
Types of the 'humidity' column: [<class 'float'>]
Types of the 'wind_speed' column: [<class 'str'> <class 'float'>]
Types of the 'oil_brent_price_indicator' column: [<class 'str'>]
Types of the 'temperature_station1' column: [<class 'float'>]
Types of the 'temperature_station2' column: [<class 'float'>]
Types of the 'temperature_station3' column: [<class 'float'>]
Types of the 'temperature_station4' column: [<class 'float'>]
Types of the 'temperature_station5' column: [<class 'float'>]
Types of the 'temperature_station6' column: [<class 'float'>]
Types of the 'temperature_station7' column: [<class 'float'>]
Types of the 'temperature_station8' column: [<class 'float'>]
Types of the 'temperature_station9' column: [<class 'float'>]
Types of the 'temperature_station10' column

0

In [None]:
def display_values_multi_type_columns(df):
    """
    Display the different values of the multi-type columns.

    NOTE(review): this is an unimplemented stub. The body contains only this
    docstring, so calling it does nothing and returns None. Presumably it is
    meant to list the values of columns that hold more than one Python type
    (e.g. str mixed with float) — confirm the intent before implementing.

    :param df: The DataFrame to analyze
    """

In [11]:
from datetime import datetime

def standardize_date(date_str):
    """
    Convert a single date value to the "YYYY-MM-DD" text form.

    :param date_str: Raw date value; non-null values are converted with str()
    :return: The date formatted as "%Y-%m-%d", or NaN when the value is null
        or matches none of the supported formats
    """
    if pd.isnull(date_str):
        return np.nan

    # Trim surrounding whitespace first, then drop any remaining
    # carriage returns, newlines and tabs.
    cleaned = str(date_str).strip()
    for control_char in ("\r", "\n", "\t"):
        cleaned = cleaned.replace(control_char, "")

    # Supported layouts are tried in order; the first one that parses wins.
    for layout in ("%Y-%m-%d", "%Y/%m/%d", "%d %b %Y"):
        try:
            return datetime.strptime(cleaned, layout).strftime("%Y-%m-%d")
        except ValueError:
            continue

    return np.nan

In [12]:
def plot_categorical(df, column, title, top_n=20):
    """
    Draw a bar chart of the most frequent values of a categorical column.

    :param df: DataFrame
    :param column: Column name to plot
    :param title: Plot title
    :param top_n: Number of top categories to show
    """
    plt.figure(figsize=(12, 6))

    # value_counts() sorts by frequency, so head() keeps the top categories.
    top_counts = df[column].value_counts().head(top_n)
    top_counts.plot.bar()

    plt.title(title)
    plt.xlabel(column)
    plt.ylabel("Count")
    # Slanted, right-aligned labels stay readable for long category names.
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

def plot_numeric(df, column, title):
    """
    Draw a 30-bin histogram with a KDE overlay for a numeric column.

    Missing values are dropped before plotting.

    :param df: DataFrame
    :param column: Column name to plot
    :param title: Plot title
    """
    plt.figure(figsize=(10, 6))

    non_missing = df[column].dropna()
    sns.histplot(non_missing, kde=True, bins=30)

    plt.title(title)
    plt.xlabel(column)
    plt.ylabel("Count")
    plt.show()

def clean_text_column(df, column):
    """
    Normalise a text column: trim surrounding whitespace, then title-case.

    The input DataFrame is not modified; a new Series is returned.

    :param df: DataFrame
    :param column: Column name to clean
    :return: Cleaned Series
    """
    stripped = df[column].str.strip()
    return stripped.str.title()

def clean_speed(speed):
    """
    Convert one raw speed value to a float expressed in m/s.

    :param speed: Speed value to clean, e.g. "27.74 km/h", "6.83 m/s" or NaN
    :return: Speed in m/s as float, or NaN for missing values. Strings
        without a recognised unit are returned stripped but otherwise
        unchanged, and non-string, non-missing values are returned as-is.
    :raises ValueError: If the text next to a recognised unit is not a number
    """
    if isinstance(speed, str):
        # Only strings can be stripped; missing values arrive as float NaN.
        speed = speed.strip()
        if "km/h" in speed:
            # 1 m/s == 3.6 km/h
            return float(speed.replace("km/h", "")) / 3.6
        if "m/s" in speed:
            return float(speed.replace("m/s", ""))
        return speed

    if pd.isna(speed):
        return np.nan

    return speed

def clean_speed_column(df, speed_column):
    """
    Clean a speed column by applying clean_speed to each of its values.

    The column is replaced on the DataFrame that is passed in, and that same
    DataFrame is returned.

    :param df: The DataFrame in which the column is
    :param speed_column: Name of the column of speed values to clean
        and convert
    :return: The same DataFrame, with `speed_column` replaced by the
        cleaned values
    """
    # Series.apply returns a new Series and has no in-place mode, so the
    # result has to be assigned back to the column explicitly.
    df[speed_column] = df[speed_column].apply(clean_speed)

    return df
    
    

