Import Libraries and Data

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Load the cleaned DataFrames written by the data preparation step.
# The pickles are read in the listed order and unpacked into their three names.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
charging_data, weather_data, merged_data = (
    pd.read_pickle(os.path.join('Data', pickle_name))
    for pickle_name in (
        'charging_data.pkl',
        'weather_data.pkl',
        'merged_charging_weather_data.pkl',
    )
)

In [None]:
# Investigate the empty stretch of time that was noticed in the data preparation scatter plot.
# Tag every session with the calendar month (a monthly Period) of its connection time.
connection_times = charging_data['connectionTime']
charging_data['year_month'] = connection_times.dt.to_period('M')

# Number of sessions recorded in each month
sessions_by_month = charging_data.groupby('year_month')
monthly_counts = sessions_by_month.size()

# Keep the Series as the last expression of the cell so the notebook renders it
monthly_counts

Observations: <br>
- Notably more charging sessions in 2018 and 2019, especially in the summer months (pre-COVID)
- Far fewer charging sessions during the COVID-19 pandemic (which started in March 2020)
- Session counts did not return to their previous level after the pandemic started

There appears to be no data for September and October 2020. Furthermore, August 2020 has only 30 records, which is notably fewer than in the other months.

In [None]:
# Line chart of the number of sessions per month over the whole observation period.
# Relies on the 'year_month' column already being present on charging_data.
sessions_per_month = (
    charging_data['connectionTime']
    .groupby(charging_data['year_month'])
    .size()
)
ax = sessions_per_month.plot(kind='line', marker='o', title='Sessions Over Time')
ax.set_xlabel('Month-Year')
ax.set_ylabel('Number of Sessions')
plt.show()

In [None]:
# Narrow the data to the suspicious window. The string bounds are parsed as
# midnight timestamps, so the upper bound admits sessions up to 2020-12-01 00:00:00.
filtered_data = charging_data[
    (charging_data['connectionTime'] >= '2020-08-01') &
    (charging_data['connectionTime'] <= '2020-12-01')
].copy()  # independent copy, so the new column below is not added to charging_data

# Calendar date of each connection
filtered_data['date'] = filtered_data['connectionTime'].dt.date

# Last session per site and calendar date.
# tail(1) returns the last row of each group in the frame's current row order, so the
# rows are sorted chronologically first; the stable sort keeps the existing order among
# rows with identical timestamps.
last_records_per_date = (
    filtered_data
    .sort_values('connectionTime', kind='mergesort')
    .groupby(['siteID', 'date'])
    .tail(1)
)

# Display the result
last_records_per_date

Final observation after looking at the filtered dataset: There is no charging data between 3rd August 2020 and 18th November 2020 for either site.

## Task 2: Descriptive Analytics

### a) Temporal Patterns and Seasonality

In [None]:
# Hour of the day at which each session was connected and disconnected
charging_data['connection_hour'] = charging_data['connectionTime'].dt.hour
charging_data['disconnect_hour'] = charging_data['disconnectTime'].dt.hour

# One slot per hour of the day; used as the common index and as x positions
hours = np.arange(0, 24)

# Sessions per hour for connection and disconnection. Reindexing onto the full
# range of hours inserts a 0 for hours without any event, so that every hour
# is displayed on the x-axis.
connection_hourly_sessions = (
    charging_data.groupby('connection_hour').size().reindex(hours, fill_value=0)
)
disconnect_hourly_sessions = (
    charging_data.groupby('disconnect_hour').size().reindex(hours, fill_value=0)
)

# Grouped bar chart: the two series are shifted left/right of each hour tick
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(hours - 0.2, connection_hourly_sessions, width=0.4, label='Connection Time', alpha=0.7)
ax.bar(hours + 0.2, disconnect_hourly_sessions, width=0.4, label='Disconnection Time', alpha=0.7)

ax.set_title('Charging Events by Hour of the Day')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Number of Charging Events')
ax.set_xticks(hours)
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

The chart shows the distribution of charging events at both sites across the hours of the day. <br>
Observations: <br>
- Noticeable peak of connectionTime in the morning (SOB - start of business) and of disconnectTime in the afternoon (EOB - end of business)
- Low activity during late-night hours

In [None]:
# Name of the weekday on which each session was connected
charging_data['weekday'] = charging_data['connectionTime'].dt.day_name()

# Number of charging sessions per weekday
weekday_sessions = charging_data.groupby('weekday').size()

# groupby returns the weekday names in sorted (alphabetical) order; put them in
# calendar order for the chart. fill_value=0 keeps a weekday without any session
# as an integer 0 instead of turning it into NaN (and the counts into floats).
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_sessions = weekday_sessions.reindex(ordered_days, fill_value=0)

# Bar chart of the number of sessions by day of the week
plt.figure(figsize=(8, 4))
plt.bar(weekday_sessions.index, weekday_sessions.values, color='blue')
plt.title('Charging Events by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Number of Charging Events')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

The chart shows the distribution of charging events at both sites across the days of the week. <br>
Observation: Strong weekday and weaker weekend activity, indicating a work- and university-related usage pattern

In [None]:
# Visualize seasonality over both sites
# Define a function to map months to seasons
def map_season(month):
    """Return the season name for a month number.

    December-February map to 'Winter', March-May to 'Spring' and
    June-August to 'Summer'. Every other value is reported as 'Fall'.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not matched above falls through to 'Fall'
    return 'Fall'

# Season of each session, derived from the month of its connection time
charging_data['season'] = charging_data['connectionTime'].dt.month.map(map_season)

# Number of charging sessions per season
seasonal_sessions = charging_data.groupby('season').size()

# Put the seasons in calendar order for the chart. fill_value=0 keeps a season
# without any session as an integer 0 instead of turning it into NaN.
ordered_seasons = ['Winter', 'Spring', 'Summer', 'Fall']
seasonal_sessions = seasonal_sessions.reindex(ordered_seasons, fill_value=0)

# Bar chart of the number of sessions by season
plt.figure(figsize=(8, 4))
plt.bar(seasonal_sessions.index, seasonal_sessions.values, color='blue')
plt.title('Charging Events by Season')
plt.xlabel('Season')
plt.ylabel('Number of Charging Events')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

The chart shows the distribution of charging events at both sites across the seasons (winter, spring, summer, fall). <br>
Keep in mind that the charging dataset contains no records between 3rd August 2020 and 18th November 2020, so summer and fall are undercounted; with complete data they would presumably show even more charging events. <br>
Observations: <br>
- Winter: Fewer outdoor activities could result in lower public charging demand
- Summer: Higher EV usage for outdoor activities may lead to increased charging demand

### Weather-dependent analysis

In [None]:
# Weather variables and charging-behaviour metrics that enter the correlation
correlation_columns = [
    'temperature', 'precipitation', 'windspeed',
    'kWhDelivered', 'charging_duration', 'connection_duration',
]
correlation_data = merged_data[correlation_columns]

# Pairwise correlation between all selected columns (DataFrame.corr defaults)
correlation_matrix = correlation_data.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix: Weather Variables and Charging Behavior')
plt.show()

# Individual weather-vs-charging coefficients, kept as named scalars
temp_energy_corr = correlation_matrix.at['temperature', 'kWhDelivered']
precip_duration_corr = correlation_matrix.at['precipitation', 'charging_duration']
wind_connection_corr = correlation_matrix.at['windspeed', 'connection_duration']

Observations from the Correlation Matrix: <br>
- Minimal impact of weather variables: temperature, precipitation and wind speed have minimal influence on energy delivered, charging duration and connection duration
- The most significant relationships are among the charging behavior metrics themselves: charging duration and kWhDelivered increase as the connection duration increases, as expected