In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# Folder holding the input data files. The trailing slash matters: file names
# are appended directly to this string when paths are built.
DATA_FOLDER = "./data/"

In [3]:
# Load the training charge curves from the data folder into a DataFrame.
df = pd.read_parquet(f"{DATA_FOLDER}chargecurves_train.parquet")

FileNotFoundError: [Errno 2] No such file or directory: './data/chargecurves_train.parquet'

In [None]:
# Summary statistics of the loaded frame, as a first sanity check.
df.describe()

In [None]:
def plot_timeseries(group):
    """Plot SOC, power and nominal power over time for one session.

    SOC is drawn against the left y-axis; power and nominal power share the
    right y-axis. The x-axis uses the real timestamps (displayed as HH:MM)
    instead of pre-formatted strings, so readings that fall in the same
    minute, or that are 24 hours apart, are not collapsed onto a single
    categorical tick and the spacing between points reflects elapsed time.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one session. Must contain the columns 'id',
        'timestamp' (datetime dtype), 'soc', 'power' and 'nominal_power',
        and at least one row.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Raises
    ------
    IndexError
        If ``group`` has no rows (the id is read from the first row).
    """
    # Local import keeps this function self-contained; matplotlib is already
    # a dependency of the notebook.
    import matplotlib.dates as mdates

    # Sort a copy so the line is drawn in chronological order and the
    # caller's frame is left untouched.
    group = group.sort_values('timestamp')
    group_id = group['id'].iloc[0]

    fig, ax1 = plt.subplots(figsize=(10, 6))

    soc_lines = ax1.plot(group['timestamp'], group['soc'],
                         linestyle='-', color='b', label='SOC')
    ax1.set_xlabel('Time')
    ax1.set_ylabel('State of Charge (SOC)', color='b')
    ax1.tick_params(axis='y', labelcolor='b')

    # Second y-axis sharing the same time axis for the power readings.
    ax2 = ax1.twinx()
    power_lines = ax2.plot(group['timestamp'], group['power'],
                           linestyle='-', color='r', label='Power')
    nominal_lines = ax2.plot(group['timestamp'], group['nominal_power'],
                             linestyle='--', color='k', label='Nominal Power')
    ax2.set_ylabel('Power', color='r')
    ax2.tick_params(axis='y', labelcolor='r')

    # Format and rotate the tick labels on ax1: after twinx() the current
    # axes is ax2, whose x-axis is hidden, so plt.xticks(rotation=...) would
    # not affect the labels that are actually visible.
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
    ax1.tick_params(axis='x', rotation=45)

    # One legend for the lines of both axes, attached to the top-most axes
    # so it is not painted over.
    ax2.legend(handles=soc_lines + power_lines + nominal_lines, loc='best')

    ax1.set_title(f'Group ID: {group_id}')
    ax1.grid(True, linestyle='--', alpha=0.5)

    plt.show()


### Data Preprocessing and Cleaning

In [None]:
# Drop every row that has at least one missing value. Rebinding the name
# (instead of inplace=True) keeps the step chainable and avoids mutating a
# frame that other references may still point to.
df = df.dropna()

Finding the sessions with few datapoints

In [None]:
# Group the readings per session id and count the non-null values per column.
grouped_df = df.groupby('id')
grouped_counts = grouped_df.count()

# Sessions with fewer than 5 timestamped readings are considered "short".
has_few_readings = grouped_counts['timestamp'].lt(5)
short_sessions_df = grouped_counts[has_few_readings]
short_sessions_ids = short_sessions_df.index.tolist()

A typical short session: with so few readings it is not very useful.

In [None]:
# Show the raw rows of one short session. Index 15 is a hard-coded pick and
# raises IndexError if fewer than 16 short sessions were found.
grouped_df.get_group(short_sessions_ids[15])

In [None]:
# Plot one short session (hard-coded index 15 into the short-session id list).
plot_timeseries(grouped_df.get_group(short_sessions_ids[15]))

In [None]:
# Plot the session whose id is 1 (presumably as a contrast to a short session).
# get_group raises KeyError if no session has that id.
plot_timeseries(grouped_df.get_group(1))

Removing all sessions with fewer than 5 logged readings

In [None]:
# Keep only rows whose session id is not in the short-session list.
# .copy() makes the result an independent frame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
df = df.loc[~df['id'].isin(short_sessions_ids)].copy()

### Feature Engineering

In [None]:
def compute_total_charged(group):
    """Approximate the energy delivered over one session.

    The rows are ordered by 'timestamp'; each reading's 'power' is multiplied
    by the time elapsed since the previous reading (in hours) and the products
    are summed. The first reading has no predecessor, so its term is NaN and
    is skipped by the sum.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one session with a datetime 'timestamp' column and a numeric
        'power' column (assumed to be in kW, which makes the result kWh).

    Returns
    -------
    float
        Sum of power * elapsed hours over the session; 0.0 for a single row.
    """
    ordered = group.sort_values('timestamp')

    # Hours elapsed between each reading and the one before it.
    elapsed_hours = ordered['timestamp'].diff().dt.total_seconds().div(3600.0)

    # Energy per interval (power * hours), accumulated over the session.
    return ordered['power'].mul(elapsed_hours).sum()

In [None]:
# Build one row per session (indexed by 'id') with its time span, nominal
# power and SOC gain. Named aggregation is used instead of groupby.apply with
# a per-group Series: it is vectorised and avoids the deprecation warning
# that apply emits when it operates on the grouping column.
session_df = df.groupby('id').agg(
    start_time=('timestamp', 'min'),
    end_time=('timestamp', 'max'),
    # 'first' picks the first non-null nominal_power of each session.
    nominal_power=('nominal_power', 'first'),
    soc_min=('soc', 'min'),
    soc_max=('soc', 'max'),
)
session_df['duration'] = session_df['end_time'] - session_df['start_time']
# 'total_charged' is the SOC gained over the session (max SOC - min SOC).
session_df['total_charged'] = session_df['soc_max'] - session_df['soc_min']
session_df = session_df.drop(columns=['soc_min', 'soc_max'])

duration_hours = session_df['duration'].dt.total_seconds() / 3600
# Zero-length sessions get NaN instead of +/-inf from a division by zero.
# NOTE(review): despite its name this column is SOC change per hour, not kW;
# the name is kept so existing references keep working.
session_df['charging_rate_kW'] = (
    session_df['total_charged'] / duration_hours.where(duration_hours > 0)
)

In [None]:
session_df.head()

Checking for seasonal differences

In [None]:
# Average 'total_charged' per session for each calendar month, based on the
# month in which the session started.
months = list(range(1, 13))
month_names = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]

start_month = session_df['start_time'].dt.month
avg_total_charged_by_month = []

for month, month_name in zip(months, month_names):
    # Mean over the sessions that started in this month (NaN if there are none).
    monthly_mean = session_df.loc[start_month == month, 'total_charged'].mean()
    avg_total_charged_by_month.append(monthly_mean)
    print(f'Month {month_name} avg total charged: {monthly_mean}')

# Bar chart of the monthly averages, one bar per month.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(months, avg_total_charged_by_month)

ax.set_xticks(months)
ax.set_xticklabels(month_names, rotation=45)

ax.set_xlabel('Month')
ax.set_ylabel('Average Total Charged per session')
ax.set_title('Average Total Charged per Session by Month')
fig.tight_layout()
plt.show()

### Visualizations

In [None]:
## Basic

# Convert timestamp to datetime and extract calendar features.
# assign() returns a new frame, so no columns are written into a frame that
# may be a filtered slice of another one (no SettingWithCopyWarning), and
# re-running the cell gives the same result.
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))
df = df.assign(
    weekday=df['timestamp'].dt.day_name(),   # name of the weekday
    hour=df['timestamp'].dt.hour,            # hour of the day
    month=df['timestamp'].dt.month_name(),   # name of the month
    date=df['timestamp'].dt.date,            # calendar date
)

# Count the readings per weekday, ordered Monday to Sunday.
weekday_activity = df.groupby('weekday').size().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
)

# Bar chart of the weekday activity. matplotlib.pyplot is already imported as
# plt at the top of the notebook, so it is not imported again here.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(weekday_activity.index, weekday_activity.values, color='skyblue')
ax.set_title("Aktivitet per ukedag", fontsize=14)
ax.set_xlabel("Ukedag", fontsize=12)
ax.set_ylabel("Antall aktiviteter", fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# Count the readings per hour of the day.
hour_activity = df.groupby('hour').size()

# Line chart of the hourly activity.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(hour_activity.index, hour_activity.values,
        marker='o', linestyle='-', color='blue')
ax.set_title("Aktivitet per klokkeslett", fontsize=14)
ax.set_xlabel("Klokkeslett", fontsize=12)
ax.set_ylabel("Antall aktiviteter", fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


# Count the readings per month, ordered January to December.
month_activity = df.groupby('month').size().reindex([
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
])

# Bar chart of the monthly activity.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(month_activity.index, month_activity.values, color='lightgreen')
ax.set_title("Aktivitet per måned", fontsize=14)
ax.set_xlabel("Måned", fontsize=12)
ax.set_ylabel("Antall aktiviteter", fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

##
# Both plots read from `df`, the frame the cleaning cells of this notebook
# operate on.

# 1. Distribution of SOC
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df['soc'], bins=30, edgecolor='black', alpha=0.7)
ax.set_xlabel("State of Charge (SOC) [%]")
ax.set_ylabel("Antall forekomster")
ax.set_title("Distribusjon av SOC (State of Charge)")
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# 2. Scatter plot of SOC vs. power
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df['soc'], df['power'], alpha=0.5)
ax.set_xlabel("State of Charge (SOC) [%]")
ax.set_ylabel("Power [kW]")
ax.set_title("Sammenheng mellom SOC og effekt")
ax.grid(linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Explore outliers

# Rows where the measured power exceeds the nominal power are treated as outliers.
exceeds_nominal = df['power'].gt(df['nominal_power'])
outliers = df.loc[exceeds_nominal]

# Count the outliers per location_id and sort with the largest count first.
location_outliers = (
    outliers.groupby('location_id')
    .size()
    .reset_index(name='Antall outliers')
    .sort_values(by='Antall outliers', ascending=False)
)

# Show the ten locations with the most outliers.
print(location_outliers.head(10))