In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the gzip-compressed CSV export into a DataFrame and preview it.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact file exists.
csv_path = "C:/Users/souha/downloads/ckw_opendata_smartmeter_dataset_b_202503.csv.gz"
df = pd.read_csv(csv_path, compression='gzip')
df.head()


Unnamed: 0,area_code,timestamp,num_meter,value_kwh
0,6026,2025-02-28T23:00:00.000Z,1398,222.426
1,6026,2025-02-28T23:15:00.000Z,1398,228.109
2,6026,2025-02-28T23:30:00.000Z,1398,247.094
3,6026,2025-02-28T23:45:00.000Z,1398,257.829
4,6026,2025-03-01T00:00:00.000Z,1398,264.8


In [4]:
# Parse the timestamp strings as UTC-aware datetimes and convert them to the
# 'CET' time zone in one chained expression.
df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True).dt.tz_convert('CET')
df.head()

Unnamed: 0,area_code,timestamp,num_meter,value_kwh
0,6026,2025-03-01 00:00:00+01:00,1398,222.426
1,6026,2025-03-01 00:15:00+01:00,1398,228.109
2,6026,2025-03-01 00:30:00+01:00,1398,247.094
3,6026,2025-03-01 00:45:00+01:00,1398,257.829
4,6026,2025-03-01 01:00:00+01:00,1398,264.8


In [5]:
# Dimensions of the frame as a (rows, columns) tuple
df.shape

(211626, 4)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,area_code,num_meter,value_kwh
count,211626.0,211626.0,211626.0
mean,6135.108337,1353.36436,148.359873
std,196.793921,1844.700341,172.488742
min,4806.0,11.0,0.0
25%,6044.0,379.0,41.81625
50%,6173.0,760.0,90.7105
75%,6236.0,1509.0,191.31825
max,6344.0,12701.0,1674.002


In [7]:
# Promote the timestamp column to the row index; the result is rebound to `df`
# rather than mutating the frame in place.
df = df.set_index('timestamp')
df.head()

Unnamed: 0_level_0,area_code,num_meter,value_kwh
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-03-01 00:00:00+01:00,6026,1398,222.426
2025-03-01 00:15:00+01:00,6026,1398,228.109
2025-03-01 00:30:00+01:00,6026,1398,247.094
2025-03-01 00:45:00+01:00,6026,1398,257.829
2025-03-01 01:00:00+01:00,6026,1398,264.8


In [8]:
# Order the rows by their timestamp index; the sorted frame is rebound to `df`.
df = df.sort_index()
df.head()

Unnamed: 0_level_0,area_code,num_meter,value_kwh
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-03-01 00:00:00+01:00,6026,1398,222.426
2025-03-01 00:00:00+01:00,6247,2236,327.614
2025-03-01 00:00:00+01:00,6214,1529,244.19
2025-03-01 00:00:00+01:00,6206,2225,302.33
2025-03-01 00:00:00+01:00,6203,820,115.49


In [9]:
# Summary of the area_code column: distinct values, rows per value, nulls.
area_code_col = df['area_code']

# How many distinct area codes are present
unique_area_codes = area_code_col.nunique()
print("Number of unique area codes:", unique_area_codes, "\n")

# Row count per area code, most frequent first
area_codes_counts = area_code_col.value_counts()
print("Frequency Distribution of 'area code':")
print(area_codes_counts, "\n")

# Count of null entries in the column
missing_area_codes = area_code_col.isna().sum()
print("Number of missing values in 'area code':", missing_area_codes, "\n")

Number of unique area codes: 116 

Frequency Distribution of 'area code':
area_code
6182    1831
6174    1831
6196    1830
6192    1830
6023    1830
        ... 
6263    1824
6276    1824
6295    1824
6285    1824
6264    1824
Name: count, Length: 116, dtype: int64 

Number of missing values in 'area code': 0 



In [11]:
# Summary of the num_meter column: distinct values and nulls.
num_meter_col = df['num_meter']

# How many distinct num_meter values are present
unique_num_meter = num_meter_col.nunique()
print("Number of unique num_meter:", unique_num_meter, "\n")

# Count of null entries in the column
missing_num_meter = num_meter_col.isna().sum()
print("Number of missing values in 'num_meter':", missing_num_meter, "\n")

Number of unique num_meter: 1439 

Number of missing values in 'num_meter': 0 



In [12]:
# Count of null entries in the value_kwh column
value_kwh_col = df['value_kwh']
missing_value_kwh = value_kwh_col.isna().sum()
print("Number of missing values in 'value_kwh':", missing_value_kwh, "\n")

Number of missing values in 'value_kwh': 0 



In [14]:
import plotly.graph_objects as go

# Aggregate value_kwh (all area codes) into 100 equal-width bins so that only
# the per-bin counts are passed to the bar trace.
hist, bin_edges = np.histogram(df['value_kwh'], bins=100)

# go.Bar centres each bar on its x value, so the bars are placed at the bin
# midpoints. Using the left bin edges as x would draw every bar half a bin
# too far to the left (and push the first bar below 0 kWh).
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# One bar per bin, each as wide as its bin, coloured #204251
fig = go.Figure(
    data=[go.Bar(x=bin_centers, y=hist, width=np.diff(bin_edges), marker_color='#204251')]
)

fig.update_layout(
    title='Histogram of value_kwh in dataset b',
    xaxis_title='Value (kWh)',
    yaxis_title='Frequency'
)

fig.show(renderer='browser')

In [15]:
import plotly.graph_objects as go

# Upper bound on the number of readings passed to the box trace.
BOX_SAMPLE_SIZE = 20_000

# Draw a reproducible random sample of the value_kwh column. The box trace
# must receive this single numeric column, not the whole DataFrame, and the
# sample is what the "(Sampled)" title refers to. min() keeps the call valid
# when the frame has fewer rows than the cap.
value_kwh_sample = df['value_kwh'].sample(
    n=min(BOX_SAMPLE_SIZE, len(df)),
    random_state=42,  # fixed seed so re-running the cell yields the same plot
)

# Box plot of the sampled values, coloured #204251
fig = go.Figure(
    data=[go.Box(y=value_kwh_sample, marker_color='#204251')]
)

fig.update_layout(
    title='Box Plot of value_kwh (Sampled)',
    yaxis_title='Value (kWh)'
)

fig.show(renderer='browser')


In [20]:
import plotly.express as px

# Keep only the rows of one area code, ordered by their timestamp index
selected_area_code = 6264
df_filtered = df[df['area_code'] == selected_area_code].sort_index()

# Line plot of value_kwh over time for the selected area code.
# The index is passed directly as x. Plotly express names that axis after the
# index ('timestamp'), so the label override must be keyed by 'timestamp';
# a key of 'x' would not match and the custom axis title would be ignored.
fig = px.line(
    df_filtered,
    x=df_filtered.index,  # the DatetimeIndex named 'timestamp'
    y='value_kwh',
    title=f"Time Series of kWh for area_code {selected_area_code}",
    labels={'timestamp': 'Timestamp', 'value_kwh': 'Energy Consumption (kWh)'},
    color_discrete_sequence=['#204251']
)
fig.show(renderer='browser')

