In [26]:
import pandas as pd
import numpy as np

In [16]:
df = pd.read_csv("./../data/integrated-data/data_raw.csv")

In [17]:
df.head()

Unnamed: 0.1,Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,Final energy consumption (ktoe),59460.8,63794.9,61051.4,60130.2,58013.5,56906.1,57147.7,47510.0,53525.6,51367.4,55185.7,47418.3,49562.4,52800.0,44748.5,46513.7
1,Population,82163475.0,82259540.0,82440309.0,82536680.0,82531671.0,82500849.0,82437995.0,82314906.0,82217837.0,82002356.0,81802257.0,80222065.0,80327900.0,80523746.0,80767463.0,81197537.0
2,Number of households,38136234.0,38317671.0,38540012.0,38724361.0,38862308.0,38989059.0,39331104.0,39593509.0,40008680.0,40079353.0,40296678.0,39733564.0,39984022.0,39823811.0,40023520.0,40558210.0
3,Inhabitants per household,2.154,2.147,2.139,2.131,2.124,2.116,2.096,2.079,2.055,2.046,2.03,2.019,2.009,2.022,2.018,2.002
4,Actual heating degree-days,2803.0,3137.0,2977.0,3140.0,3187.0,3148.0,3021.0,2815.0,2985.0,3081.0,3630.0,2872.0,3130.0,3288.0,2661.0,2909.0


In [36]:
rows = ['Population', 'Number of households', 'Inhabitants per household',
    'Actual heating degree-days', 'Actual cooling degree-days',
    'Gross domestic product', 'Household consumption expenditure']

parent_categories = ['Space heating', 'Space cooling', 'Water heating', 'Cooking']

df_transpose = df[df.iloc[:, 0].isin(rows)].copy()
df_transpose.set_index(df_transpose.columns[0], inplace=True)
df_transpose = df_transpose.T
df_transpose.index.name = "Year"
df_transpose.reset_index(inplace=True)


In [41]:
df_transpose

Unnamed: 0,Year,Population,Households,Inhabitants per household,HDD,CDD,GDP,Expenditure
0,2000,82163475.0,38136234.0,2.154,2803.0,8.0,2358694.0,1379742.0
1,2001,82259540.0,38317671.0,2.147,3137.0,13.0,2398682.0,1402017.0
2,2002,82440309.0,38540012.0,2.139,2977.0,15.0,2398691.0,1391177.0
3,2003,82536680.0,38724361.0,2.131,3140.0,56.0,2381651.0,1392767.0
4,2004,82531671.0,38862308.0,2.124,3187.0,5.0,2409529.0,1403317.0
5,2005,82500849.0,38989059.0,2.116,3148.0,10.0,2426556.0,1409110.0
6,2006,82437995.0,39331104.0,2.096,3021.0,35.0,2516323.0,1430214.0
7,2007,82314906.0,39593509.0,2.079,2815.0,10.0,2598379.0,1430224.0
8,2008,82217837.0,40008680.0,2.055,2985.0,10.0,2626510.0,1438747.0
9,2009,82002356.0,40079353.0,2.046,3081.0,4.0,2478921.0,1440923.0


In [38]:
for col in df_transpose.columns:
    if col != 'Year':
        df_transpose[col] = df_transpose[col].astype(str).str.replace(',', '').astype(float)

In [40]:
rename_map = {
    'Actual heating degree-days': 'HDD',
    'Actual cooling degree-days': 'CDD',
    'Gross domestic product': 'GDP',
    'Household consumption expenditure': 'Expenditure',
    'Number of households': 'Households'
}

df_transpose.rename(columns=rename_map, inplace=True)

In [43]:
panel_data = []
current_end_use = None

for index, row in df.iterrows():
    label = str(row.iloc[0]).strip()
    
    if label in parent_categories:
        current_end_use = label
        continue 

    if current_end_use and label not in rows and label != 'Final energy consumption (ktoe)':
    
        row_data = row.iloc[1:].to_dict()
        for year, value in row_data.items():
            panel_data.append({
                'Year': year,
                'End_Use': current_end_use,
                'Fuel_Technology': label,
                'Energy_ktoe': value
            })

In [44]:
df_energy = pd.DataFrame(panel_data)
df_energy['Energy_ktoe'] = df_energy['Energy_ktoe'].astype(str).str.replace(',', '')
df_energy['Energy_ktoe'] = pd.to_numeric(df_energy['Energy_ktoe'], errors='coerce')
df_energy

Unnamed: 0,Year,End_Use,Fuel_Technology,Energy_ktoe
0,2000,Space heating,Solids,813.000
1,2001,Space heating,Solids,778.000
2,2002,Space heating,Solids,648.000
3,2003,Space heating,Solids,669.000
4,2004,Space heating,Solids,538.000
...,...,...,...,...
395,2011,Cooking,Electricity,1360.192
396,2012,Cooking,Electricity,1329.246
397,2013,Cooking,Electricity,1248.194
398,2014,Cooking,Electricity,1309.073


In [50]:
df_transpose['Year'] = df_transpose['Year'].astype(int)
df_energy['Year'] = df_energy['Year'].astype(int)

df_final = pd.merge(df_energy, df_transpose, on='Year', how='left')
df_final = df_final.sort_values(by=['Year', 'End_Use', 'Fuel_Technology'])
df_final

Unnamed: 0,Year,End_Use,Fuel_Technology,Energy_ktoe,Population,Households,Inhabitants per household,HDD,CDD,GDP,Expenditure
368,2000,Cooking,Biomass and wastes,,82163475.0,38136234.0,2.154,2803.0,8.0,2358694.0,1379742.0
384,2000,Cooking,Electricity,1430.199,82163475.0,38136234.0,2.154,2803.0,8.0,2358694.0,1379742.0
352,2000,Cooking,Gases incl. biogas,1575.000,82163475.0,38136234.0,2.154,2803.0,8.0,2358694.0,1379742.0
336,2000,Cooking,Liquified petroleum gas (LPG),586.700,82163475.0,38136234.0,2.154,2803.0,8.0,2358694.0,1379742.0
320,2000,Cooking,Solids,,82163475.0,38136234.0,2.154,2803.0,8.0,2358694.0,1379742.0
...,...,...,...,...,...,...,...,...,...,...,...
239,2015,Water heating,Gases incl. biogas,2950.000,81197537.0,40558210.0,2.002,2909.0,55.0,2800922.0,1534790.0
271,2015,Water heating,Geothermal energy,,81197537.0,40558210.0,2.002,2909.0,55.0,2800922.0,1534790.0
207,2015,Water heating,Liquified petroleum gas (LPG),56.000,81197537.0,40558210.0,2.002,2909.0,55.0,2800922.0,1534790.0
319,2015,Water heating,Solar,633.000,81197537.0,40558210.0,2.002,2909.0,55.0,2800922.0,1534790.0


In [None]:
df_final["Energy_ktoe"].fillna(0, inplace=True)
df_final.reset_index(drop=True, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_final["Energy_ktoe"].fillna(0, inplace=True)


(400, 11)

In [65]:
df_final.shape

(400, 11)

In [66]:
df_final.to_csv('./../data/integrated-data/residential_panel_data.csv', index=False)