In [1]:
# Import packages
import pandas as pd

### Import Data

In [64]:
# Raw export files to consolidate into a single dataframe
files=[
    'csv_results_42_255441_mp-03-naamsestraat-62-taste.csv',
    'csv_results_42_255442_mp-05-calvariekapel-ku-leuven.csv',
    'csv_results_42_255443_mp-06-parkstraat-2-la-filosovia.csv',
    'csv_results_42_255444_mp-07-naamsestraat-81.csv',
]

# Directory holding the raw exports, relative to the notebook's working directory
raw_dir = '../data/raw/export_42/'

# Read every semicolon-separated csv and concatenate them in a single call.
# Growing a dataframe inside a loop re-copies all previously loaded rows on
# each iteration, and concatenating onto an empty DataFrame is deprecated in
# recent pandas. ignore_index=True gives the combined frame a unique
# 0..n-1 index instead of repeating each file's own row numbers.
df = pd.concat(
    [pd.read_csv(raw_dir + file, sep=';') for file in files],
    ignore_index=True,
)

### Preprocess Data

In [65]:
# Parse 'result_timestamp' (day/month/year with fractional seconds) into
# datetimes and floor each value to the start of its hour, then drop the
# text-only unit columns.
timestamp_format = '%d/%m/%Y %H:%M:%S.%f'
unit_columns = ['lamax_unit','laeq_unit','lceq_unit','lcpeak_unit']

df = (
    df
    .assign(
        result_timestamp=lambda frame: pd.to_datetime(
            frame['result_timestamp'], format=timestamp_format
        ).dt.floor('h')
    )
    .drop(unit_columns, axis=1)
)

In [66]:
# Preview the first rows of the preprocessed frame
df.head()

Unnamed: 0,#object_id,description,result_timestamp,lamax,laeq,lceq,lcpeak
0,255441,MP 03: Naamsestraat 62 Taste,2022-01-01,87.6,82.7,83.61,97.17
1,255441,MP 03: Naamsestraat 62 Taste,2022-01-01,84.5,83.1,84.42,96.41
2,255441,MP 03: Naamsestraat 62 Taste,2022-01-01,84.8,82.7,84.19,96.24
3,255441,MP 03: Naamsestraat 62 Taste,2022-01-01,81.9,79.3,81.08,94.03
4,255441,MP 03: Naamsestraat 62 Taste,2022-01-01,78.3,76.0,77.12,89.81


In [67]:
# Measurement columns are split by how they are aggregated further down:
# max_cols are reduced with the maximum, avg_cols with the mean.
max_cols = ['lamax','lcpeak']
avg_cols = ['laeq','lceq']

# Group the readings by object id, description and timestamp; the aggregations
# themselves are applied to this groupby object in the following cells.
group_keys = ['#object_id','description','result_timestamp']
df_group = df.groupby(by=group_keys)

In [68]:
# Mean of the avg_cols per group. Selecting the columns before aggregating
# avoids computing (and then discarding) the means of the max_cols.
# NOTE(review): this is an arithmetic mean of the stored values; if they are
# decibel levels, confirm this is intended rather than an energetic
# (logarithmic) average.
df_group_avg = df_group[avg_cols].mean()

In [69]:
# Preview the first rows of the per-group mean values
df_group_avg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,laeq,lceq
#object_id,description,result_timestamp,Unnamed: 3_level_1,Unnamed: 4_level_1
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 00:00:00,57.126833,63.10465
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 01:00:00,50.853806,58.648786
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 02:00:00,50.049903,58.282633
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 03:00:00,48.964907,57.793745
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 04:00:00,47.285893,55.967906


In [70]:
# Maximum of the max_cols per group. Selecting the columns before aggregating
# avoids computing (and then discarding) the maxima of the avg_cols.
df_group_max = df_group[max_cols].max()

In [71]:
# Preview the first rows of the per-group maximum values
df_group_max.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lamax,lcpeak
#object_id,description,result_timestamp,Unnamed: 3_level_1,Unnamed: 4_level_1
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 00:00:00,103.1,120.59
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 01:00:00,85.1,103.78
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 02:00:00,84.5,95.64
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 03:00:00,75.9,96.3
255441,MP 03: Naamsestraat 62 Taste,2022-01-01 04:00:00,71.1,90.59


In [72]:
# Put the mean and max aggregates side by side (mean columns first),
# aligned on their shared group index
aggregated_parts = [df_group_avg, df_group_max]
df_group_combined = pd.concat(aggregated_parts, axis='columns')

In [73]:
# Constant unit label for each measurement column (the unit columns were
# dropped before aggregating)
unit_labels = {
    'lamax_unit': 'dB(A)',
    'laeq_unit': 'dB(A)',
    'lceq_unit': 'dB(C)',
    'lcpeak_unit': 'dB(C)',
}

df_group_combined = (
    df_group_combined
    # Re-attach the unit columns
    .assign(**unit_labels)
    # Turn the group keys from index levels back into regular columns
    .reset_index()
    # Derive day-of-month and hour-of-day from the timestamp
    .assign(
        day=lambda frame: frame['result_timestamp'].dt.day,
        hour=lambda frame: frame['result_timestamp'].dt.hour,
    )
)

In [74]:
# Preview the first rows of the combined, aggregated frame
df_group_combined.head()

Unnamed: 0,#object_id,description,result_timestamp,laeq,lceq,lamax,lcpeak,lamax_unit,laeq_unit,lceq_unit,lcpeak_unit,day,hour
0,255441,MP 03: Naamsestraat 62 Taste,2022-01-01 00:00:00,57.126833,63.10465,103.1,120.59,dB(A),dB(A),dB(C),dB(C),1,0
1,255441,MP 03: Naamsestraat 62 Taste,2022-01-01 01:00:00,50.853806,58.648786,85.1,103.78,dB(A),dB(A),dB(C),dB(C),1,1
2,255441,MP 03: Naamsestraat 62 Taste,2022-01-01 02:00:00,50.049903,58.282633,84.5,95.64,dB(A),dB(A),dB(C),dB(C),1,2
3,255441,MP 03: Naamsestraat 62 Taste,2022-01-01 03:00:00,48.964907,57.793745,75.9,96.3,dB(A),dB(A),dB(C),dB(C),1,3
4,255441,MP 03: Naamsestraat 62 Taste,2022-01-01 04:00:00,47.285893,55.967906,71.1,90.59,dB(A),dB(A),dB(C),dB(C),1,4


In [75]:
# Column dtypes, non-null counts and memory usage of the aggregated frame
df_group_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2964 entries, 0 to 2963
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   #object_id        2964 non-null   int64         
 1   description       2964 non-null   object        
 2   result_timestamp  2964 non-null   datetime64[ns]
 3   laeq              2964 non-null   float64       
 4   lceq              2964 non-null   float64       
 5   lamax             2964 non-null   float64       
 6   lcpeak            2964 non-null   float64       
 7   lamax_unit        2964 non-null   object        
 8   laeq_unit         2964 non-null   object        
 9   lceq_unit         2964 non-null   object        
 10  lcpeak_unit       2964 non-null   object        
 11  day               2964 non-null   int64         
 12  hour              2964 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(3), object(5)
memory usage: 301.2+ KB


In [76]:
# (rows, columns) of the aggregated frame
df_group_combined.shape

(2964, 13)

In [78]:
# Summary statistics for every column of the aggregated frame.
# pandas >= 2.0 removed the `datetime_is_numeric` keyword (datetime columns are
# always summarised numerically there), so passing it raises TypeError; older
# versions still need it to report mean/min/quantiles/max for
# 'result_timestamp'. Try the old call first and fall back to the new one.
try:
    df_group_summary = df_group_combined.describe(include='all', datetime_is_numeric=True)
except TypeError:
    df_group_summary = df_group_combined.describe(include='all')
df_group_summary

Unnamed: 0,#object_id,description,result_timestamp,laeq,lceq,lamax,lcpeak,lamax_unit,laeq_unit,lceq_unit,lcpeak_unit,day,hour
count,2964.0,2964,2964,2964.0,2964.0,2964.0,2964.0,2964,2964,2964,2964,2964.0,2964.0
unique,,4,,,,,,1,1,1,1,,
top,,MP 03: Naamsestraat 62 Taste,,,,,,dB(A),dB(A),dB(C),dB(C),,
freq,,744,,,,,,2964,2964,2964,2964,,
mean,255442.497976,,2022-01-16 10:32:54.898785280,49.02048,58.733293,79.638428,97.953357,,,,,15.959514,11.520243
min,255441.0,,2022-01-01 00:00:00,32.649972,44.058219,44.9,69.44,,,,,1.0,0.0
25%,255441.0,,2022-01-08 17:00:00,45.215604,55.335851,76.075,95.0,,,,,8.0,6.0
50%,255442.0,,2022-01-16 10:00:00,51.708556,60.481903,79.15,98.27,,,,,16.0,12.0
75%,255444.0,,2022-01-24 03:00:00,54.477569,63.151218,82.8,101.205,,,,,24.0,18.0
max,255444.0,,2022-01-31 23:00:00,66.624139,75.055814,108.5,121.27,,,,,31.0,23.0


In [21]:
# Export results (disabled; uncomment the line below to write the processed CSV)
# df_group_combined.to_csv('../data/processed/export42_noise_level.csv',index=False)