# Import Modules

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder



In [2]:
# Lift the column display limit so wide frames are shown without truncation.
pd.options.display.max_columns = None

# Import Data

In [3]:
# Load the processed CSV inputs used to build the modelling table.
weather = pd.read_csv('../data/processed/model_weather.csv')
event = pd.read_csv('../data/processed//model_event.csv')
noise_event = pd.read_csv('../data/processed//model_export41_noise_event.csv')
noise_measurement = pd.read_csv('../data/processed/export40_noise_measurements.csv')

# Export 42 data: only the columns listed in `cols` are kept, so only those
# are parsed (`usecols`). `usecols` alone returns columns in file order, so the
# trailing `[cols]` selection keeps them in the order given by `cols`.
cols = ['#object_id','result_timestamp','day','hour','lamax','laeq']
noise_measurement_2 = pd.read_csv('../data/processed/export42_update.csv', usecols=cols)[cols]

In [4]:
# Give every table's timestamp column the common name 'DATEUTC'.
event = event.rename({'date': 'DATEUTC'}, axis='columns')
noise_event = noise_event.rename({'hourly_timestamp': 'DATEUTC'}, axis='columns')

# Both noise measurement exports use the same source column name.
measurement_key = {'result_timestamp': 'DATEUTC'}
noise_measurement = noise_measurement.rename(columns=measurement_key)
noise_measurement_2 = noise_measurement_2.rename(columns=measurement_key)

# Merge Data

In [5]:
# Build the combined table: start from the noise measurements and join the
# weather and event tables on 'DATEUTC' (default inner joins), then attach the
# noise-event columns with a left join so rows without a matching noise event
# are kept.
df_noise_final = (
    noise_measurement
    .merge(weather, on='DATEUTC')
    .merge(event, on='DATEUTC')
    .merge(noise_event, on=['DATEUTC', 'description'], how='left')
)

# Column Selection

In [6]:
# Columns that are not needed for modelling.
drop_cols = ['Date','Month','Hour','Day','result_date','result_isoweek',
            '#object_id', 'DATEUTC',
            'LC_RAD60','event_count']

# Every column whose name contains 'laf' is dropped as well, except those
# containing 'laf50'.
laf_cols = [c for c in df_noise_final.columns if 'laf' in c and 'laf50' not in c]
drop_cols = drop_cols + laf_cols

# Rebind the name to the trimmed frame instead of mutating the merged frame
# in place (`inplace=True`).
df_noise_final = df_noise_final.drop(columns=drop_cols)

In [7]:
# Summarise the remaining columns: dtype and non-null count per column.
df_noise_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50320 entries, 0 to 50319
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   description        50320 non-null  object 
 1   laf50_per_hour     50320 non-null  float64
 2   result_hour        50320 non-null  int64  
 3   result_dayofweek   50320 non-null  int64  
 4   result_month       50320 non-null  int64  
 5   Temperature        50320 non-null  float64
 6   LC_HUMIDITY        50320 non-null  float64
 7   LC_RAININ          50320 non-null  float64
 8   LC_WINDSPEED       50320 non-null  float64
 9   break              50320 non-null  int64  
 10  class_human        21724 non-null  float64
 11  class_music        21724 non-null  float64
 12  class_transport    21724 non-null  float64
 13  class_unsupported  21724 non-null  float64
 14  class_wind         21724 non-null  float64
dtypes: float64(10), int64(4), object(1)
memory usage: 6.1+ MB


# Output CSV

In [8]:
# Write the table to CSV; the row index is not written.
output_path = '../data/processed/model_data_bef_engineering.csv'
df_noise_final.to_csv(output_path, index=False)