In [1]:
import pandas as pd
import numpy as np
import datetime

In [15]:
# Read the Paris export (the file name gives the covered period) into a DataFrame.
# Comma separator and double-quote quoting are the read_csv defaults.
file_path = 'data/paris 2020-09-01 to 2021-10-31.csv'
external_data = pd.read_csv(file_path)

In [17]:
# Parse the timestamp column so it can be compared against a reference index.
external_data['datetime'] = pd.to_datetime(external_data['datetime'])

# Reference index with one entry per hour of the period named in the file.
# The step is given as a Timedelta rather than the '1H' alias, which newer
# pandas releases deprecate.
date_range = pd.date_range('2020-09-01 00:00:00', '2021-10-31 23:00:00', freq=pd.Timedelta(hours=1))
df_missing = pd.Series(external_data.datetime.unique())

# Hours of the reference index that have no row in the data (diagnostic only;
# display `missing_dates` to inspect them).
missing_dates = date_range[~date_range.isin(df_missing)]

# Fill 2021-03-28 02:00 with a copy of the 03:00 row, shifted back one hour.
# NOTE(review): presumably this is the hour skipped by the DST switch - confirm.
# Only the 'datetime' column is changed; every other value is copied as is.
impute_missing = external_data.loc[
    external_data['datetime'] == pd.Timestamp('2021-03-28 03:00:00')
].assign(datetime=lambda rows: rows['datetime'] - pd.Timedelta(hours=1))
external_data = pd.concat([external_data, impute_missing], ignore_index=True)


def _coerce_numeric(column):
    """Return `column` converted with pd.to_numeric, or unchanged if that fails.

    The 'datetime' column is returned untouched: pd.to_numeric would turn
    parsed timestamps into integers.
    """
    if column.name == 'datetime':
        return column
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        # Non-numeric column (e.g. free text): keep it as it is.
        return column


external_data = external_data.apply(_coerce_numeric)

# Keep the first row per timestamp, then renumber the rows 0..n-1 so that
# later positional/index-based joins do not run into gaps in the index.
external_data = external_data.drop_duplicates(subset=['datetime']).reset_index(drop=True)

In [7]:
# Columns that are removed from the frame before any further processing.
unused_columns = [
    'name', 'preciptype', 'windgust', 'feelslike',
    'visibility', 'snow', 'snowdepth', 'dew', 'winddir',
    'sealevelpressure', 'solarradiation', 'solarenergy',
    'uvindex', 'severerisk', 'conditions', 'icon', 'stations',
]

# In-place removal; raises KeyError if any listed column is absent
# (for example when this cell is run a second time).
external_data.drop(labels=unused_columns, axis='columns', inplace=True)


In [8]:
# Re-parse the 'datetime' column so it has a datetime dtype from here on.
parsed_timestamps = pd.to_datetime(external_data['datetime'])
external_data['datetime'] = parsed_timestamps

In [9]:
# Preview the first five rows of the cleaned frame.
external_data.head(n=5)

Unnamed: 0,datetime,temp,humidity,precip,precipprob,windspeed,cloudcover
0,2020-09-01 00:00:00,14.4,70.52,0.0,0,7.3,33.3
1,2020-09-01 01:00:00,13.8,74.22,0.0,0,7.4,10.0
2,2020-09-01 02:00:00,13.3,77.67,0.0,0,7.0,10.0
3,2020-09-01 03:00:00,12.9,79.13,0.0,0,4.2,10.0
4,2020-09-01 04:00:00,12.4,80.98,0.0,0,3.7,54.0


In [10]:
# Per column: True if the column contains at least one missing value.
external_data.isnull().any(axis=0)

datetime      False
temp          False
humidity      False
precip        False
precipprob    False
windspeed     False
cloudcover    False
dtype: bool

In [11]:
# Keep only the numeric columns; non-numeric ones (such as 'datetime') are left out.
numeric_features = external_data.select_dtypes(include='number')

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Learn each numeric column's minimum and maximum and rescale the column with
# them (default MinMaxScaler settings). `scaler` is kept so the fitted
# parameters remain available after this cell.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(numeric_features)

In [13]:
# Put the scaled values back into a frame that carries the SAME row labels as
# numeric_features (and therefore as external_data). A default 0..n-1 index
# would only line up with external_data if its index happened to be
# contiguous; otherwise timestamps get paired with the wrong feature rows and
# trailing rows are silently lost by the index join.
scaled_frame = pd.DataFrame(
    scaled_features,
    columns=numeric_features.columns,
    index=numeric_features.index,
)

# Attach the timestamps to their scaled features, matching rows by label.
scaled_external_data = external_data[['datetime']].join(scaled_frame, how='inner')

In [14]:
# Write the timestamp + scaled feature table to disk, without the row index.
output_path = 'data/scaled_weather_data.csv'
scaled_external_data.to_csv(output_path, index=False)