In [26]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [19]:
train = pd.read_csv('../data/train.csv', index_col=0)
test = pd.read_csv('../data/test.csv', index_col=0)
weather = pd.read_csv('../data/weather.csv', index_col=0)
meta = pd.read_csv('../data/metadata.csv')
holidays = pd.read_csv('../data/holidays.csv', delimiter=';')

### Weather Data

In [2]:
weather.head()

Unnamed: 0,Timestamp,Temperature,Distance,SiteId
78064,2013-12-31 19:00:00,-7.2,24.889929,1
86746,2013-12-31 19:00:00,-8.3,23.303097,1
90002,2013-12-31 19:00:00,-7.8,20.952256,1
90003,2013-12-31 19:00:00,-8.0,20.952256,1
100541,2013-12-31 19:34:00,-8.1,16.610602,1


### Meta Data

In [9]:
meta.isna().sum()

SiteId               0
Surface              0
Sampling             0
BaseTemperature      0
MondayIsDayOff       0
TuesdayIsDayOff      0
WednesdayIsDayOff    0
ThursdayIsDayOff     0
FridayIsDayOff       0
SaturdayIsDayOff     0
SundayIsDayOff       0
dtype: int64

In [3]:
meta.head(20)

Unnamed: 0,SiteId,Surface,Sampling,BaseTemperature,MondayIsDayOff,TuesdayIsDayOff,WednesdayIsDayOff,ThursdayIsDayOff,FridayIsDayOff,SaturdayIsDayOff,SundayIsDayOff
0,1,1387.205119,15.0,18.0,False,False,False,False,False,True,True
1,2,6098.278376,30.0,18.0,False,False,False,False,False,True,True
2,3,10556.293605,5.0,18.0,False,False,False,False,False,True,False
3,5,12541.181277,30.0,18.0,False,False,False,False,False,True,True
4,6,9150.195373,30.0,18.0,False,False,False,False,False,True,True
5,7,15168.125971,30.0,18.0,False,False,False,False,False,True,True
6,8,22221.851847,30.0,18.0,False,False,False,False,False,True,True
7,9,14588.849015,30.0,18.0,False,False,False,False,False,True,True
8,10,6393.671251,30.0,18.0,False,False,False,False,False,True,True
9,11,2517.739425,30.0,18.0,False,False,False,False,False,True,True


In [10]:
meta.isna().sum()

SiteId               0
Surface              0
Sampling             0
BaseTemperature      0
MondayIsDayOff       0
TuesdayIsDayOff      0
WednesdayIsDayOff    0
ThursdayIsDayOff     0
FridayIsDayOff       0
SaturdayIsDayOff     0
SundayIsDayOff       0
dtype: int64

### Holidays Data

In [12]:
holidays.head()

Unnamed: 0,Date,Holiday,SiteId
0,2016-12-23,Christmas Eve (Observed),1
1,2016-12-24,Christmas Eve,1
2,2017-07-04,Independence Day,1
3,2014-11-04,Election Day,1
4,2016-09-05,Labor Day,12


In [11]:
holidays.isna().sum()

Date       0
Holiday    0
SiteId     0
dtype: int64

### Test Data

In [4]:
test.query('SiteId == 20')

Unnamed: 0_level_0,SiteId,Timestamp,ForecastId,Value
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3769391,20,2016-02-09 22:00:00,634,173322.308988
5158850,20,2016-02-09 23:00:00,634,175211.436698
6033149,20,2016-02-10 00:00:00,634,227494.864074
5960648,20,2016-02-10 01:00:00,634,288062.871938
7821621,20,2016-02-10 02:00:00,634,298587.455325
...,...,...,...,...
6524424,20,2017-11-04 20:00:00,647,164162.489030
2415874,20,2017-11-04 21:00:00,647,165767.034135
1432775,20,2017-11-04 22:00:00,647,165227.009158
6335607,20,2017-11-04 23:00:00,647,166389.390454


In [21]:
test.head()

Unnamed: 0_level_0,SiteId,Timestamp,ForecastId,Value
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1677832,1,2015-08-29 00:00:00,1,7413780.0
5379616,1,2015-08-30 00:00:00,1,8927612.0
496261,1,2015-08-31 00:00:00,1,7288439.0
4567147,1,2015-09-01 00:00:00,1,8399679.0
3684873,1,2015-09-02 00:00:00,1,7576456.0


In [40]:
test.duplicated(subset=['SiteId', 'Timestamp'])

obs_id
1677832    False
5379616    False
496261     False
4567147    False
3684873    False
           ...  
6226412    False
4466872    False
2951966    False
6044913    False
6704022    False
Length: 1309176, dtype: bool

In [7]:
test.count()

SiteId        1309176
Timestamp     1309176
ForecastId    1309176
Value         1290114
dtype: int64

In [6]:
test.isna().sum()

SiteId            0
Timestamp         0
ForecastId        0
Value         19062
dtype: int64

### Train Data

In [8]:
train.count()

SiteId        6559830
Timestamp     6559830
ForecastId    6559830
Value         6473229
dtype: int64

In [5]:
train.isna().sum()


SiteId            0
Timestamp         0
ForecastId        0
Value         86601
dtype: int64

In [44]:
visualized_train = train[['Value', 'SiteId']]
visualized_train['Value'] = visualized_train['Value'].map(lambda x: pd.isna(x))
visualized_train.head()

Unnamed: 0_level_0,SiteId,Timestamp,ForecastId,Value
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
744519,1,2014-09-03 00:00:00,1,909655.5
7627564,1,2014-09-04 00:00:00,1,1748273.0
7034705,1,2014-09-05 00:00:00,1,
5995486,1,2014-09-06 00:00:00,1,
7326510,1,2014-09-07 00:00:00,1,


In [13]:
def process_time(df):
    
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df = df.set_index('Timestamp')
    
    df['min'] = df.index.minute
    df['hour'] = df.index.hour
    df['wday'] = df.index.dayofweek
    df['mday'] = df.index.day
    df['yday'] = df.index.dayofyear
    df['month'] = df.index.month
    df['year'] = df.index.year
    
    df['time'] = df['hour'] + (df['min'] / 60)
    df = df.drop(columns=['hour', 'min'])
        
    df['wday_sin'] = np.sin(2 * np.pi * df['wday'] / 6)
    df['wday_cos'] = np.cos(2 * np.pi * df['wday'] / 6)
    
    df['yday_sin'] = np.sin(2 * np.pi * df['yday'] / 365)
    df['yday_cos'] = np.cos(2 * np.pi * df['yday'] / 365)
    
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    df['time_sin'] = np.sin(2 * np.pi * df['time'] / 24)
    df['time_cos'] = np.cos(2 * np.pi * df['time'] / 24)
    
    df = df.reset_index(level=0)
    
    return df

processed_train = process_time(train)
processed_test = process_time(test)

In [14]:
processed_train.head()

Unnamed: 0,Timestamp,SiteId,ForecastId,Value,wday,mday,yday,month,year,time,wday_sin,wday_cos,yday_sin,yday_cos,month_sin,month_cos,time_sin,time_cos
0,2014-09-03,1,1,909655.5,2,3,246,9,2014,0.0,0.8660254,-0.5,-0.888057,-0.459733,-1.0,-1.83697e-16,0.0,1.0
1,2014-09-04,1,1,1748273.0,3,4,247,9,2014,0.0,1.224647e-16,-1.0,-0.895839,-0.444378,-1.0,-1.83697e-16,0.0,1.0
2,2014-09-05,1,1,,4,5,248,9,2014,0.0,-0.8660254,-0.5,-0.903356,-0.428892,-1.0,-1.83697e-16,0.0,1.0
3,2014-09-06,1,1,,5,6,249,9,2014,0.0,-0.8660254,0.5,-0.910605,-0.413279,-1.0,-1.83697e-16,0.0,1.0
4,2014-09-07,1,1,,6,7,250,9,2014,0.0,-2.449294e-16,1.0,-0.917584,-0.397543,-1.0,-1.83697e-16,0.0,1.0


In [15]:
def add_weather(df, weather):
    
    original_length = len(df)
    
    weather['Timestamp'] = pd.to_datetime(weather['Timestamp'])
    weather = weather.set_index('Timestamp')
    
    weather.index = weather.index.round(freq='15 min')
    weather = weather.reset_index(level=0)
    
    df = pd.merge(df, weather, how = 'left', on = ['year', 'month', 'mday', 'SiteId'])
    
    df = df.sort_values(['Timestamp', 'SiteId', 'Distance'])
    df = df.drop_duplicates(['Timestamp', 'SiteId'], keep='first')
    
    new_length = len(df)
    
    assert original_length == new_length, 'New Length must match original length'

    return df

processed_train = add_weather(processed_train, weather)
processed_test = add_weather(processed_test, weather)

KeyError: 'year'