In [14]:
import pandas as pd
import numpy as np
from datetime import timedelta
from glob import glob
import plotly.express as px

pd.options.plotting.backend = 'plotly'

## plotly.io를 import 한 후 renderers 기본값을 꼭 "notebook_connected" 로 설정해주시기 바랍니다.
import plotly.io as pio
pio.renderers.default = "notebook_connected"

In [15]:
path = 'data/'
files = sorted(glob(path+'*.csv'))

train = pd.read_csv(files[2], header=0, encoding='cp949') 
test = pd.read_csv(files[1], header=0, encoding='cp949')
sample_submission = pd.read_csv(files[0], header=0) 

In [16]:
train.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [17]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10080 entries, 0 to 10079
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   num           10080 non-null  int64  
 1   date_time     10080 non-null  object 
 2   기온(°C)        3360 non-null   float64
 3   풍속(m/s)       3360 non-null   float64
 4   습도(%)         3360 non-null   float64
 5   강수량(mm, 6시간)  1680 non-null   float64
 6   일조(hr, 3시간)   3360 non-null   float64
 7   비전기냉방설비운영     2296 non-null   float64
 8   태양광보유         1624 non-null   float64
dtypes: float64(7), int64(1), object(1)
memory usage: 708.9+ KB


In [18]:
test.head(10)

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,,,,,,,
2,1,2020-08-25 02,,,,,,,
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,,
4,1,2020-08-25 04,,,,,,,
5,1,2020-08-25 05,,,,,,,
6,1,2020-08-25 06,26.1,1.3,83.0,0.0,0.0,,
7,1,2020-08-25 07,,,,,,,
8,1,2020-08-25 08,,,,,,,
9,1,2020-08-25 09,28.8,1.2,77.0,,1.1,,


In [19]:
test.describe()

Unnamed: 0,num,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
count,10080.0,3360.0,3360.0,3360.0,1680.0,3360.0,2296.0,1624.0
mean,30.5,27.805804,2.440327,81.963095,2.191845,0.612917,1.0,1.0
std,17.318961,2.378122,1.835905,11.900843,6.786772,0.929587,0.0,0.0
min,1.0,22.1,0.0,44.0,0.0,0.0,1.0,1.0
25%,15.75,26.0,1.1,74.0,0.0,0.0,1.0,1.0
50%,30.5,27.5,2.1,83.0,0.0,0.0,1.0,1.0
75%,45.25,29.2,3.2,92.0,0.9,1.0,1.0,1.0
max,60.0,35.4,22.5,100.0,83.5,3.0,1.0,1.0


In [20]:
test.isnull().sum()

num                0
date_time          0
기온(°C)          6720
풍속(m/s)         6720
습도(%)           6720
강수량(mm, 6시간)    8400
일조(hr, 3시간)     6720
비전기냉방설비운영       7784
태양광보유           8456
dtype: int64

In [21]:
fig = px.bar(x=test.columns, y=test.isnull().sum(), title='Null values')
fig.show()

In [22]:
train['date'] = train['date_time'].apply(lambda x: x.split()[0])
train['date_time'] = train['date_time'].apply(lambda x: x.split()[1])
# train['date_time'] = train['date_time'].str.rjust(8,'0') # 한자릿수 시간 앞에 0 추가 ex) 3시 -> 03시

# 24시를 00시로 바꿔주기
train.loc[train['date_time']=='24:00:00','date_time'] = '00:00:00'
train['date_time'] = train['date'] + ' ' + train['date_time']
train['date_time'] = pd.to_datetime(train['date_time'])
train.loc[train['date_time'].dt.hour==0,'date_time'] += timedelta(days=1)

In [23]:
train.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,date
0,1,2020-06-02 00:00:00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,2020-06-01
1,1,2020-06-01 01:00:00,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,2020-06-01
2,1,2020-06-01 02:00:00,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2020-06-01
3,1,2020-06-01 03:00:00,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,2020-06-01
4,1,2020-06-01 04:00:00,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,2020-06-01


In [24]:
# time column을 분해하여 의미있을 것 같은 month와 hour로 시각화
train['month'] = train['date_time'].dt.month
train['hour'] = train['date_time'].dt.hour

In [26]:
train.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,date,month,hour
0,1,2020-06-02 00:00:00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,2020-06-01,6,0
1,1,2020-06-01 01:00:00,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,2020-06-01,6,1
2,1,2020-06-01 02:00:00,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2020-06-01,6,2
3,1,2020-06-01 03:00:00,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,2020-06-01,6,3
4,1,2020-06-01 04:00:00,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,2020-06-01,6,4


In [27]:
mean_month = train.groupby('month').mean()
fig = px.bar(mean_month, x=mean_month.index, y=['전력사용량(kWh)'])
fig.show()

In [29]:
mean_month = train.groupby('month').mean()
fig = px.bar(mean_month, x=mean_month.index, y=['기온(°C)'])
fig.show()

In [30]:
mean_month = train.groupby('month').mean()
fig = px.bar(mean_month, x=mean_month.index, y=['풍속(m/s)'])
fig.show()

In [31]:
mean_month = train.groupby('month').mean()
fig = px.bar(mean_month, x=mean_month.index, y=['습도(%)'])
fig.show()

In [32]:
mean_month = train.groupby('month').mean()
fig = px.bar(mean_month, x=mean_month.index, y=['강수량(mm)'])
fig.show()

In [33]:
mean_month = train.groupby('month').mean()
fig = px.bar(mean_month, x=mean_month.index, y=['일조(hr)'])
fig.show()

In [34]:
# 시간별

mean_hour = train.groupby('hour').mean()
fig = px.bar(mean_hour, x=mean_hour.index, y=['전력사용량(kWh)'])
fig.show()

In [35]:
mean_hour = train.groupby('hour').mean()
fig = px.bar(mean_hour, x=mean_hour.index, y=['기온(°C)'])
fig.show()

In [36]:
mean_hour = train.groupby('hour').mean()
fig = px.bar(mean_hour, x=mean_hour.index, y=['풍속(m/s)'])
fig.show()

In [37]:
mean_hour = train.groupby('hour').mean()
fig = px.bar(mean_hour, x=mean_hour.index, y=['습도(%)'])
fig.show()

In [38]:
mean_hour = train.groupby('hour').mean()
fig = px.bar(mean_hour, x=mean_hour.index, y=['강수량(mm)'])
fig.show()

In [39]:
mean_hour = train.groupby('hour').mean()
fig = px.bar(mean_hour, x=mean_hour.index, y=['일조(hr)'])
fig.show()

In [41]:
fig = px.imshow(train.corr())
fig.show()

In [42]:
fig = px.imshow(test.corr())
fig.show()