## 서울 자전거 NumPy 실습 (상관계수)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the CSV into a plain 2-D NumPy array, skipping the header row.
# NOTE(review): genfromtxt defaults to a float dtype, so the text columns
# (Date, Seasons, Holiday, Functioning Day) presumably parse as NaN - confirm.
seoul_bike = np.genfromtxt('../0_data/SeoulBikeData.csv', delimiter=',', skip_header=1)
seoul_bike.shape

(8760, 14)

In [3]:
# Read the same file with pandas to get the column names and dtypes for reference.
df = pd.read_csv('../0_data/SeoulBikeData.csv')
df.head()  # preview the first rows

Unnamed: 0,Date,Rented Bike Count,Hour,Temperature,Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature,Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm),Seasons,Holiday,Functioning Day
0,01/12/2017,254,0,-5.2,37,2.2,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
1,01/12/2017,204,1,-5.5,38,0.8,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
2,01/12/2017,173,2,-6.0,39,1.0,2000,-17.7,0.0,0.0,0.0,Winter,No Holiday,Yes
3,01/12/2017,107,3,-6.2,40,0.9,2000,-17.6,0.0,0.0,0.0,Winter,No Holiday,Yes
4,01/12/2017,78,4,-6.0,36,2.3,2000,-18.6,0.0,0.0,0.0,Winter,No Holiday,Yes


In [4]:
# Column names, non-null counts and dtypes of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     8760 non-null   object 
 1   Rented Bike Count        8760 non-null   int64  
 2   Hour                     8760 non-null   int64  
 3   Temperature              8760 non-null   float64
 4   Humidity(%)              8760 non-null   int64  
 5   Wind speed (m/s)         8760 non-null   float64
 6   Visibility (10m)         8760 non-null   int64  
 7   Dew point temperature    8760 non-null   float64
 8   Solar Radiation (MJ/m2)  8760 non-null   float64
 9   Rainfall(mm)             8760 non-null   float64
 10  Snowfall (cm)            8760 non-null   float64
 11  Seasons                  8760 non-null   object 
 12  Holiday                  8760 non-null   object 
 13  Functioning Day          8760 non-null   object 
dtypes: float64(6), int64(4), object(4)

In [5]:
# Descriptive statistics for the numeric columns (used below as a reference
# for the values computed by hand with NumPy).
df.describe()

Unnamed: 0,Rented Bike Count,Hour,Temperature,Humidity(%),Wind speed (m/s),Visibility (10m),Dew point temperature,Solar Radiation (MJ/m2),Rainfall(mm),Snowfall (cm)
count,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0,8760.0
mean,704.602055,11.5,12.882922,58.226256,1.724909,1436.825799,4.073813,0.569111,0.148687,0.075068
std,644.997468,6.922582,11.944825,20.362413,1.0363,608.298712,13.060369,0.868746,1.128193,0.436746
min,0.0,0.0,-17.8,0.0,0.0,27.0,-30.6,0.0,0.0,0.0
25%,191.0,5.75,3.5,42.0,0.9,940.0,-4.7,0.0,0.0,0.0
50%,504.5,11.5,13.7,57.0,1.5,1698.0,5.1,0.01,0.0,0.0
75%,1065.25,17.25,22.5,74.0,2.3,2000.0,14.8,0.93,0.0,0.0
max,3556.0,23.0,39.4,98.0,7.4,2000.0,27.2,3.52,35.0,8.8


- 넘파이를 활용하여 데이터 슬라이싱

In [9]:
# Keep only the 10 numeric columns: drop column 0 (Date) and the last three
# (Seasons, Holiday, Functioning Day).
# NOTE: seoul_bike is rebound to its own slice, so re-running this cell would
# slice the already-trimmed array again and drop more columns.
seoul_bike = seoul_bike[:, 1:-3]
seoul_bike.shape

(8760, 10)

- 간단한 통계계산

In [11]:
# Select the 'Rented Bike Count' column (index 0 of the sliced array)
rented_bike_cnt = seoul_bike[:,0]

In [12]:
# Number of observations in the column.
len(rented_bike_cnt)

8760

In [13]:
# Arithmetic mean; agrees with the `mean` row of df.describe().
rented_bike_cnt.mean()

704.6020547945205

In [14]:
# NumPy's std() defaults to ddof=0 (population standard deviation), so this is
# slightly smaller than the `std` row of df.describe(), which uses ddof=1.
# Use rented_bike_cnt.std(ddof=1) to reproduce the pandas value.
rented_bike_cnt.std()

644.9606517645436

In [15]:
# Minimum; agrees with the `min` row of df.describe().
rented_bike_cnt.min()

0.0

In [17]:
np.quantile(rented_bike_cnt, 0.5) # median; equivalent to np.median(rented_bike_cnt)

504.5

In [18]:
np.quantile(rented_bike_cnt, 0.25) # first quartile (25%)

191.0

In [19]:
np.quantile(rented_bike_cnt, 0.75) # third quartile (75%)

1065.25

In [20]:
# Maximum; agrees with the `max` row of df.describe().
rented_bike_cnt.max()

3556.0

- 상관계수 계산

In [27]:
# Column names for the sliced NumPy array: the same 1:-3 slice is applied to
# the DataFrame's column labels so that index i names seoul_bike[:, i].
columns = list(df.columns)[1:-3]
columns

['Rented Bike Count',
 'Hour',
 'Temperature',
 'Humidity(%)',
 'Wind speed (m/s)',
 'Visibility (10m)',
 'Dew point temperature',
 'Solar Radiation (MJ/m2)',
 'Rainfall(mm)',
 'Snowfall (cm)']

In [28]:
# 'Temperature' is at index 2 of the sliced array (see `columns`).
temperature = seoul_bike[:,2]

In [29]:
# Mean temperature; agrees with the `mean` row of df.describe().
temperature.mean()

12.882922374429223

In [30]:
# First ten temperature values, to check the column was picked correctly.
temperature[:10]

array([-5.2, -5.5, -6. , -6.2, -6. , -6.4, -6.6, -7.4, -7.6, -6.5])

In [32]:
np.corrcoef(temperature, rented_bike_cnt) # correlation between temperature and bike rental count
# The diagonal (a variable with itself) is always 1; a value of 0.7 or higher is treated as a strong positive correlation

array([[1.        , 0.53855815],
       [0.53855815, 1.        ]])

In [33]:
# NOTE: with the default rowvar=True, np.corrcoef treats each ROW as a
# variable, so this correlates the 8760 observations with each other
# (an 8760 x 8760 matrix) rather than the 10 columns.
# Pass rowvar=False to get the 10 x 10 column-wise correlation matrix.
np.corrcoef(seoul_bike)

array([[1.        , 0.99968215, 0.99916291, ..., 0.97557956, 0.9689233 ,
        0.98452257],
       [0.99968215, 1.        , 0.99987655, ..., 0.96974951, 0.96239886,
        0.97981303],
       [0.99916291, 0.99987655, 1.        , ..., 0.96580448, 0.95802417,
        0.97656372],
       ...,
       [0.97557956, 0.96974951, 0.96580448, ..., 1.        , 0.99959118,
        0.99896484],
       [0.9689233 , 0.96239886, 0.95802417, ..., 0.99959118, 1.        ,
        0.99726923],
       [0.98452257, 0.97981303, 0.97656372, ..., 0.99896484, 0.99726923,
        1.        ]])

In [35]:
# First observation of each series; agrees with row 0 of df.head().
temperature[0], rented_bike_cnt[0]

(-5.2, 254.0)

In [39]:
# Wrapping the 1-D array in a list gives shape (1, 8760); transposing it
# yields a column of shape (8760, 1).
np.array([temperature]).transpose()

array([[-5.2],
       [-5.5],
       [-6. ],
       ...,
       [ 2.6],
       [ 2.1],
       [ 1.9]])

In [43]:
# One day (24 hours) of data per row
temperature.reshape(-1,24).shape # -1: let NumPy infer this dimension from the array size

(365, 24)

In [45]:
# The original column is still 1-D; reshape() above returned a new view/array.
temperature.shape

(8760,)

In [38]:
temperature.reshape(-1,1) # -1 is inferred as 8760 here, giving shape (8760, 1)

array([[-5.2],
       [-5.5],
       [-6. ],
       ...,
       [ 2.6],
       [ 2.1],
       [ 1.9]])

In [59]:
# Turn each 1-D series into an (8760, 1) column, then join the two columns
# side by side: column 0 = temperature, column 1 = rented bike count.
temperature_col = temperature.reshape(-1, 1)
rented_col = rented_bike_cnt.reshape(-1, 1)
tmp_bike1 = np.hstack([temperature_col, rented_col])

In [55]:
# Equivalent way to build the same two-column array:
# stack the two series as rows (2, 8760), then transpose to (8760, 2).
np.vstack([temperature, rented_bike_cnt]).transpose()

array([[ -5.2, 254. ],
       [ -5.5, 204. ],
       [ -6. , 173. ],
       ...,
       [  2.6, 694. ],
       [  2.1, 712. ],
       [  1.9, 584. ]])

In [69]:
# Keep only the rows whose temperature (column 0) is below 28
is_below_28 = tmp_bike1[:, 0] < 28
tmp_bike_part = tmp_bike1[is_below_28, :]

In [62]:
# Rows remaining after the temperature filter (still two columns).
tmp_bike_part.shape

(7869, 2)

In [65]:
# Sanity check: the highest temperature kept must be below 28.
tmp_bike_part[:,0].max()

27.9

In [71]:
# rowvar=False: each COLUMN is a variable and each row an observation,
# so the result is the 2 x 2 temperature / rental-count correlation matrix.
np.corrcoef(tmp_bike_part, rowvar=False)

array([[1.        , 0.53673761],
       [0.53673761, 1.        ]])

In [74]:
# Rows whose temperature (column 0) is above 34, and the column-wise
# correlation between temperature and rented bike count on that subset
is_above_34 = tmp_bike1[:, 0] > 34
tmp_bike_part2 = tmp_bike1[is_above_34, :]
np.corrcoef(tmp_bike_part2, rowvar=False)

array([[ 1.        , -0.12655309],
       [-0.12655309,  1.        ]])

In [1]:
# Rented Bike Count 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# [['Hour' '0.41025729132248584']
#  ['Temperature' '0.5385581530139789']
#  ['Dew point temperature' '0.3797881212449726']]
# --------------------------------------------------
# Hour 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# [['Rented Bike Count' '0.4102572913224858']]
# --------------------------------------------------
# Temperature 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# [['Rented Bike Count' '0.5385581530139789']
#  ['Dew point temperature' '0.9127982187579918']
#  ['Solar Radiation (MJ/m2)' '0.353505470147949']]
# --------------------------------------------------
# Humidity(%) 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# [['Dew point temperature' '0.5368944942226763']]
# --------------------------------------------------
# Wind speed (m/s) 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# [['Solar Radiation (MJ/m2)' '0.33227424599890565']]
# --------------------------------------------------
# Visibility (10m) 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# []
# --------------------------------------------------
# Dew point temperature 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# [['Rented Bike Count' '0.3797881212449726']
#  ['Temperature' '0.9127982187579917']
#  ['Humidity(%)' '0.5368944942226764']]
# --------------------------------------------------
# Solar Radiation (MJ/m2) 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# [['Temperature' '0.353505470147949']
#  ['Wind speed (m/s)' '0.33227424599890565']]
# --------------------------------------------------
# Rainfall(mm) 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# []
# --------------------------------------------------
# Snowfall (cm) 와 양의 상관관계가 있는 변수들과 상관관계값 (0.3 보다 큰 값)
# []
# --------------------------------------------------

In [None]:
# Rented Bike Count 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# []
# --------------------------------------------------
# Hour 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# []
# --------------------------------------------------
# Temperature 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# []
# --------------------------------------------------
# Humidity(%) 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# [['Wind speed (m/s)' '-0.3366830416913436']
#  ['Visibility (10m)' '-0.5430903446558317']
#  ['Solar Radiation (MJ/m2)' '-0.4619187969811591']]
# --------------------------------------------------
# Wind speed (m/s) 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# [['Humidity(%)' '-0.3366830416913436']]
# --------------------------------------------------
# Visibility (10m) 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# [['Humidity(%)' '-0.5430903446558316']]
# --------------------------------------------------
# Dew point temperature 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# []
# --------------------------------------------------
# Solar Radiation (MJ/m2) 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# [['Humidity(%)' '-0.4619187969811592']]
# --------------------------------------------------
# Rainfall(mm) 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# []
# --------------------------------------------------
# Snowfall (cm) 와 음의 상관관계가 있는 변수들과 상관관계값 (-0.3 보다 작은 값)
# []
# --------------------------------------------------