# 상관 분석
## 다양한 상관 분석
- 두 변수의 선형관계를 확인하기 위해 상관분석 실시
- 두 수치형 변수의 비교는 __Pearson's Correlation Coefficient__ 를 확인하며 그 외 다양한 상관계수 존재
- 상관계수는 -1~1 사이의 값을 가지며, 부호는 관계의 방향(양/음)을 나타내고 절댓값이 0에 가까울수록 선형관계가 약하고 1에 가까울수록 강함

In [1]:
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau

In [2]:
# Load the bike-sharing dataset and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="Data/bike.csv")
df.head(n=2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [3]:
# Pairwise Pearson correlation over the numeric columns only.
# The string-typed `datetime` column must be excluded explicitly: since
# pandas 2.0 `DataFrame.corr()` no longer drops non-numeric columns by
# default and raises ValueError when it meets one. `select_dtypes` works
# on both old and new pandas versions.
df.select_dtypes(include="number").corr()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
season,1.0,0.029368,-0.008126,0.008879,0.258689,0.264744,0.19061,-0.147121,0.096758,0.164011,0.163439
holiday,0.029368,1.0,-0.250491,-0.007074,0.000295,-0.005215,0.001929,0.008409,0.043799,-0.020956,-0.005393
workingday,-0.008126,-0.250491,1.0,0.033772,0.029966,0.02466,-0.01088,0.013373,-0.319111,0.11946,0.011594
weather,0.008879,-0.007074,0.033772,1.0,-0.055035,-0.055376,0.406244,0.007261,-0.135918,-0.10934,-0.128655
temp,0.258689,0.000295,0.029966,-0.055035,1.0,0.984948,-0.064949,-0.017852,0.467097,0.318571,0.394454
atemp,0.264744,-0.005215,0.02466,-0.055376,0.984948,1.0,-0.043536,-0.057473,0.462067,0.314635,0.389784
humidity,0.19061,0.001929,-0.01088,0.406244,-0.064949,-0.043536,1.0,-0.318607,-0.348187,-0.265458,-0.317371
windspeed,-0.147121,0.008409,0.013373,0.007261,-0.017852,-0.057473,-0.318607,1.0,0.092276,0.091052,0.101369
casual,0.096758,0.043799,-0.319111,-0.135918,0.467097,0.462067,-0.348187,0.092276,1.0,0.49725,0.690414
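registered,0.164011,-0.020956,0.11946,-0.10934,0.318571,0.314635,-0.265458,0.091052,0.49725,1.0,0.970948
count,0.163439,-0.005393,0.011594,-0.128655,0.394454,0.389784,-0.317371,0.101369,0.690414,0.970948,1.0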


In [4]:
# Pearson correlation matrix restricted to the three rental-count columns.
df.loc[:, ["casual", "registered", "count"]].corr(method="pearson")

Unnamed: 0,casual,registered,count
casual,1.0,0.49725,0.690414
registered,0.49725,1.0,0.970948
count,0.690414,0.970948,1.0


In [7]:
# Same three rental-count columns, using Spearman rank correlation.
df.loc[:, ["casual", "registered", "count"]].corr(method="spearman")

Unnamed: 0,casual,registered,count
casual,1.0,0.775785,0.847378
registered,0.775785,1.0,0.988901
count,0.847378,0.988901,1.0


In [9]:
# Pearson correlation test between casual and registered rentals;
# the result holds the coefficient and its p-value.
pearsonr(df.loc[:, "casual"], df.loc[:, "registered"])

(0.49724968508700823, 0.0)

In [10]:
# Unpack the test result into the coefficient (`stat`) and the p-value (`p`),
# then print each on its own line.
stat, p = pearsonr(df.loc[:, "casual"], df.loc[:, "registered"])
print(stat, p, sep="\n")

0.49724968508700823
0.0


In [11]:
# ====================================================
# Exercise 1. Among the pairwise correlations of temperature,
# feels-like temperature, relative humidity, and the bike rental count,
# what is the lowest correlation coefficient? Answer: -0.35
#
# Dataset: bike.csv
# Use the `casual` column as the bike rental count.
# ====================================================
df = pd.read_csv(filepath_or_buffer="Data/bike.csv")
df.head(n=2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [12]:
# Pearson correlation matrix of the four columns named in exercise 1.
df.loc[:, ["temp", "atemp", "humidity", "casual"]].corr(method="pearson")

Unnamed: 0,temp,atemp,humidity,casual
temp,1.0,0.984948,-0.064949,0.467097
atemp,0.984948,1.0,-0.043536,0.462067
humidity,-0.064949,-0.043536,1.0,-0.348187
casual,0.467097,0.462067,-0.348187,1.0


In [13]:
# Same matrix rounded to two decimals so the smallest coefficient is easy to read off.
df.loc[:, ["temp", "atemp", "humidity", "casual"]].corr(method="pearson").round(decimals=2)

Unnamed: 0,temp,atemp,humidity,casual
temp,1.0,0.98,-0.06,0.47
atemp,0.98,1.0,-0.04,0.46
humidity,-0.06,-0.04,1.0,-0.35
casual,0.47,0.46,-0.35,1.0


In [15]:
# ====================================================
# Exercise 2. Correlation between feels-like temperature and the
# bike rental count, computed per season.
# Which of the following coefficients is correct? Answer: 2
#
# 1) Spring: 0.444
# 2) Summer: 0.378
# 3) Fall:   0.382
# 4) Winter: 0.478
#
# Dataset: bike.csv
# Use the `casual` column as the bike rental count.
# ====================================================
df = pd.read_csv(filepath_or_buffer="Data/bike.csv")
# One 2x2 correlation matrix (atemp vs casual) per season value.
df.groupby("season")[["atemp", "casual"]].corr()

Unnamed: 0_level_0,Unnamed: 1_level_0,atemp,casual
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,atemp,1.0,0.478312
1,casual,0.478312,1.0
2,atemp,1.0,0.378122
2,casual,0.378122,1.0
3,atemp,1.0,0.381423
3,casual,0.381423,1.0
4,atemp,1.0,0.443751
4,casual,0.443751,1.0


In [16]:
# Per-season atemp/casual correlation, flattened so that the season and the
# row-variable name (auto-named `level_1` by reset_index) become ordinary columns.
df_corr = (
    df.groupby("season")[["atemp", "casual"]]
    .corr()
    .reset_index()
)
df_corr

Unnamed: 0,season,level_1,atemp,casual
0,1,atemp,1.0,0.478312
1,1,casual,0.478312,1.0
2,2,atemp,1.0,0.378122
3,2,casual,0.378122,1.0
4,3,atemp,1.0,0.381423
5,3,casual,0.381423,1.0
6,4,atemp,1.0,0.443751
7,4,casual,0.443751,1.0


In [17]:
# Keep only the `casual` row of each season's 2x2 matrix; its `atemp` value is
# the atemp-vs-casual coefficient. Selecting by the `level_1` label instead of
# testing `atemp < 1` avoids relying on the self-correlation being exactly 1.0
# in floating point, and would not drop a season whose true correlation is 1.
df_corr = df_corr.loc[df_corr["level_1"] == "casual"]
df_corr

Unnamed: 0,season,level_1,atemp,casual
1,1,casual,0.478312,1.0
3,2,casual,0.378122,1.0
5,3,casual,0.381423,1.0
7,4,casual,0.443751,1.0


In [18]:
# ====================================================
# Exercise 3. How the temperature-vs-rental correlation changes with weather.
# What is the absolute difference between the correlation coefficient on
# clear-weather days and on all other days?
#
# Answer: 0.025
#
# Dataset: bike.csv
# Use the `casual` column as the bike rental count.
# ====================================================
df = pd.read_csv(filepath_or_buffer="Data/bike.csv")
df.head(n=2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [19]:
# Flag rows with weather code 1 (treated as "clear" in this exercise) as 1,
# everything else as 0. An explicit cast states the bool -> int conversion
# instead of relying on `+ 0` arithmetic coercion.
df["is_sunny"] = (df["weather"] == 1).astype(int)
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,is_sunny
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,1


In [21]:
# One temp-vs-casual correlation matrix per `is_sunny` group (0 and 1);
# the result is indexed by (is_sunny, variable name).
df_corr = df[["is_sunny", "temp", "casual"]].groupby("is_sunny").corr()
df_corr

Unnamed: 0_level_0,Unnamed: 1_level_0,temp,casual
is_sunny,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,temp,1.0,0.446361
0,casual,0.446361,1.0
1,temp,1.0,0.471053
1,casual,0.471053,1.0


In [23]:
# Look up the temp-vs-casual coefficient of each group by label rather than by
# position, so the result does not silently depend on row/column ordering.
r_not_sunny = df_corr.loc[(0, "casual"), "temp"]
r_sunny = df_corr.loc[(1, "casual"), "temp"]
# Absolute difference between the two coefficients, rounded to 3 decimals.
round(abs(r_not_sunny - r_sunny), 3)

0.025