In [1]:
import pandas as pd
from scipy.stats import f_oneway
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# 일원 분산 분석 - f_oneway()

In [2]:
# Load the diamonds CSV and preview its first two rows.
diamonds_csv = "실기 Python 실습파일 (2)/diamonds.csv"
df = pd.read_csv(diamonds_csv)
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [3]:
# Show the distinct values of the color column (these become the ANOVA groups).
color_column = df["color"]
color_column.unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [6]:
# One-way ANOVA on price across three color groups: E, I and J.
# Each list element is the price Series of one color.
price_by_color = [
    df.loc[df["color"] == grade, "price"]
    for grade in ("E", "I", "J")
]
stat, p = f_oneway(*price_by_color)

# Report the F statistic and the p-value, both rounded to three decimals.
print(round(stat, 3))
print(round(p, 3))

621.846
0.0


In [7]:
# Mean price per color, returned as a two-column frame (color, price).
mean_price_by_color = df.groupby("color")["price"].mean()
mean_price_by_color.reset_index()

Unnamed: 0,color,price
0,D,3169.954096
1,E,3076.752475
2,F,3724.886397
3,G,3999.135671
4,H,4486.669196
5,I,5091.874954
6,J,5323.81802


# 일원 분산 분석 - ols(), anova_lm()

In [10]:
# Load the bike CSV and preview its first two rows.
bike_csv = "실기 Python 실습파일 (2)/bike.csv"
bike = pd.read_csv(bike_csv)
bike.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [12]:
# Baseline fit without C(): season is stored as int64, so the formula treats it
# as a single numeric regressor (one degree of freedom in the ANOVA table).
season_numeric_formula = "temp ~ season"
model = ols(season_numeric_formula, bike).fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
season,1.0,44221.657301,44221.657301,780.591754,5.87956e-166
Residual,10884.0,616594.417651,56.651453,,


In [13]:
# Same response, but C() marks season as a categorical factor, so the ANOVA
# compares the mean temp of the season groups.
categorical_spec = ols(data=bike, formula="temp ~ C(season)")
model = categorical_spec.fit()
anova_lm(model)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(season),3.0,412885.270005,137628.423335,6040.687453,0.0
Residual,10882.0,247930.804947,22.78357,,


# 사후검정

분산분석만으로는 집단 간 평균에 차이가 있다는 사실만 알 수 있고 어떤 집단 쌍에서 평균이 다른지는 알 수 없기 때문에, 사후검정으로 집단 쌍별 차이를 확인한다.

In [14]:
# Tukey HSD post-hoc test: pairwise comparison of mean temp between seasons.
result = pairwise_tukeyhsd(endog=bike["temp"], groups=bike["season"])
print(result)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj  lower    upper   reject
-----------------------------------------------------
     1      2   10.293 0.001   9.9598  10.6262   True
     1      3  16.2586 0.001  15.9254  16.5918   True
     1      4   4.1187 0.001   3.7856   4.4519   True
     2      3   5.9656 0.001   5.6339   6.2974   True
     2      4  -6.1742 0.001   -6.506  -5.8425   True
     3      4 -12.1399 0.001 -12.4716 -11.8081   True
-----------------------------------------------------


Q1

Q2

In [15]:
# Re-read bike.csv for this exercise and preview the first two rows.
bike = pd.read_csv(filepath_or_buffer="실기 Python 실습파일 (2)/bike.csv")
bike.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [16]:
# Print each column's name, non-null count and dtype for the bike frame.
bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [17]:
# C() marks season as a categorical factor rather than a numeric regressor.
formula = "temp ~ C(season)"

# Fit on `bike`, the frame loaded for this exercise. The original passed `df`,
# whose only visible assignment is the diamonds frame (no temp/season columns),
# so the cell depended on leftover kernel state and fails on Restart & Run All.
lm = ols(formula, bike).fit()
anova_lm(lm)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(season),3.0,412885.270005,137628.423335,6040.687453,0.0
Residual,10882.0,247930.804947,22.78357,,


Q3.

In [18]:
# Read bike.csv again for Q3 and show the first two rows.
q3_source = "실기 Python 실습파일 (2)/bike.csv"
bike = pd.read_csv(q3_source)
bike.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [19]:
# Start from a copy of the freshly loaded bike frame. The original mutated a
# leftover `df` from kernel state; its only visible assignment is the diamonds
# frame, which has no datetime column. Copying also keeps `bike` unmodified and
# makes this cell safe to re-run.
df = bike.copy()

# Parse the datetime strings, then derive the weekday number
# (pandas convention: Monday=0 ... Sunday=6).
df["datetime"] = pd.to_datetime(df["datetime"])
df["wday"] = df["datetime"].dt.weekday
df.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,wday
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,5


In [20]:
# Tukey HSD post-hoc test: pairwise comparison of mean registered count
# between weekday groups.
wday_tukey = pairwise_tukeyhsd(df["registered"], df["wday"])
print(wday_tukey)

 Multiple Comparison of Means - Tukey HSD, FWER=0.05  
group1 group2 meandiff p-adj   lower    upper   reject
------------------------------------------------------
     0      1   6.1979    0.9  -9.7188  22.1146  False
     0      2    5.343    0.9 -10.5427  21.2287  False
     0      3  12.7424 0.2132  -3.1383   28.623  False
     0      4   6.2956    0.9  -9.6471  22.2384  False
     0      5 -27.5063  0.001 -43.3091 -11.7036   True
     0      6 -36.7583  0.001 -52.5734 -20.9431   True
     1      2  -0.8549    0.9 -16.7716  15.0618  False
     1      3   6.5445 0.8863  -9.3671  22.4561  False
     1      4   0.0977    0.9 -15.8759  16.0713  False
     1      5 -33.7042  0.001 -49.5381 -17.8704   True
     1      6 -42.9562  0.001 -58.8024 -27.1099   True
     2      3   7.3994 0.7916  -8.4813    23.28  False
     2      4   0.9526    0.9 -14.9901  16.8954  False
     2      5 -32.8493  0.001 -48.6521 -17.0466   True
     2      6 -42.1013  0.001 -57.9164 -26.2861   True
     3    