# 등분산 검정
## 등분산 검정 종류
- __F-test__: 두 집단의 등분산 검정을 실시하며 각 집단은 정규분포를 따를 때 사용
- __Bartlett's test'__: 두 집단 이상의 등분산 검정을 실시하며 각 집단은 정규분포를 따를 때 사용
- __Levene's test__: 두 집단 이상의 등분산 검정을 실시하며 각 집단은 정규분포를 따를 필요 없음

## 가설
- 귀무가설(H0): 집단 간 분산은 서로 같음
- 대립가설(H1): 집단 간 분산은 서로 다름

## 주요 함수 및 메서드
- F-검정: __f.cdc()__
- Bartlett 검정: __bartlett()__
- Levene 검정: __levene()__

In [1]:
import pandas as pd
from scipy.stats import f
from scipy.stats import bartlett
from scipy.stats import levene

In [2]:
df = pd.read_csv("Data/financial_info_10k_persons.csv")
df.head(2)

Unnamed: 0,ID,is_attrited,Age,Gender,Dependent_cnt,Edu_level,Marital_status,Income,Card,Period_m,Total_rel_cnt,Inactive_last_12m,Contacts_cnt_last_12m,Credit_limit,Total_trans_amt,Total_trans_cnt
0,1,0,41,F,2,High School,Married,Less than $40K,Blue,36,6,2,2,4953.0,4183,67
1,2,0,38,M,0,High School,Single,$80K - $120K,Blue,29,3,3,2,5983.0,4141,65


In [3]:
ser_M = df.loc[df["Gender"] == "M", "Period_m"]
ser_F = df.loc[df["Gender"] == "F", "Period_m"]

In [4]:
F = ser_M.var() / ser_F.var()
F

1.040426345317289

In [6]:
result = f.cdf(F, dfd = len(ser_M)-1, dfn = len(ser_F))
result

0.9187893064992898

In [8]:
p = (1 - result) * 2 # p-value
p

0.1624213870014204

In [9]:
bartlett(ser_F, ser_M)

BartlettResult(statistic=1.9563015878266161, pvalue=0.16190940989253869)

In [10]:
stat, p = bartlett(ser_F, ser_M)
print(stat)
print(p)

1.9563015878266161
0.16190940989253869


In [11]:
levene(ser_F, ser_M)

LeveneResult(statistic=2.4640198991740747, pvalue=0.11651198398605053)

In [12]:
# ===============================================
# 1. 남성과 여성의 1회 평균 송금액의 분산을 비교 검정하고
# 그 결과의 검정 통계량은 얼마인가?
# F-검정 사용

# 정답: 1.7
# ===============================================
df = pd.read_csv("Data/financial_info_10k_persons.csv")
df.head(2)

Unnamed: 0,ID,is_attrited,Age,Gender,Dependent_cnt,Edu_level,Marital_status,Income,Card,Period_m,Total_rel_cnt,Inactive_last_12m,Contacts_cnt_last_12m,Credit_limit,Total_trans_amt,Total_trans_cnt
0,1,0,41,F,2,High School,Married,Less than $40K,Blue,36,6,2,2,4953.0,4183,67
1,2,0,38,M,0,High School,Single,$80K - $120K,Blue,29,3,3,2,5983.0,4141,65


In [13]:
df["trans_1_mean"] = df["Total_trans_amt"] / df["Total_trans_cnt"]
df.head(2)

Unnamed: 0,ID,is_attrited,Age,Gender,Dependent_cnt,Edu_level,Marital_status,Income,Card,Period_m,Total_rel_cnt,Inactive_last_12m,Contacts_cnt_last_12m,Credit_limit,Total_trans_amt,Total_trans_cnt,trans_1_mean
0,1,0,41,F,2,High School,Married,Less than $40K,Blue,36,6,2,2,4953.0,4183,67,62.432836
1,2,0,38,M,0,High School,Single,$80K - $120K,Blue,29,3,3,2,5983.0,4141,65,63.707692


In [14]:
samp_m = df.loc[df["Gender"] == "M", "trans_1_mean"]
samp_f = df.loc[df["Gender"] == "F", "trans_1_mean"]
F = samp_m.var() / samp_f.var()
print(F)

1.6665446172570928


In [16]:
# ======================================================
# 2. 50, 60, 70대의 1회 평균 송금액의 분산을 비교 검정하였을 때
# 산출되는 p-value는?
# Bartlett 검정 사용

# 정답: 0.004
# ======================================================
df = pd.read_csv("Data/financial_info_10k_persons.csv")
df["trans_1_mean"] = df["Total_trans_amt"] / df["Total_trans_cnt"]
df["Age_g"] = (df["Age"] // 10) * 10
df.head(2)

Unnamed: 0,ID,is_attrited,Age,Gender,Dependent_cnt,Edu_level,Marital_status,Income,Card,Period_m,Total_rel_cnt,Inactive_last_12m,Contacts_cnt_last_12m,Credit_limit,Total_trans_amt,Total_trans_cnt,trans_1_mean,Age_g
0,1,0,41,F,2,High School,Married,Less than $40K,Blue,36,6,2,2,4953.0,4183,67,62.432836,40
1,2,0,38,M,0,High School,Single,$80K - $120K,Blue,29,3,3,2,5983.0,4141,65,63.707692,30


In [17]:
bartlett(df.loc[df["Age_g"] == 50, "trans_1_mean"],
         df.loc[df["Age_g"] == 60, "trans_1_mean"],
         df.loc[df["Age_g"] == 70, "trans_1_mean"])

BartlettResult(statistic=10.989031521671865, pvalue=0.004109245841612487)

In [18]:
# ======================================================
# 3. 부양가족이 없는 남성을 대상으로 교육수준에 따른 1회 평균
# 송금액의 분산을 비교 검정하였을 때 산출되는 p-value? 0.507
# Levene 검정 사용
# ======================================================
df = pd.read_csv("Data/financial_info_10k_persons.csv")
df["trans_1_mean"] = df["Total_trans_amt"] / df["Total_trans_cnt"]
df_sub = df.loc[(df["Dependent_cnt"] == 0) & (df["Gender"] == "M"),]
df_sub.head(2)

Unnamed: 0,ID,is_attrited,Age,Gender,Dependent_cnt,Edu_level,Marital_status,Income,Card,Period_m,Total_rel_cnt,Inactive_last_12m,Contacts_cnt_last_12m,Credit_limit,Total_trans_amt,Total_trans_cnt,trans_1_mean
1,2,0,38,M,0,High School,Single,$80K - $120K,Blue,29,3,3,2,5983.0,4141,65,63.707692
26,27,0,32,M,0,College,Unknown,Less than $40K,Blue,36,3,3,4,3788.0,3975,65,61.153846


In [19]:
len(df_sub)

413

In [20]:
df_sub["Edu_level"].unique()

array(['High School', 'College', 'Unknown', 'Graduate', 'Uneducated',
       'Doctorate', 'Post-Graduate'], dtype=object)

In [21]:
df_sub["Edu_level"].nunique()

7

In [23]:
df_sub["Edu_level"].unique()[6]

'Post-Graduate'

In [25]:
levene(df_sub.loc[df_sub["Edu_level"] == df_sub["Edu_level"].unique()[0], "trans_1_mean"],
       df_sub.loc[df_sub["Edu_level"] == df_sub["Edu_level"].unique()[1], "trans_1_mean"],
       df_sub.loc[df_sub["Edu_level"] == df_sub["Edu_level"].unique()[2], "trans_1_mean"],
       df_sub.loc[df_sub["Edu_level"] == df_sub["Edu_level"].unique()[3], "trans_1_mean"],
       df_sub.loc[df_sub["Edu_level"] == df_sub["Edu_level"].unique()[4], "trans_1_mean"],
       df_sub.loc[df_sub["Edu_level"] == df_sub["Edu_level"].unique()[5], "trans_1_mean"],
       df_sub.loc[df_sub["Edu_level"] == df_sub["Edu_level"].unique()[6], "trans_1_mean"],)

LeveneResult(statistic=0.8832361640792544, pvalue=0.5070685402777693)