In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import levene
from scipy.stats import shapiro
from scipy.stats import kruskal
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [2]:
#집단별 각 변수의 차이가 유의미한가?

In [3]:
lst_name = ['수원','용인','부천','시흥','광명','안산','안양','포천','하남','화성']
lst_path = [f"클러스터링_데이터/클러스터링_{name}.csv" for name in lst_name]

In [4]:
df= pd.read_csv(lst_path[0])

In [5]:
df_anova=df.iloc[:,2:7]
df_anova.head()

Unnamed: 0,입출차비율,소매/유통,의료/건강,70세_이상_비율,kmeans_cluster
0,0.26,56591.0,58389.0,0.19,2
1,0.24,102121.0,64102.0,0.19,2
2,0.24,56630.0,111038.0,0.06,2
3,0.22,148486.0,64920.0,0.07,2
4,0.26,36529.0,13262.0,0.13,1


# 등분산 검정

In [6]:
#등분산성 확인 - 레빈 검증
levene_act = levene(df_anova.입출차비율[df_anova.kmeans_cluster==1],
             df_anova.입출차비율[df_anova.kmeans_cluster==2],
             df_anova.입출차비율[df_anova.kmeans_cluster==0])
levene_dict = {'statistic':np.round(levene_act[0],4),'pvalue':np.round(levene_act[1],4)}
levene_dict

{'statistic': 0.7023, 'pvalue': 0.5013}

# 정규성 검정

In [7]:
#정규성 검정

shapiro_0_act = shapiro(df_anova.입출차비율[df_anova.kmeans_cluster==0])
shapiro_1_act = shapiro(df_anova.입출차비율[df_anova.kmeans_cluster==1])
shapiro_2_act = shapiro(df_anova.입출차비율[df_anova.kmeans_cluster==2])

shapiro_0_dict = {'statistic':np.round(shapiro_0_act[0],4),'pvalue':np.round(shapiro_0_act[1],4)}
shapiro_1_dict = {'statistic':np.round(shapiro_1_act[0],4),'pvalue':np.round(shapiro_1_act[1],4)}
shapiro_2_dict = {'statistic':np.round(shapiro_2_act[0],4),'pvalue':np.round(shapiro_2_act[1],4)}

shapiro_0_dict, shapiro_1_dict, shapiro_2_dict

({'statistic': 0.8327, 'pvalue': 0.036},
 {'statistic': 0.9174, 'pvalue': 0.0229},
 {'statistic': 0.9447, 'pvalue': 0.683})

# 정규성 가정이 성립하지 않을때(비모수검정:Kruskal-Wallis)

In [8]:
#집단별 '입출차비율'의 차이가 유의미한가
kw_car = kruskal(df_anova.loc[df_anova.kmeans_cluster==2,'입출차비율'],
              df_anova.loc[df_anova.kmeans_cluster==1,'입출차비율'],
              df_anova.loc[df_anova.kmeans_cluster==0,'입출차비율'])
         
kw_car_dict = {'statistic':np.round(kw_car[0],4),'pvalue':np.round(kw_car[1],4)}
kw_car_dict

{'statistic': 18.9426, 'pvalue': 0.0001}

In [9]:
#집단별 '70세_이상_비율'의 차이가 유의미한가
kw_elder = kruskal(df_anova.loc[df_anova.kmeans_cluster==2,'70세_이상_비율'],
              df_anova.loc[df_anova.kmeans_cluster==1,'70세_이상_비율'],
              df_anova.loc[df_anova.kmeans_cluster==0,'70세_이상_비율'])
         
kw_elder_dict = {'statistic':np.round(kw_elder[0],4),'pvalue':np.round(kw_elder[1],4)}
kw_elder_dict

{'statistic': 3.0951, 'pvalue': 0.2128}

In [10]:
#집단별 '의료/건강'의 차이가 유의미한가
kw_hos = kruskal(df_anova.loc[df_anova.kmeans_cluster==2,'의료/건강'],
              df_anova.loc[df_anova.kmeans_cluster==1,'의료/건강'],
              df_anova.loc[df_anova.kmeans_cluster==0,'의료/건강'])
         
kw_hos_dict = {'statistic':np.round(kw_hos[0],4),'pvalue':np.round(kw_hos[1],4)}
kw_hos_dict

{'statistic': 11.0941, 'pvalue': 0.0039}

In [11]:
#집단별 '소매/유통'의 차이가 유의미한가
kw_mart = kruskal(df_anova.loc[df_anova.kmeans_cluster==2,'소매/유통'],
              df_anova.loc[df_anova.kmeans_cluster==1,'소매/유통'],
              df_anova.loc[df_anova.kmeans_cluster==0,'소매/유통'])
         
kw_mart_dict = {'statistic':np.round(kw_mart[0],4),'pvalue':np.round(kw_mart[1],4)}
kw_mart_dict

{'statistic': 5.4599, 'pvalue': 0.0652}

In [12]:
df_stat = pd.DataFrame([levene_dict,shapiro_0_dict, shapiro_1_dict, shapiro_2_dict,kw_car_dict,kw_elder_dict,kw_hos_dict,kw_mart_dict],index = ['등분산성','정규성_0','정규성_1','정규성_2','입출차비율','70세_이상_비율','의료/건강','소매/유통'])

In [13]:
df_stat

Unnamed: 0,statistic,pvalue
등분산성,0.7023,0.5013
정규성_0,0.8327,0.036
정규성_1,0.9174,0.0229
정규성_2,0.9447,0.683
입출차비율,18.9426,0.0001
70세_이상_비율,3.0951,0.2128
의료/건강,11.0941,0.0039
소매/유통,5.4599,0.0652


# 전체결합해서 내보내기

In [16]:
def anova_all(path,name):
    
    # 파일 불러들이기
    df= pd.read_csv(path)
    df_anova=df.iloc[:,2:7]
    
    # 등분산성 검정
    levene_act = levene(df_anova.입출차비율[df_anova.kmeans_cluster==1],
             df_anova.입출차비율[df_anova.kmeans_cluster==2],
             df_anova.입출차비율[df_anova.kmeans_cluster==0])
    levene_dict = {'statistic':np.round(levene_act[0],4),'pvalue':np.round(levene_act[1],4)}
    
    # 정규성 검정
    try:
        shapiro_0_act = shapiro(df_anova.입출차비율[df_anova.kmeans_cluster==0])
        shapiro_1_act = shapiro(df_anova.입출차비율[df_anova.kmeans_cluster==1])
        shapiro_2_act = shapiro(df_anova.입출차비율[df_anova.kmeans_cluster==2])
        shapiro_0_dict = {'statistic':np.round(shapiro_0_act[0],4),'pvalue':np.round(shapiro_0_act[1],4)}
        shapiro_1_dict = {'statistic':np.round(shapiro_1_act[0],4),'pvalue':np.round(shapiro_1_act[1],4)}
        shapiro_2_dict = {'statistic':np.round(shapiro_2_act[0],4),'pvalue':np.round(shapiro_2_act[1],4)}
    except ValueError: # 개수 부족으로 정규성 검정 불가능할 시 -1로 입력
        shapiro_0_dict = {'statistic':-1,'pvalue':-1}
        shapiro_1_dict = {'statistic':-1,'pvalue':-1}
        shapiro_2_dict = {'statistic':-1,'pvalue':-1}
    # 입출차비율,집단간차이 분석
    kw_car = kruskal(df_anova.loc[df_anova.kmeans_cluster==2,'입출차비율'],
              df_anova.loc[df_anova.kmeans_cluster==1,'입출차비율'],
              df_anova.loc[df_anova.kmeans_cluster==0,'입출차비율'])
    kw_car_dict = {'statistic':np.round(kw_car[0],4),'pvalue':np.round(kw_car[1],4)}
    
    # 70세이상비율, 집단간차이 분석
    kw_elder = kruskal(df_anova.loc[df_anova.kmeans_cluster==2,'70세_이상_비율'],
              df_anova.loc[df_anova.kmeans_cluster==1,'70세_이상_비율'],
              df_anova.loc[df_anova.kmeans_cluster==0,'70세_이상_비율'])
    kw_elder_dict = {'statistic':np.round(kw_elder[0],4),'pvalue':np.round(kw_elder[1],4)}
    
    # 의료/건강, 집단 간 차이 분석
    kw_hos = kruskal(df_anova.loc[df_anova.kmeans_cluster==2,'의료/건강'],
              df_anova.loc[df_anova.kmeans_cluster==1,'의료/건강'],
              df_anova.loc[df_anova.kmeans_cluster==0,'의료/건강'])
    kw_hos_dict = {'statistic':np.round(kw_hos[0],4),'pvalue':np.round(kw_hos[1],4)}
    
    # 소매/유통, 집단 간 차이 분석
    kw_mart = kruskal(df_anova.loc[df_anova.kmeans_cluster==2,'소매/유통'],
              df_anova.loc[df_anova.kmeans_cluster==1,'소매/유통'],
              df_anova.loc[df_anova.kmeans_cluster==0,'소매/유통'])
    kw_mart_dict = {'statistic':np.round(kw_mart[0],4),'pvalue':np.round(kw_mart[1],4)}

    # 데이터 통합
    df_stat = pd.DataFrame([levene_dict,shapiro_0_dict, shapiro_1_dict, shapiro_2_dict,kw_car_dict,kw_elder_dict,kw_hos_dict,kw_mart_dict],
                           index = ['등분산성','정규성_0','정규성_1','정규성_2','입출차비율','70세_이상_비율','의료/건강','소매/유통'])
    print(df_stat)
    # 데이터 반출
    df_stat.to_csv(f'집단간차이분석결과/{name}_집단간차이분석.csv')

In [17]:
for path,name in zip(lst_path, lst_name):
    anova_all(path,name)

           statistic  pvalue
등분산성          0.7023  0.5013
정규성_0         0.8327  0.0360
정규성_1         0.9174  0.0229
정규성_2         0.9447  0.6830
입출차비율        18.9426  0.0001
70세_이상_비율     3.0951  0.2128
의료/건강        11.0941  0.0039
소매/유통         5.4599  0.0652
           statistic  pvalue
등분산성          4.5474  0.0176
정규성_0        -1.0000 -1.0000
정규성_1        -1.0000 -1.0000
정규성_2        -1.0000 -1.0000
입출차비율        14.3562  0.0008
70세_이상_비율     7.8011  0.0202
의료/건강        25.5376  0.0000
소매/유통        19.5435  0.0001
           statistic  pvalue
등분산성          1.9865  0.1528
정규성_0        -1.0000 -1.0000
정규성_1        -1.0000 -1.0000
정규성_2        -1.0000 -1.0000
입출차비율        23.2398  0.0000
70세_이상_비율    11.6985  0.0029
의료/건강        12.3069  0.0021
소매/유통         6.0318  0.0490
           statistic  pvalue
등분산성          2.9938  0.0769
정규성_0         0.8978  0.4202
정규성_1         0.9054  0.4581
정규성_2         0.8942  0.1334
입출차비율         9.9592  0.0069
70세_이상_비율     3.3560  0.1867
의료/건강        1