In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import chisquare
from scipy import stats
import scipy as sp
import statsmodels.api as sm
from statsmodels.formula.api import ols, logit, glm

In [21]:
df = pd.read_csv('./data/spstat1.csv', encoding='euc-kr')
df.head()

Unnamed: 0,번호,사용브랜드,사용기간,구매장소,구매가격,정보획득,구입조언,하루사용시간,브랜드이미지,가격만족도,...,구매중요도9_A/S,구매중요도10_명성,성별,연령,연령2,학력,직업,월수입,결혼,가족수
0,181,1,3,1,15,1,2,1,2,4,...,4,3,0,4,2,2,4,2,2,3
1,94,1,3,3,20,4,3,3,4,3,...,4,5,1,3,2,4,4,4,2,3
2,12,1,2,1,25,1,1,1,3,2,...,4,4,0,4,2,4,7,4,2,1
3,156,1,5,3,26,4,3,1,4,4,...,4,4,0,6,3,2,2,5,3,1
4,19,1,3,5,30,6,2,3,2,2,...,3,3,0,2,1,4,9,2,1,3


# Test1 : 월수입, 사용브랜드

In [22]:
a = pd.crosstab(df['월수입'], df['사용브랜드']).values
a

array([[20, 13],
       [14, 22],
       [28, 24],
       [20, 17],
       [12, 28],
       [33, 21]], dtype=int64)

In [23]:
rsum = a.sum(axis=0).reshape(1,-1)
csum = a.sum(axis=1).reshape(-1,1)
xe = rsum*csum/rsum.sum()
xe

array([[16.63095238, 16.36904762],
       [18.14285714, 17.85714286],
       [26.20634921, 25.79365079],
       [18.6468254 , 18.3531746 ],
       [20.15873016, 19.84126984],
       [27.21428571, 26.78571429]])

In [24]:
((a - xe)**2/xe).sum()

12.86516581745558

In [32]:
# 유의함
chisquare(a, xe, ddof=5, axis=None)

Power_divergenceResult(statistic=12.86516581745558, pvalue=0.045227798622814)

link : https://junsik-hwang.tistory.com/23

In [33]:
# chisquare 함수의 ddof가 6일 때 결과와 같음
stats.chi2_contingency(observed=a)

(12.86516581745558,
 0.024675611662926037,
 5,
 array([[16.63095238, 16.36904762],
        [18.14285714, 17.85714286],
        [26.20634921, 25.79365079],
        [18.6468254 , 18.3531746 ],
        [20.15873016, 19.84126984],
        [27.21428571, 26.78571429]]))

In [34]:
chisquare(a, xe, ddof=6, axis=None)

Power_divergenceResult(statistic=12.86516581745558, pvalue=0.024675611662926037)

# Test2 : 구매장소, 성별

In [85]:
a = pd.crosstab(df['구매장소'],df['성별']).values
a

array([[24, 19],
       [42, 29],
       [ 9, 13],
       [38, 52],
       [13, 13]], dtype=int64)

In [86]:
rsum = a.sum(axis=0).reshape(1,-1)
csum = a.sum(axis=1).reshape(-1,1)
xe = rsum*csum/rsum.sum()
xe

array([[21.5, 21.5],
       [35.5, 35.5],
       [11. , 11. ],
       [45. , 45. ],
       [13. , 13. ]])

In [87]:
((a - xe)**2/xe).sum()

5.866727544028559

In [89]:
chisquare(a, xe, ddof=2, axis=None)

Power_divergenceResult(statistic=5.866727544028559, pvalue=0.555394410553356)

# Test3
link : https://angeloyeo.github.io/2021/12/13/chi_square.html

In [73]:
a = np.array([[21,13,6],[16,15,14]]).T
a

array([[21, 16],
       [13, 15],
       [ 6, 14]])

In [74]:
rsum = a.sum(axis=0).reshape(1,-1)
csum = a.sum(axis=1).reshape(-1,1)
xe = rsum*csum/rsum.sum()
xe

array([[17.41176471, 19.58823529],
       [13.17647059, 14.82352941],
       [ 9.41176471, 10.58823529]])

In [75]:
((a - xe)**2/xe).sum()

3.7373471685971698

In [80]:
chisquare(a, xe, ddof=2, axis=None)

Power_divergenceResult(statistic=3.7373471685971698, pvalue=0.2912582956880207)

# Test4
link : http://contents.kocw.net/KOCW/document/2013/koreasejong/HongSungsik4/10.pdf page3

In [98]:
a = np.array([[120,89,173,118], [124,155,147,74]])
a

array([[120,  89, 173, 118],
       [124, 155, 147,  74]])

In [102]:
rsum = a.sum(axis=0).reshape(1,-1)
csum = a.sum(axis=1).reshape(-1,1)
xe = rsum*csum/rsum.sum()
xe

array([[122., 122., 160.,  96.],
       [122., 122., 160.,  96.]])

In [103]:
((a - xe)**2/xe).sum()

30.113866120218578

In [104]:
chisquare(a, xe, ddof=3, axis=None)

Power_divergenceResult(statistic=30.113866120218578, pvalue=4.640017883913084e-06)

In [105]:
chisquare(a, ddof=3, axis=None)

Power_divergenceResult(statistic=61.279999999999994, pvalue=1.561182050211938e-12)

In [106]:
chisquare(a, ddof=3, axis=None)

Power_divergenceResult(statistic=0.0, pvalue=1.0)

# Test5 : f_obs 만 입력
- One-way chisquare test : 전체 빈도수를 동일하게 나눈 수치만큼 기대빈도를 설정해 검정 진행.

In [109]:
chisquare([16, 18, 16, 14, 12, 12])

Power_divergenceResult(statistic=2.0, pvalue=0.8491450360846096)

In [113]:
s_ = sum([16, 18, 16, 14, 12, 12])
chisquare([16, 18, 16, 14, 12, 12], [s_/6]*6)

Power_divergenceResult(statistic=2.0, pvalue=0.8491450360846096)

# Test 6 : 세탁기 크기와 가족규모
link : https://brunch.co.kr/@linecard/623

In [3]:
# 교차표(corsstabulation)
a = np.array([[25,37,8], [10,62,53], [5,41,59]])
a

array([[25, 37,  8],
       [10, 62, 53],
       [ 5, 41, 59]])

In [4]:
# 기대값(f_exp, explectation 계산)
rsum = a.sum(axis=0).reshape(1,-1)
csum = a.sum(axis=1).reshape(-1,1)
xe = rsum*csum/rsum.sum()
xe

array([[ 9.33333333, 32.66666667, 28.        ],
       [16.66666667, 58.33333333, 50.        ],
       [14.        , 49.        , 42.        ]])

In [5]:
# 카이스퀘어 계산
((a - xe)**2/xe).sum()

58.20809523809523

In [8]:
chisquare(f_obs=a, f_exp=xe, ddof=4, axis=None)

Power_divergenceResult(statistic=58.20809523809523, pvalue=6.900770913722909e-12)

**과제파일**

In [9]:
# 교차표(corsstabulation)
a = np.array([[9,5,1], [2,8,5], [11,13,6]])
a

array([[ 9,  5,  1],
       [ 2,  8,  5],
       [11, 13,  6]])

In [10]:
# 기대값(f_exp, explectation 계산)
rsum = a.sum(axis=0).reshape(1,-1)
csum = a.sum(axis=1).reshape(-1,1)
xe = rsum*csum/rsum.sum()
xe

array([[ 5.5,  6.5,  3. ],
       [ 5.5,  6.5,  3. ],
       [11. , 13. ,  6. ]])

In [11]:
# 카이스퀘어 계산
((a - xe)**2/xe).sum()

7.813519813519813

In [12]:
chisquare(f_obs=a, f_exp=xe, ddof=4, axis=None)

Power_divergenceResult(statistic=7.813519813519813, pvalue=0.09865305493613609)