In [1]:
import pandas as pd
import numpy as np

# 모평균 검정(모집단 1개)

In [2]:
# Research question: can the mean of `mpg` be considered equal to 20?
csv_path = "mtcars.csv"
df = pd.read_csv(csv_path)

In [3]:
import scipy.stats as stats
from scipy.stats import shapiro

In [5]:
# Shapiro-Wilk normality test on the mpg column.
shapiro_result = stats.shapiro(df['mpg'])
statistic, pvalue = shapiro_result
print(round(statistic, 4), round(pvalue, 4))

0.9476 0.1229


In [6]:
# Normality holds -> one-sample t-test of mpg against a population mean of 20.
# alternative: 'two-sided' here; use 'greater' or 'less' for a one-sided test.
ttest_result = stats.ttest_1samp(df['mpg'], popmean=20, alternative='two-sided')
statistic, pvalue = ttest_result
print(round(statistic, 4), round(pvalue, 4))

0.0851 0.9328


In [7]:
# Normality violated -> Wilcoxon signed-rank test on the deviations of mpg from 20.
mpg_deviation = df['mpg'] - 20
statistic, pvalue = stats.wilcoxon(mpg_deviation, alternative='two-sided')
print(round(statistic, 4), round(pvalue, 4))

249.0 0.7891


# 모평균 검정(모집단 2개-쌍체)

In [8]:
# Blood pressure before and after taking the medication (paired measurements).
bp_before = [120, 135, 122, 124, 135, 122, 145, 160, 155, 142, 144, 135, 167]
bp_after = [110, 132, 123, 119, 123, 115, 140, 162, 142, 138, 135, 142, 160]
df = pd.DataFrame({'before': bp_before, 'after': bp_after})

In [9]:
import scipy.stats as stats
from scipy.stats import shapiro

In [10]:
# Shapiro-Wilk normality test on the paired differences (after - before).
paired_diff = df['after'] - df['before']
statistic, pvalue = stats.shapiro(paired_diff)
print(round(statistic, 4), round(pvalue, 4))

0.9589 0.7363


In [12]:
# Normality holds -> paired t-test, with 'after' as the first sample.
# alternative: 'two-sided' here; use 'greater' or 'less' for a one-sided test.
paired_result = stats.ttest_rel(df['after'], df['before'], alternative='two-sided')
statistic, pvalue = paired_result
print(round(statistic, 4), round(pvalue, 4))

-3.1382 0.0086


In [13]:
# Normality violated -> Wilcoxon signed-rank test on the differences (after - before).
signed_diff = df['after'] - df['before']
statistic, pvalue = stats.wilcoxon(signed_diff, alternative='two-sided')
print(round(statistic, 4), round(pvalue, 4))

11.0 0.0134


# 모평균 검정(모집단 2개-2sample)

In [14]:
# Research question: do the two groups differ in mean blood pressure?
group_a = [120, 135, 122, 124, 135, 122, 145, 160, 155, 142, 144, 135, 167]
group_b = [110, 132, 123, 119, 123, 115, 140, 162, 142, 138, 135, 142, 160]
df = pd.DataFrame({'A': group_a, 'B': group_b})

In [28]:
import scipy.stats as stats
from scipy.stats import shapiro

In [15]:
# Shapiro-Wilk normality test, run separately for each group.
statisticA, pvalueA = stats.shapiro(df['A'])
print(round(statisticA, 4), round(pvalueA, 4))

statisticB, pvalueB = stats.shapiro(df['B'])
print(round(statisticB, 4), round(pvalueB, 4))

0.9314 0.3559
0.9498 0.5956


In [17]:
# Bartlett test for equality of variances between groups A and B.
bartlett_result = stats.bartlett(df['A'], df['B'])
statistic, pvalue = bartlett_result
print(round(statistic, 4), round(pvalue, 4))

0.0279 0.8673


In [19]:
# Normality holds -> independent two-sample t-test.
# equal_var=True is used when the equal-variance check passes; set it to False otherwise.
ttest_kwargs = {'equal_var': True, 'alternative': 'two-sided'}
statistic, pvalue = stats.ttest_ind(df['A'], df['B'], **ttest_kwargs)
print(round(statistic, 4), round(pvalue, 4))

0.8192 0.4207


In [21]:
# Normality violated -> Wilcoxon rank-sum test comparing groups A and B.
# NOTE(review): the samples contain repeated values; SciPy's docs point to
# stats.mannwhitneyu for tie handling - confirm whether ranksums is acceptable here.
ranksum_result = stats.ranksums(df['A'], df['B'], alternative='two-sided')
statistic, pvalue = ranksum_result
print(round(statistic, 4), round(pvalue, 4))

0.8462 0.3975


# 모평균 검정(모집단 3개)

In [22]:
# Research question: can the mean scores of the three groups be considered equal?
group_values = {
    'A': [120, 135, 122, 124, 135, 122, 145, 160, 155, 142, 144, 135, 167],
    'B': [110, 132, 123, 119, 123, 115, 140, 162, 142, 138, 135, 142, 160],
    'C': [130, 120, 115, 122, 133, 144, 122, 120, 110, 134, 125, 122, 122],
}
df = pd.DataFrame(group_values)

In [29]:
import scipy.stats as stats
from scipy.stats import shapiro

In [23]:
# Shapiro-Wilk normality test for each of the three groups, printed in A, B, C order.
for group_name in ('A', 'B', 'C'):
    print(stats.shapiro(df[group_name]))

ShapiroResult(statistic=0.9314376711845398, pvalue=0.35585272312164307)
ShapiroResult(statistic=0.9498201012611389, pvalue=0.5955665707588196)
ShapiroResult(statistic=0.9396706223487854, pvalue=0.45265132188796997)


In [24]:
# Bartlett test for equality of variances across groups A, B and C.
bartlett_result = stats.bartlett(df['A'], df['B'], df['C'])
print(bartlett_result)

BartlettResult(statistic=4.222248448848066, pvalue=0.12110174433684852)


In [25]:
# Normality and equal variances hold -> one-way ANOVA across the three groups.
anova_result = stats.f_oneway(df['A'], df['B'], df['C'])
statistic, pvalue = anova_result
print(round(statistic, 4), round(pvalue, 4))

3.6971 0.0346


In [26]:
# Normality violated -> Kruskal-Wallis H test across the three groups.
kruskal_result = stats.kruskal(df['A'], df['B'], df['C'])
statistic, pvalue = kruskal_result
print(round(statistic, 4), round(pvalue, 4))

6.897 0.0318


# 카이제곱 검정

In [30]:
# Research question: are the categories present in equal proportions
# (i.e. is the probability of falling into each category the same)?
category_names = ['A', 'B', 'C', 'D']
row1 = [30, 20, 15, 35]
df = pd.DataFrame([row1], columns=category_names)

In [32]:
from scipy.stats import chisquare

In [35]:
# Chi-square goodness-of-fit test: do the observed counts match the expected shares?
f_obs = [30, 20, 15, 35]
total = sum(f_obs)

# Expected share of each category under H0 (equal proportions here).
# For unequal shares, replace with e.g. [0.30, 0.10, 0.35, 0.25].
# The shares must sum to 1: otherwise the expected counts no longer add up to the
# observed total and chisquare raises a ValueError.
proportions = [1 / len(f_obs)] * len(f_obs)

# Expected counts are derived from the observed total so the two always agree.
f_exp = [total * share for share in proportions]

statistic, pvalue = chisquare(f_obs=f_obs, f_exp=f_exp)
print(statistic)
print(pvalue)

10.0
0.01856613546304325


In [36]:
# Research question: is there a difference between the age groups
# (i.e. are the two categorical variables independent)?
row1 = [200, 190, 250]
row2 = [220, 250, 300]
flavours = ['딸기', '초코', '바닐라']
age_groups = ['10대', '20대']
df = pd.DataFrame([row1, row2], columns=flavours, index=age_groups)

In [37]:
from scipy.stats import chi2_contingency
# Chi-square test of independence on the contingency table held in df.
chi2_result = chi2_contingency(df)
statistic, pvalue, dof, expected = chi2_result

# Print the statistic, p-value, degrees of freedom and the rounded expected counts.
for value in (statistic, pvalue, dof, np.round(expected, 2)):
    print(value)

1.708360126075226
0.4256320394874311
2
[[190.64 199.72 249.65]
 [229.36 240.28 300.35]]


In [38]:
# Case where the data is not already a contingency table: one row per
# (flavour, age group) pair with its head count.
records = [
    ('딸기', '10대', 200),
    ('초코', '10대', 190),
    ('바닐라', '10대', 250),
    ('딸기', '20대', 220),
    ('초코', '20대', 250),
    ('바닐라', '20대', 300),
]
df = pd.DataFrame(records, columns=['아이스크림', '연령', '인원'])
df

Unnamed: 0,아이스크림,연령,인원
0,딸기,10대,200
1,초코,10대,190
2,바닐라,10대,250
3,딸기,20대,220
4,초코,20대,250
5,바닐라,20대,300


In [39]:
# Pivot the long-format counts into an age-group x flavour contingency table.
# The aggregation is given as the string 'sum' instead of the builtin `sum`:
# pandas >= 2.1 emits a FutureWarning for the builtin callable because its
# handling is scheduled to change, while the string keeps the current behaviour.
table = pd.crosstab(index=df['연령'], columns=df['아이스크림'],
                    values=df['인원'], aggfunc='sum')
table

아이스크림,딸기,바닐라,초코
연령,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10대,200,250,190
20대,220,300,250


# 다중회귀분석

In [40]:
from sklearn.datasets import load_diabetes

# Load the diabetes dataset and wrap features / target in DataFrames.
diabetes = load_diabetes()
feature_names = diabetes.feature_names
x = pd.DataFrame(diabetes.data, columns=feature_names)
y = pd.DataFrame(diabetes.target, columns=['target'])

In [42]:
# Keep three predictors and reduce the target frame to a single Series.
predictors = ['age', 'sex', 'bmi']
x = x[predictors]
y = y['target']

In [43]:
import statsmodels.api as sm
# Add the intercept column, fit ordinary least squares and print the summary table.
x = sm.add_constant(x)
ols_spec = sm.OLS(y, x)
model = ols_spec.fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 target   R-squared:                       0.351
Model:                            OLS   Adj. R-squared:                  0.346
Method:                 Least Squares   F-statistic:                     78.94
Date:                Tue, 28 Nov 2023   Prob (F-statistic):           7.77e-41
Time:                        17:15:23   Log-Likelihood:                -2451.6
No. Observations:                 442   AIC:                             4911.
Df Residuals:                     438   BIC:                             4928.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        152.1335      2.964     51.321      0.0

In [44]:
# 1. R-squared (coefficient of determination)
# r2 = 0.351

# 2. Regression coefficients
# age = 138.9039
# sex = -36.1353
# bmi = 926.9120

# 3. Regression coefficient (intercept)
# const = 152.1335

# 4. p-value of the regression model (Prob (F-statistic) in the summary)
# pvalue = 7.77e-41

# 상관분석

In [45]:
from sklearn.datasets import load_diabetes

# Reload the diabetes dataset so x and y are full DataFrames again.
diabetes = load_diabetes()
column_labels = diabetes.feature_names
x = pd.DataFrame(diabetes.data, columns=column_labels)
y = pd.DataFrame(diabetes.target, columns=['target'])

In [46]:
# Reduce both frames to the single Series used in the correlation: bmi vs. target.
x, y = x['bmi'], y['target']

In [49]:
from scipy.stats import pearsonr

# Pearson correlation between x and y.
pearson_result = pearsonr(x, y)
r, pvalue = pearson_result

print(round(r, 2))    # correlation coefficient
print(round(pvalue, 2))

# Test statistic: t = r * sqrt(n - 2) / sqrt(1 - r^2)
n = len(x)
dof = n - 2
statistic = r * (dof ** 0.5) / ((1 - r ** 2) ** 0.5)
print(round(statistic, 2))

0.59
0.0
15.19


# 로지스틱 회귀분석

In [2]:
import seaborn as sns
# Load the titanic example dataset through seaborn.
dataset_name = 'titanic'
df = sns.load_dataset(dataset_name)

In [3]:
# Keep only the response and the three predictors used in the model.
# .copy() makes the subset an independent frame, so the later column assignment
# (df['sex'] = ...) does not raise SettingWithCopyWarning.
df = df[['survived','sex','sibsp','fare']].copy()

In [4]:
# Convert the string-typed sex column to numbers (female -> 1, male -> 0).
sex_codes = {'female': 1, 'male': 0}
df['sex'] = df['sex'].map(sex_codes)

In [5]:
# Split into the response (survived) and the remaining predictor columns.
y = df['survived']
x = df.drop(columns=['survived'])

In [6]:
# Display the predictor frame (every column of df except 'survived').
x

Unnamed: 0,sex,sibsp,fare
0,0,1,7.2500
1,1,1,71.2833
2,1,0,7.9250
3,1,1,53.1000
4,0,0,8.0500
...,...,...,...
886,0,0,13.0000
887,1,0,30.0000
888,1,1,23.4500
889,0,0,30.0000


In [7]:
import statsmodels.api as sm

# Add the intercept column, fit a logistic regression and print the summary table.
x = sm.add_constant(x)
logit_spec = sm.Logit(y, x)
model = logit_spec.fit()
summary = model.summary()
print(summary)

Optimization terminated successfully.
         Current function value: 0.483846
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               survived   No. Observations:                  891
Model:                          Logit   Df Residuals:                      887
Method:                           MLE   Df Model:                            3
Date:                Fri, 01 Dec 2023   Pseudo R-squ.:                  0.2734
Time:                        21:42:30   Log-Likelihood:                -431.11
converged:                       True   LL-Null:                       -593.33
Covariance Type:            nonrobust   LLR p-value:                 5.094e-70
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.6964      0.129    -13.134      0.000      -1.950      -1.443
sex            2.5668      0.