In [2]:
import pandas as pd
import numpy as np

# 모평균 검정(모집단 1개)

In [14]:
# Question: can the mean of mpg in this dataset be considered equal to 20?
df = pd.read_csv(filepath_or_buffer="mtcars.csv")

In [16]:
import scipy.stats as stats
from scipy.stats import shapiro

# Normality check on the mpg column
stat, pvalue = shapiro(df['mpg'])
print(stat, pvalue)

0.9475648403167725 0.1228824257850647


In [18]:
# Normality satisfied: two-sided one-sample test of mpg against a mean of 20
stat, pvalue = stats.ttest_1samp(
    a=df['mpg'],
    popmean=20,
    alternative='two-sided',
)
print(stat, pvalue)

0.08506003568133688 0.9327606409093872


In [20]:
# Normality not satisfied: two-sided Wilcoxon test on mpg shifted by 20
stat, pvalue = stats.wilcoxon(df['mpg'].sub(20), alternative='two-sided')
print(stat, pvalue)

249.0 0.78912592260167


# 모평균 검정(모집단 2개-쌍체)

In [21]:
# Difference before and after taking the blood pressure medication
df = pd.DataFrame(dict(
    before=[120, 135, 122, 124, 135, 122, 145, 160, 155, 142, 144, 135, 167],
    after=[110, 132, 123, 119, 123, 115, 140, 162, 142, 138, 135, 142, 160],
))

In [22]:
import scipy.stats as stats
from scipy.stats import shapiro

# Normality check on the paired differences (after - before)
stat, pvalue = shapiro(df['after'].sub(df['before']))
print(stat, pvalue)

0.9588785171508789 0.7362699508666992


In [23]:
# Normality satisfied: two-sided paired test of after vs. before
stat, pvalue = stats.ttest_rel(
    a=df['after'],
    b=df['before'],
    alternative='two-sided',
)
print(stat, pvalue)

-3.13822957230424 0.008560014166295203


In [24]:
# Normality not satisfied: Wilcoxon test on the paired differences (after - before)
stat, pvalue = stats.wilcoxon(df['after'].sub(df['before']))
print(stat, pvalue)

11.0 0.013427734375


# 모평균 검정(모집단 2개-2sample)

In [25]:
# Question: can the mean blood pressure of the two groups be considered different?
df = pd.DataFrame(dict(
    A=[120, 135, 122, 124, 135, 122, 145, 160, 155, 142, 144, 135, 167],
    B=[110, 132, 123, 119, 123, 115, 140, 162, 142, 138, 135, 142, 160],
))

In [26]:
import scipy.stats as stats
from scipy.stats import shapiro

# Normality check, run separately for group A and group B
(statA, pvalueA), (statB, pvalueB) = (shapiro(df[group]) for group in ('A', 'B'))

print(statA, pvalueA)
print(statB, pvalueB)

0.9314376711845398 0.35585272312164307
0.9498201012611389 0.5955665707588196


In [27]:
# Equal-variance check across groups A and B
stat, pvalue = stats.bartlett(*(df[group] for group in ('A', 'B')))
print(stat, pvalue)

0.027930756790756274 0.867271716287991


In [28]:
# Normality satisfied, equal variances: two-sided independent two-sample test
stat, pvalue = stats.ttest_ind(
    a=df['A'],
    b=df['B'],
    equal_var=True,
    alternative='two-sided',
)
print(stat, pvalue)

0.8191722818556323 0.4207486213941666


In [29]:
# Normality not satisfied: two-sided rank-sum test between groups A and B
stat, pvalue = stats.ranksums(
    x=df['A'],
    y=df['B'],
    alternative='two-sided',
)
print(stat, pvalue)

0.8461538461538461 0.39746692542259277


# 모평균 검정(모집단 3개)

In [31]:
# Question: can the mean scores of the three groups be considered equal?
df = pd.DataFrame(dict(
    A=[120, 135, 122, 124, 135, 122, 145, 160, 155, 142, 144, 135, 167],
    B=[110, 132, 123, 119, 123, 115, 140, 162, 142, 138, 135, 142, 160],
    C=[130, 120, 115, 122, 133, 144, 122, 120, 110, 134, 125, 122, 122],
))

In [34]:
import scipy.stats as stats
from scipy.stats import shapiro

# Normality check for each of the three groups, one result per line
print(*(shapiro(df[group]) for group in ('A', 'B', 'C')), sep='\n')

ShapiroResult(statistic=0.9314376711845398, pvalue=0.35585272312164307)
ShapiroResult(statistic=0.9498201012611389, pvalue=0.5955665707588196)
ShapiroResult(statistic=0.9396706223487854, pvalue=0.45265132188796997)


In [35]:
# Equal-variance check across groups A, B and C
stat, pvalue = stats.bartlett(*(df[group] for group in ('A', 'B', 'C')))
print(stat, pvalue)

4.222248448848066 0.12110174433684852


In [36]:
# Normality satisfied, equal variances: one-way ANOVA across the three groups
stat, pvalue = stats.f_oneway(*(df[group] for group in ('A', 'B', 'C')))
print(stat, pvalue)

3.6970511827172867 0.0346491232068625


In [37]:
# Normality not satisfied: Kruskal-Wallis test across the three groups
stat, pvalue = stats.kruskal(*(df[group] for group in ('A', 'B', 'C')))
print(stat, pvalue)

6.896997321554428 0.03179333314266727


# 카이제곱 검정

In [38]:
# Question: are the categories present in equal proportions?
# (i.e. is the probability of falling into each category the same?)
row1 = [30, 20, 15, 35]
df = pd.DataFrame(data=[row1], columns=list('ABCD'))

In [39]:
from scipy.stats import chisquare

# Observed counts per category and the expected counts under equal proportions
f_obs = [30, 20, 15, 35]
f_exp = [25] * 4

stat, pvalue = chisquare(f_obs=f_obs, f_exp=f_exp)
print(stat, pvalue)

10.0 0.01856613546304325


In [41]:
# Question: is there a difference? (are the two categorical variables independent?)
row1 = [200, 190, 250]
row2 = [220, 250, 300]
df = pd.DataFrame(
    data=[row1, row2],
    index=['10대', '20대'],
    columns=['딸기', '초코', '바닐라'],
)

In [42]:
from scipy.stats import chi2_contingency
# Chi-square test on the contingency table; print each returned value on its own line
stat, pvalue, dof, expected = chi2_contingency(observed=df)
print(stat, pvalue, dof, expected, sep='\n')

1.708360126075226
0.4256320394874311
2
[[190.63829787 199.71631206 249.64539007]
 [229.36170213 240.28368794 300.35460993]]


In [49]:
# Case where the data is not already in contingency-table form:
# one record per (ice cream flavour, age group) pair with its head count
df = pd.DataFrame(
    [
        ('딸기', '10대', 200),
        ('초코', '10대', 190),
        ('바닐라', '10대', 250),
        ('딸기', '20대', 220),
        ('초코', '20대', 250),
        ('바닐라', '20대', 300),
    ],
    columns=['아이스크림', '연령', '인원'],
)
df

Unnamed: 0,아이스크림,연령,인원
0,딸기,10대,200
1,초코,10대,190
2,바닐라,10대,250
3,딸기,20대,220
4,초코,20대,250
5,바닐라,20대,300


In [50]:
# Pivot the long-format records into a table with the '연령' values as rows and
# the '아이스크림' values as columns, summing '인원' for each pair.
# The aggregation is named by the string 'sum' rather than the Python builtin:
# recent pandas versions emit a FutureWarning when the builtin callable is passed.
df = pd.crosstab(index=df['연령'], columns=df['아이스크림'], values=df['인원'],
                 aggfunc='sum')
df

아이스크림,딸기,바닐라,초코
연령,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10대,200,250,190
20대,220,300,250


# 다중회귀분석

In [51]:
from sklearn.datasets import load_diabetes

# Load the diabetes dataset; predictors are the age, sex and bmi columns,
# the response is the target array as a Series named 'target'.
diabetes = load_diabetes()
x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)[['age', 'sex', 'bmi']]
y = pd.Series(diabetes.target, name='target')

In [52]:
import statsmodels.api as sm

# Add the constant column to the predictors, fit OLS of y on x, and print the summary
x = sm.add_constant(x)
model = sm.OLS(endog=y, exog=x).fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                 target   R-squared:                       0.351
Model:                            OLS   Adj. R-squared:                  0.346
Method:                 Least Squares   F-statistic:                     78.94
Date:                Wed, 29 Nov 2023   Prob (F-statistic):           7.77e-41
Time:                        19:55:17   Log-Likelihood:                -2451.6
No. Observations:                 442   AIC:                             4911.
Df Residuals:                     438   BIC:                             4928.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        152.1335      2.964     51.321      0.0

# 상관분석

In [54]:
from sklearn.datasets import load_diabetes

# Load the diabetes dataset; x is the bmi column and y is the target array
# as a Series named 'target'.
diabetes = load_diabetes()
x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)['bmi']
y = pd.Series(diabetes.target, name='target')

In [55]:
from scipy.stats import pearsonr

# Pearson correlation between x and y, with its p-value
r, pvalue = pearsonr(x=x, y=y)
print(r, pvalue)

0.5864501344746883 3.4660064451677685e-42


In [56]:
# Statistic computed from the correlation coefficient:
# r * sqrt(n - 2) / sqrt(1 - r^2)
n = len(x)
numerator = r * (n - 2) ** 0.5
denominator = (1 - r ** 2) ** 0.5
stat = numerator / denominator
print(stat)

15.187289570365293


# 로지스틱 회귀분석

In [62]:
import seaborn as sns
# Load the titanic dataset and keep only the response and the three predictors
df = sns.load_dataset('titanic')[['survived', 'sex', 'sibsp', 'fare']]

In [63]:
# Encode sex as 1 (male) / 0 (female).
# The encoding is skipped when the column is already numeric: re-applying the
# map to values that are already 0/1 would turn every entry into NaN, so this
# guard keeps the cell safe to run more than once. `assign` builds a new frame
# instead of writing a column into the existing one.
if not pd.api.types.is_numeric_dtype(df['sex']):
    df = df.assign(sex=df['sex'].map({'male': 1,
                                      'female': 0}))

# Predictors are every column except the response 'survived'.
x = df.drop(['survived'], axis=1)
y = df['survived']

import statsmodels.api as sm

# Add the constant column, fit the logit model of y on x, and print the summary.
x = sm.add_constant(x)
model = sm.Logit(y, x).fit()
summary = model.summary()
print(summary)

Optimization terminated successfully.
         Current function value: 0.483846
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:               survived   No. Observations:                  891
Model:                          Logit   Df Residuals:                      887
Method:                           MLE   Df Model:                            3
Date:                Wed, 29 Nov 2023   Pseudo R-squ.:                  0.2734
Time:                        20:01:45   Log-Likelihood:                -431.11
converged:                       True   LL-Null:                       -593.33
Covariance Type:            nonrobust   LLR p-value:                 5.094e-70
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8703      0.163      5.352      0.000       0.552       1.189
sex           -2.5668      0.