In [23]:
import pandas as pd
import numpy as np
from scipy import stats
import scipy
from statsmodels.stats.proportion import proportions_ztest
import statsmodels.api as sm
import statsmodels.formula.api as smf


# 모평균 추정 (z분포)

In [15]:
df = pd.DataFrame({"sample":[18, 18, 20, 21, 20, 23, 19, 18, 17, 21, 22, 20, 20, 21, 20, 19, 19, 18, 17, 19]})

lower, upper = stats.norm.interval(0.95, loc = np.mean(df), scale = 3.8/np.sqrt(20))
print("신뢰구간: ({0}, {1})".format(lower.round(2), upper.round(2)))

신뢰구간: ([17.83], [21.17])


# 모평균 추정 (t분포)

In [24]:
df = pd.DataFrame({'sample': [73, 71, 74, 69, 70, 73, 70, 68, 75, 72, 70, 72, 73, 70, 70, 72, 71, 70, 75, 72]})

lower, upper = stats.t.interval(0.95, len(df) - 1, loc=np.mean(df), scale=scipy.stats.sem(df))
print('신뢰구간: ({0}, {1})'.format(lower.round(2), upper.round(2)))

신뢰구간: ([70.6], [72.4]


# 평균 검정(모집단 표준편차를 모르는 경우)

In [53]:
df = pd.DataFrame({'sample':[85.0, 79.0, 79.1, 79.9, 81.6, 78.6, 85.4, 83.4, 78.1, 79.2]})
shapiro(df)

(0.8473219275474548, 0.053975414484739304)

In [55]:
t_result = stats.ttest_1samp(df, 78.0)

t, p = t_result.statistic.round(3), t_result.pvalue.round(3)

print('1-Sample t-test')
print('t 통계량: {}'.format(t))
print('p-value: {}'.format(p))

1-Sample t-test
t 통계량: [3.379]
p-value: [0.008]


# 2 Sample t-test

In [56]:
df1 = pd.DataFrame({'sample':[1.435,1.572,1.486,1.511,1.457,1.548,1.404,1.883]})
df2 = pd.DataFrame({'sample':[1.883,1.715,1,799,1.768,1.711,1.832,1.427,1.344]})

shapiro(df1)

(0.7734811902046204, 0.014793740585446358)

In [57]:
shapiro(df2)

(0.3908460736274719, 3.3066382343349687e-07)

In [58]:
stats.levene(df1['sample'], df2['sample'])

LeveneResult(statistic=0.8848047272536113, pvalue=0.3617916531591635)

In [59]:
t_result = stats.ttest_ind(df1, df2, equal_var=True)

t, p = t_result.statistic.round(3), t_result.pvalue.round(3)

print('2-sample t-test')
print('t: {}'.format(t))
print('p: {}'.format(p))

2-sample t-test
t: [-0.94]
p: [0.362]


# Paired t-test

In [60]:
df1 = pd.DataFrame({'before':[720,589,780,648,720,589,780,648,780,648]})
df2 = pd.DataFrame({'after':[710,580,787,712,750,600,782,670,790,680]})

t_result = stats.ttest_rel(df1, df2)

t, p = t_result.statistic.round(3), t_result.pvalue.round(3)

print('Paired t-test')
print('t: {}'.format(t))
print('p: {}'.format(p))

Paired t-test
t: [-2.266]
p: [0.05]


# 1 Proportion test

In [65]:
count = 40
n_obs = 100
value = 0.5

stat, pval = proportions_ztest(count, n_obs, value)

print('1 Proportion test')
print('z: {0:0.3f}'.format(stat))
print('p: {0:0.3f}'.format(pval))

1 Proportion test
z: -2.041
p: 0.041


# 2 Proportion test

In [67]:
count = np.array([14,5])
nobs = np.array([1200, 1200])

stat, pval = proportions_ztest(count,nobs)

print('2 Proportion test')
print('z: {0:0.3f}'.format(stat))
print('p: {0:0.3f}'.format(pval))

2 Proportion test
z: 2.073
p: 0.038


# 카이제곱 검정

In [69]:
df = pd.DataFrame({'as':[18,8,4,4,3,3], 'price':[1,2,1,1,1,25], 'performance':[8,14,3,2,3,8],
                  'expand':[7,5,4,3,1,10], 'design':[10,5,9,2,1,2], 'safe':[9,9,5,7,1,1], 
                   'function':[10,4,4,3,1,7]})

chi, pval, dof, expected = stats.chi2_contingency(df.T)

print('chi-square test')
print('chisq: {0:0.3f}'.format(chi))
print('p: {0:0.3f}'.format(pval))
print('degree of freedom: {}'.format(dof))
print('expected value:\n{}'.format(expected.round(3)))

chi-square test
chisq: 98.274
p: 0.000
degree of freedom: 30
expected value:
[[11.004  8.21   5.24   3.843  1.921  9.782]
 [ 8.528  6.362  4.061  2.978  1.489  7.581]
 [10.454  7.799  4.978  3.651  1.825  9.293]
 [ 8.253  6.157  3.93   2.882  1.441  7.336]
 [ 7.978  5.952  3.799  2.786  1.393  7.092]
 [ 8.803  6.568  4.192  3.074  1.537  7.825]
 [ 7.978  5.952  3.799  2.786  1.393  7.092]]


# One way ANOVA

In [96]:
df = pd.read_csv('../Data/1. 통계이해/ANOVA.csv')

a = df.loc[df['gangjong'] == 'a', 'time']
b = df.loc[df['gangjong'] == 'b', 'time']
c = df.loc[df['gangjong'] == 'c', 'time']
d = df.loc[df['gangjong'] == 'd', 'time']

f_result = stats.f_oneway(a,b,c,d)
f, p = f_result.statistic.round(3), f_result.pvalue.round(3)

print('One-way')
print('F 통계량: {}'.format(f))
print('p-value: {}'.format(p))

One-way
F 통계량: 4.172
p-value: 0.019


# 상관분석

In [100]:
strength = [37.6,38.6,37.2,36.4,38.6,39,37.2,36.1,35.9,37.1,36.9,37.5,36.3,38.1,39,36.9,36.8,37.6,33,33.5]
temperature = [14,15,14,16,17,14,17,16,15,16,14,16,18,13,15,17,14,16,20,21]

corr, pval = stats.pearsonr(strength, temperature)

print('Correlation Analysis')
print('corr: {0:0.3f}'.format(corr))
print('p-value: {0:0.3f}'.format(pval))

Correlation Analysis
corr: -0.723
p-value: 0.000


# 단순선형회귀 1

In [113]:
strength = [37.6,38.6,37.2,36.4,38.6,39,37.2,36.1,35.9,37.1,36.9,37.5,36.3,38.1,39,36.9,36.8,37.6,33,33.5]
baking_time = [0.798992,0.84902,0.816163,0.758266,0.815894,0.807477,0.809068,0.7664,0.742243,0.751118,0.740629,0.751495,0.738863,0.827428,0.818172,0.740939,0.757128,0.804695,0.695121,0.735377]

baking_time = sm.add_constant(baking_time)
model = sm.OLS(strength, baking_time)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.646
Model:                            OLS   Adj. R-squared:                  0.627
Method:                 Least Squares   F-statistic:                     32.90
Date:                Tue, 30 Jun 2020   Prob (F-statistic):           1.95e-05
Time:                        18:51:12   Log-Likelihood:                -26.432
No. Observations:                  20   AIC:                             56.86
Df Residuals:                      18   BIC:                             58.85
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         13.0026      4.183      3.108      0.0

# 단순선형회귀 2

In [114]:
df = pd.read_csv('../Data/1. 통계이해/correaltion.csv')
model = smf.ols(formula = "strength ~ Oven_TMP", data=df)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:               strength   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                    0.1136
Date:                Tue, 30 Jun 2020   Prob (F-statistic):              0.737
Time:                        18:51:14   Log-Likelihood:                -223.88
No. Observations:                  86   AIC:                             451.8
Df Residuals:                      84   BIC:                             456.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     30.4190      5.271      5.771      0.0