In [202]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [203]:
# 디카를로자동차 사라토가 대리점 사례

# 데이터 입력

Data = {
    'Sales_s' : [0,1,2,3,4,5],
    'Days_s' : [54,117,72,42,12,3]
}


In [204]:
# 데이터프레임 생성

df = pd.DataFrame(Data)
df 

Unnamed: 0,Sales_s,Days_s
0,0,54
1,1,117
2,2,72
3,3,42
4,4,12
5,5,3


In [205]:
# 날짜 합계
Sum_Days =  df['Days_s'].sum()
Sum_Days.item()

300

In [206]:
# 상대도수 열 추가
df['Prob_Sales_s'] = df['Days_s'] / Sum_Days
df

Unnamed: 0,Sales_s,Days_s,Prob_Sales_s
0,0,54,0.18
1,1,117,0.39
2,2,72,0.24
3,3,42,0.14
4,4,12,0.04
5,5,3,0.01


In [207]:
# 1일 매출의 기댓값

E_X = (df['Sales_s'] * df['Prob_Sales_s']).sum()
E_X.item()

1.5

In [208]:
df['Diff_Sales_s'] = df['Sales_s'] - E_X
df

Unnamed: 0,Sales_s,Days_s,Prob_Sales_s,Diff_Sales_s
0,0,54,0.18,-1.5
1,1,117,0.39,-0.5
2,2,72,0.24,0.5
3,3,42,0.14,1.5
4,4,12,0.04,2.5
5,5,3,0.01,3.5


In [209]:
# 편차제곱을 열로 추가
df['Diff_squared_s'] = df['Diff_Sales_s'] ** 2 
df

Unnamed: 0,Sales_s,Days_s,Prob_Sales_s,Diff_Sales_s,Diff_squared_s
0,0,54,0.18,-1.5,2.25
1,1,117,0.39,-0.5,0.25
2,2,72,0.24,0.5,0.25
3,3,42,0.14,1.5,2.25
4,4,12,0.04,2.5,6.25
5,5,3,0.01,3.5,12.25


In [210]:
# 분산
Variance_s = (df['Diff_squared_s'] * df['Prob_Sales_s']).sum()
Variance_s.item()

1.25

In [211]:
# 표준편차
Std_s = np.sqrt(Variance_s)
Std_s.item()

1.118033988749895

In [212]:
print(f"Std_s = {Std_s:.2f}")

Std_s = 1.12


이변량 분포 : 사라토가 & 제네바 

In [213]:
# 축 생성 & 두 대리점의 판매량 값 지정
saratoga_values = [0,1,2,3,4,5]
geneve_values =[0,1,2,3]

In [214]:
# 데이터 입력
data = [
    [21,30,24,9,2,0],
    [21,36,33,18,2,1],
    [9,42,9,12,3,2],
    [3,9,6,3,5,0],
]



In [215]:
# 데이터프레임 생성

cross_table = pd.DataFrame(data, columns=saratoga_values, index=geneve_values)
cross_table

Unnamed: 0,0,1,2,3,4,5
0,21,30,24,9,2,0
1,21,36,33,18,2,1
2,9,42,9,12,3,2
3,3,9,6,3,5,0


In [216]:
# 전체 합계 계산
total_sum = cross_table.values.sum()
total_sum.item()

300

In [217]:
# 행의 합계를 계산 및 새로운 열로 추가

cross_table['Sum'] = cross_table.sum(axis=1)
cross_table

Unnamed: 0,0,1,2,3,4,5,Sum
0,21,30,24,9,2,0,86
1,21,36,33,18,2,1,111
2,9,42,9,12,3,2,77
3,3,9,6,3,5,0,26


In [218]:
cross_table.loc['Sum'] = cross_table.sum(axis=0)


In [219]:
cross_table

Unnamed: 0,0,1,2,3,4,5,Sum
0,21,30,24,9,2,0,86
1,21,36,33,18,2,1,111
2,9,42,9,12,3,2,77
3,3,9,6,3,5,0,26
Sum,54,117,72,42,12,3,300


In [220]:
# 교차제표를 결합확률분포표로 바꿔서 생성

df_jointprob = cross_table / total_sum
df_jointprob

Unnamed: 0,0,1,2,3,4,5,Sum
0,0.07,0.1,0.08,0.03,0.006667,0.0,0.286667
1,0.07,0.12,0.11,0.06,0.006667,0.003333,0.37
2,0.03,0.14,0.03,0.04,0.01,0.006667,0.256667
3,0.01,0.03,0.02,0.01,0.016667,0.0,0.086667
Sum,0.18,0.39,0.24,0.14,0.04,0.01,1.0


In [221]:
# Geneve의 판매량과 Saratoga의 판매량을 합쳐서 s 생성

s = [x + y for x in saratoga_values for y in geneve_values]
s

[0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8]

In [222]:
# s의 중복된 값을 한가지만 취해서 리스트를 다시 작성
s = set(s)
s

{0, 1, 2, 3, 4, 5, 6, 7, 8}

In [223]:
# 다른 방법
np.unique(s)

array([{0, 1, 2, 3, 4, 5, 6, 7, 8}], dtype=object)

In [224]:
# s = 0 ~ 8 각각에 대한 확률 계산
P_s = {}

for s in np.arange(0,9):
    prob_sum = 0
    for x in saratoga_values:
      for y in geneve_values:
        if x + y == s:
            prob_sum += df_jointprob.loc[y,x]
    P_s[s] = round(prob_sum, 4)


In [225]:
P_s


{np.int64(0): np.float64(0.07),
 np.int64(1): np.float64(0.17),
 np.int64(2): np.float64(0.23),
 np.int64(3): np.float64(0.29),
 np.int64(4): np.float64(0.1267),
 np.int64(5): np.float64(0.0667),
 np.int64(6): np.float64(0.0233),
 np.int64(7): np.float64(0.0233),
 np.int64(8): np.float64(0.0)}

In [226]:
# s에 대한 확률분포표 생성
 
s_df = pd.DataFrame(list(P_s.items()), columns = ['s', 'P(s)'])
s_df

Unnamed: 0,s,P(s)
0,0,0.07
1,1,0.17
2,2,0.23
3,3,0.29
4,4,0.1267
5,5,0.0667
6,6,0.0233
7,7,0.0233
8,8,0.0


In [227]:
# Geneve에 대한 확률분포 계산

geneve_df = pd.DataFrame({'Sales_g': geneve_values})
geneve_df

Unnamed: 0,Sales_g
0,0
1,1
2,2
3,3


In [228]:
# df_jointprob 에서 Sum 열을 가져와서 geneve_df에 확률을 열로 추가

geneve_df['Prob_Sales_g'] = df_jointprob['Sum'].iloc[:-1].round(4)
geneve_df

Unnamed: 0,Sales_g,Prob_Sales_g
0,0,0.2867
1,1,0.37
2,2,0.2567
3,3,0.0867


In [229]:
# 제네바 대리점 1일 판매량의 기댓값 계산

E_Y = (geneve_df['Sales_g'] * geneve_df['Prob_Sales_g']).sum()
E_Y.item()

# 편차 제곱
geneve_df['Diff_Sales_g'] = geneve_df['Sales_g'] - E_Y
geneve_df

Unnamed: 0,Sales_g,Prob_Sales_g,Diff_Sales_g
0,0,0.2867,-1.1435
1,1,0.37,-0.1435
2,2,0.2567,0.8565
3,3,0.0867,1.8565


In [230]:
# 편차 제곱 계산하여 열 추가 
geneve_df['Diff_Squared_g'] = geneve_df['Diff_Sales_g'] ** 2

In [231]:
geneve_df

Unnamed: 0,Sales_g,Prob_Sales_g,Diff_Sales_g,Diff_Squared_g
0,0,0.2867,-1.1435,1.307592
1,1,0.37,-0.1435,0.020592
2,2,0.2567,0.8565,0.733592
3,3,0.0867,1.8565,3.446592


In [232]:
# 제네바 판매량의 분산

Variance_g = (geneve_df['Diff_Squared_g'] * geneve_df['Prob_Sales_g']).sum()
print(f"Variance_g = {Variance_g:.2f}")

Variance_g = 0.87


In [233]:
# 제네바 판매량의 표준편차

Std_g = np.sqrt(Variance_g)
print(f"Std_g = {Std_g:.2f}")

Std_g = 0.93


In [234]:
# s = x + y 의 분산을 이용한 x,y의 공분산 계산
# var(x+y) = var(x) + var(y) + 2cov(x,y)
# cov(x,y) = var(x+y) - var(x) - var(y)

In [235]:
# s 기댓값
E_s = (s_df['s'] * s_df['P(s)']).sum()
E_s.item()

2.6432

In [236]:
# s 편차
s_df['Diff_s'] = s_df['s'] - E_s
s_df

Unnamed: 0,s,P(s),Diff_s
0,0,0.07,-2.6432
1,1,0.17,-1.6432
2,2,0.23,-0.6432
3,3,0.29,0.3568
4,4,0.1267,1.3568
5,5,0.0667,2.3568
6,6,0.0233,3.3568
7,7,0.0233,4.3568
8,8,0.0,5.3568


In [237]:
# s 편차 제곱
s_df['Diff_s_Squared'] = s_df['Diff_s'] ** 2
s_df

Unnamed: 0,s,P(s),Diff_s,Diff_s_Squared
0,0,0.07,-2.6432,6.986506
1,1,0.17,-1.6432,2.700106
2,2,0.23,-0.6432,0.413706
3,3,0.29,0.3568,0.127306
4,4,0.1267,1.3568,1.840906
5,5,0.0667,2.3568,5.554506
6,6,0.0233,3.3568,11.268106
7,7,0.0233,4.3568,18.981706
8,8,0.0,5.3568,28.695306


In [None]:
# s의 분산

Var_s = (s_df['Diff_s_Squared'] * s_df['P(s)']).sum()
print(f"Var_s = {Var_s:.2f}")

Var_s = 2.39


In [239]:
# 사라토가 판매량 x과 제네바 판매량 y의 공분산
Cov_xy = (Var_s - Variance_s - Variance_g) / 2
print(f"Cov_xy = {Cov_xy:.4f}")

Cov_xy = 0.1345


In [240]:
# 상관계수
Rho_xy = Cov_xy / (Std_s * Std_g)
print(f"Rho_xy = {Rho_xy:.4f}")

Rho_xy = 0.1290
