In [1]:
# 필요한 라이브러리 추가
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [2]:
# RMSE results per model: 18 rows each, six RMSE values for every
# seq_length (30, 60, 120), listed one seq_length block per line.
lstm_df = pd.DataFrame({
    'seq_length': [30] * 6 + [60] * 6 + [120] * 6,
    'rmse': [
        2.0999, 2.1046, 2.2558, 2.2894, 2.3682, 2.3132,  # seq_length 30
        2.3642, 2.3405, 2.5351, 2.5086, 2.5760, 2.5783,  # seq_length 60
        2.2858, 2.2855, 2.46, 2.4657, 2.4925, 2.5393,    # seq_length 120
    ],
})
gru_df = pd.DataFrame({
    'seq_length': [30] * 6 + [60] * 6 + [120] * 6,
    'rmse': [
        2.098, 2.1155, 2.2825, 2.2765, 2.2404, 2.2838,   # seq_length 30
        2.3110, 2.2897, 2.4925, 2.5292, 2.6002, 2.5499,  # seq_length 60
        2.2999, 2.2798, 2.4454, 2.4778, 2.5734, 2.4777,  # seq_length 120
    ],
})
transformer_df = pd.DataFrame({
    'seq_length': [30] * 6 + [60] * 6 + [120] * 6,
    'rmse': [
        2.308, 2.309, 2.466, 2.548, 2.580, 2.610,  # seq_length 30
        2.560, 2.556, 2.722, 2.804, 2.838, 2.856,  # seq_length 60
        2.483, 2.500, 2.643, 2.782, 2.786, 2.780,  # seq_length 120
    ],
})

In [3]:
# Preview the first rows of each model's frame; the cell displays them as one tuple.
tuple(frame.head() for frame in (lstm_df, gru_df, transformer_df))

(   seq_length    rmse
 0          30  2.0999
 1          30  2.1046
 2          30  2.2558
 3          30  2.2894
 4          30  2.3682,
    seq_length    rmse
 0          30  2.0980
 1          30  2.1155
 2          30  2.2825
 3          30  2.2765
 4          30  2.2404,
    seq_length   rmse
 0          30  2.308
 1          30  2.309
 2          30  2.466
 3          30  2.548
 4          30  2.580)

In [4]:
# Split every model's frame into one sub-frame per seq_length (30, 60, 120).
# Each generator is consumed immediately by the tuple unpacking on its left.
lstm_30, lstm_60, lstm_120 = (
    lstm_df[lstm_df['seq_length'] == length] for length in (30, 60, 120)
)
gru_30, gru_60, gru_120 = (
    gru_df[gru_df['seq_length'] == length] for length in (30, 60, 120)
)
transformer_30, transformer_60, transformer_120 = (
    transformer_df[transformer_df['seq_length'] == length] for length in (30, 60, 120)
)

In [5]:
## Normality check (Shapiro-Wilk) on the RMSE of every model / seq_length subset
lstm_group = [lstm_30, lstm_60, lstm_120]
gru_group = [gru_30, gru_60, gru_120]
transformer_group = [transformer_30, transformer_60, transformer_120]

# One output line per subset, in the order LSTM, GRU, Transformer and,
# within each model, seq_length 30, 60, 120.
for model_group in (lstm_group, gru_group, transformer_group):
    for subset in model_group:
        shapiro_p = stats.shapiro(subset['rmse']).pvalue
        if shapiro_p > 0.05:
            print("bigger than 0.05")
        else:
            print("no")

bigger than 0.05
bigger than 0.05
bigger than 0.05
no
bigger than 0.05
bigger than 0.05
bigger than 0.05
bigger than 0.05
bigger than 0.05


In [6]:
# Kruskal-Wallis H test: compare the GRU RMSE values across the three
# seq_length subsets (30, 60, 120).
gru_rmse_samples = [subset['rmse'] for subset in (gru_30, gru_60, gru_120)]
print(stats.kruskal(*gru_rmse_samples))

KruskalResult(statistic=10.713450292397667, pvalue=0.004716326123185766)


In [7]:
# Homogeneity-of-variance tests (Levene and Bartlett) across seq_length subsets.
# Original author's notes: equal variance is satisfied; GRU satisfies equal
# variance too, but its normality is questionable.
lstm_rmse = [subset['rmse'] for subset in (lstm_30, lstm_60, lstm_120)]
transformer_rmse = [subset['rmse'] for subset in (transformer_30, transformer_60, transformer_120)]
gru_rmse = [subset['rmse'] for subset in (gru_30, gru_60, gru_120)]

# LSTM and Transformer first: all Levene results, then all Bartlett results.
for variance_test in (stats.levene, stats.bartlett):
    for rmse_samples in (lstm_rmse, transformer_rmse):
        print(variance_test(*rmse_samples))

# GRU is reported last: Levene, then Bartlett.
for variance_test in (stats.levene, stats.bartlett):
    print(variance_test(*gru_rmse))

LeveneResult(statistic=0.01576860243592082, pvalue=0.9843713656177684)
LeveneResult(statistic=0.040653174624792635, pvalue=0.9602674952485132)
BartlettResult(statistic=0.015845045664427022, pvalue=0.9921087776376936)
BartlettResult(statistic=0.022140384385811358, pvalue=0.9889908569019508)
LeveneResult(statistic=0.25877432482979246, pvalue=0.7753737172125356)
BartlettResult(statistic=0.7580708374639175, pvalue=0.6845213673580811)


In [8]:
# One-way ANOVA of RMSE on seq_length for the LSTM and Transformer results.
# Formula pattern: dependent_variable ~ C(independent_variable)
anova_inputs = [
    ("LSTM | RMSE ~ C(Seq_Length)", lstm_df),
    ("Transformer | RMSE ~ C(Seq_Length)", transformer_df),
]
for position, (title, frame) in enumerate(anova_inputs):
    if position > 0:
        print()  # blank line between the two reports
    # lmodel is rebound on each pass, so it ends up holding the Transformer fit
    lmodel = ols('rmse ~ C(seq_length)', data=frame).fit()
    print(title)
    print(anova_lm(lmodel, typ=1))

LSTM | RMSE ~ C(Seq_Length)
                 df    sum_sq   mean_sq         F    PR(>F)
C(seq_length)   2.0  0.195020  0.097510  8.253633  0.003825
Residual       15.0  0.177213  0.011814       NaN       NaN

Transformer | RMSE ~ C(Seq_Length)
                 df    sum_sq   mean_sq         F    PR(>F)
C(seq_length)   2.0  0.208649  0.104324  5.505371  0.016108
Residual       15.0  0.284243  0.018950       NaN       NaN


### 사후검정

In [9]:
# Bring in the Tukey HSD post-hoc test
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# endog = dependent variable (rmse), groups = independent variable (seq_length).
# tukey_result is rebound on each pass and ends up holding the Transformer result.
for frame in (lstm_df, transformer_df):
    tukey_result = pairwise_tukeyhsd(endog=frame['rmse'], groups=frame['seq_length'])
    print(tukey_result)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
    30     60   0.2453 0.0038  0.0823 0.4083   True
    30    120    0.183 0.0271  0.0199  0.346   True
    60    120  -0.0623 0.5923 -0.2253 0.1007  False
---------------------------------------------------
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
    30     60   0.2525 0.0162  0.0461 0.4589   True
    30    120   0.1922 0.0699 -0.0143 0.3986  False
    60    120  -0.0603 0.7329 -0.2668 0.1461  False
---------------------------------------------------


In [10]:
# prompt: write code that runs an ANCOVA on lstm_df and transformer_df
import pingouin as pg

# Covariate values for the 18 rows of lstm_df / transformer_df, in row order:
# inside every seq_length block the batch size goes 1, 1, 4, 4, 8, 8 while the
# model size alternates 64, 128.
BATCH_SIZES = [1, 1, 4, 4, 8, 8] * 3
MODEL_SIZES = [64, 128] * 9
ANCOVA_COLUMNS = ['seq_length', 'batch_size', 'model_size', 'rmse']

# Attach the covariates to the frames already used for the ANOVA instead of
# retyping seq_length / rmse, so both analyses are guaranteed to use the same
# measurements. The column selection keeps the original column order.
lstm_all = lstm_df.assign(batch_size=BATCH_SIZES, model_size=MODEL_SIZES)[ANCOVA_COLUMNS]
transformer_all = transformer_df.assign(batch_size=BATCH_SIZES, model_size=MODEL_SIZES)[ANCOVA_COLUMNS]

# ANCOVA: rmse as dependent variable, seq_length as the between-group factor,
# batch_size and model_size as covariates.
ancova_lstm = pg.ancova(data=lstm_all, dv='rmse',
                        covar=['batch_size', 'model_size'], between='seq_length')
print(ancova_lstm)

print("-------------------------")

ancova_transformer = pg.ancova(data=transformer_all, dv='rmse',
                               covar=['batch_size', 'model_size'], between='seq_length')
print(ancova_transformer)

       Source        SS  DF          F         p-unc       np2
0  seq_length  0.195020   2  50.036701  7.830488e-07  0.885030
1  batch_size  0.151870   1  77.931396  7.488557e-07  0.857035
2  model_size  0.000009   1   0.004383  9.482202e-01  0.000337
3    Residual  0.025334  13        NaN           NaN       NaN
-------------------------
       Source        SS  DF          F         p-unc       np2
0  seq_length  0.208649   2  35.183521  5.677645e-06  0.844063
1  batch_size  0.238537   1  80.446713  6.259555e-07  0.860883
2  model_size  0.007160   1   2.414737  1.441973e-01  0.156651
3    Residual  0.038547  13        NaN           NaN       NaN


In [11]:
# RMSE pairs per model: 18 'stock' values next to 18 'total' values.
# NOTE(review): in lstm_t and transformer_t the 'total' column repeats the rmse
# values of lstm_df / transformer_df, but in gru_t it is the 'stock' column that
# repeats gru_df's rmse -- the GRU columns may be swapped; confirm against the
# experiment logs.
lstm_t = pd.DataFrame({
    'stock': [
        2.080, 2.097, 2.265, 2.253, 2.346, 2.33,
        2.298, 2.355, 2.519, 2.545, 2.565, 2.624,
        2.27, 2.336, 2.491, 2.526, 2.540, 2.519,
    ],
    'total': [
        2.0999, 2.1046, 2.2558, 2.2894, 2.3682, 2.3132,
        2.3642, 2.3405, 2.5351, 2.5086, 2.5760, 2.5783,
        2.2858, 2.2855, 2.46, 2.4657, 2.4925, 2.5393,
    ],
})
# NOTE(review): gru_t 'total' holds 2.629 next to 'stock' 2.2999 (13th row), a
# far larger gap than in any other row -- possibly a typo; verify.
gru_t = pd.DataFrame({
    'stock': [
        2.098, 2.1155, 2.2825, 2.2765, 2.2404, 2.2838,
        2.3110, 2.2897, 2.4925, 2.5292, 2.6002, 2.5499,
        2.2999, 2.2798, 2.4454, 2.4778, 2.5734, 2.4777,
    ],
    'total': [
        2.1143, 2.0791, 2.2759, 2.2461, 2.3286, 2.3328,
        2.3475, 2.3452, 2.5378, 2.546, 2.608, 2.5757,
        2.629, 2.252, 2.4715, 2.4587, 2.5297, 2.4882,
    ],
})
transformer_t = pd.DataFrame({
    'stock': [
        2.322, 2.352, 2.522, 2.580, 2.557, 2.608,
        2.567, 2.562, 2.745, 2.829, 2.843, 2.847,
        2.479, 2.486, 2.651, 2.738, 2.789, 2.763,
    ],
    'total': [
        2.308, 2.309, 2.466, 2.548, 2.580, 2.610,
        2.560, 2.556, 2.722, 2.804, 2.838, 2.856,
        2.483, 2.500, 2.643, 2.782, 2.786, 2.780,
    ],
})

In [12]:
# Independent two-sample t-test of 'stock' vs 'total' RMSE for each model.
# `stats` is scipy.stats, imported in the notebook's first cell.
# NOTE(review): the two columns look like row-aligned measurements; if each row
# is the same configuration, a paired test (stats.ttest_rel) may be the better
# fit -- confirm before relying on these p-values.
results = []
for model_name, frame in (("LSTM", lstm_t), ("GRU", gru_t), ("Transformer", transformer_t)):
    t_stat, p_value = stats.ttest_ind(frame['stock'].values, frame['total'].values)
    results.append((model_name, t_stat, p_value))

# Print a left-aligned table: model name, t-statistic, p-value
print(f"{'Model':<15} {'t-statistic':<25} {'p-value':<25}")
for model_name, t_stat, p_value in results:
    print(f"{model_name:<15} {t_stat:<25} {p_value:<25}")

Model           t-statistic               p-value                  
LSTM            0.1037865841026376        0.9179485128034262       
GRU             -0.5691924012118816       0.5729669381028167       
Transformer     0.10943502234467681       0.9135008203825664       


Kruskal-Wallis 검정 : 범주형 독립변수(그룹)에 따라 연속형 종속변수의 분포(중앙값)에 차이가 있는지 확인하는 비모수 검정

In [16]:
# Kruskal-Wallis H test per model: RMSE compared across seq_length 30 / 60 / 120.
# Output order: LSTM, GRU, Transformer.
subsets_by_model = (
    (lstm_30, lstm_60, lstm_120),
    (gru_30, gru_60, gru_120),
    (transformer_30, transformer_60, transformer_120),
)
for model_subsets in subsets_by_model:
    print(stats.kruskal(*(subset['rmse'] for subset in model_subsets)))

KruskalResult(statistic=8.046783625730995, pvalue=0.017892174994385477)
KruskalResult(statistic=10.713450292397667, pvalue=0.004716326123185766)
KruskalResult(statistic=6.67836257309942, pvalue=0.03546598231531404)


In [18]:
# Kruskal-Wallis H test of 'stock' vs 'total' RMSE for each model.
# Output order: LSTM, GRU, Transformer.
for paired_frame in (lstm_t, gru_t, transformer_t):
    print(stats.kruskal(paired_frame['stock'], paired_frame['total']))

KruskalResult(statistic=0.004004519386177176, pvalue=0.9495425286473611)
KruskalResult(statistic=0.44144144144144093, pvalue=0.5064275522686174)
KruskalResult(statistic=0.04229773601637117, pvalue=0.8370532765183694)
