In [None]:
from pathlib import Path

import pandas as pd
from scipy.stats import shapiro, mannwhitneyu

# Root folder of the dataset; expected layout is <base_path>/<region>/<split>/<file>.csv.
base_path = r'your path'
sub_place = ['동부권', '서부권', '수도권', '제주도 및 도서지역']
sub_folder = ['TS_csv', 'VS_csv']

# Region folder name -> suffix letter used in that region's CSV file names.
# Built once here instead of on every loop iteration.
region_suffix = {
    '동부권': 'B',
    '서부권': 'C',
    '수도권': 'A',
    '제주도 및 도서지역': 'D',
}

# Per-table accumulators: one DataFrame per (region, split) pair is appended
# to each list so the tables can be concatenated nationwide afterwards.
travel_merge = []
accomm_con_merge = []
active_con_merge = []
trans_con_merge = []
pre_con_merge = []

for place in sub_place:
    suffix = region_suffix[place]
    for folder in sub_folder:
        # pathlib joins with the platform's separator, so the notebook is not
        # tied to Windows-style backslash paths.
        travel_path = Path(base_path) / place / folder

        travel = pd.read_csv(travel_path / f"tn_travel_여행_{suffix}.csv")
        accomm = pd.read_csv(travel_path / f"tn_lodge_consume_his_숙박소비내역_{suffix}.csv")
        active = pd.read_csv(travel_path / f"tn_activity_consume_his_활동소비내역_{suffix}.csv")
        trans = pd.read_csv(travel_path / f"tn_mvmn_consume_his_이동수단소비내역_{suffix}.csv")
        pre = pd.read_csv(travel_path / f"tn_adv_consume_his_사전소비내역_{suffix}.csv")

        travel_merge.append(travel)
        accomm_con_merge.append(accomm)
        active_con_merge.append(active)
        trans_con_merge.append(trans)
        pre_con_merge.append(pre)

# ==== Combine the per-region frames into nationwide tables ====
travel_merge_all = pd.concat(travel_merge, ignore_index=True)
accomm_con_merge_all = pd.concat(accomm_con_merge, ignore_index=True)
active_con_merge_all = pd.concat(active_con_merge, ignore_index=True)
trans_con_merge_all = pd.concat(trans_con_merge, ignore_index=True)
pre_con_merge_all = pd.concat(pre_con_merge, ignore_index=True)

# Trip length in days = end date minus start date.
travel_merge_all['TRAVEL_END_YMD'] = pd.to_datetime(travel_merge_all['TRAVEL_END_YMD'])
travel_merge_all['TRAVEL_START_YMD'] = pd.to_datetime(travel_merge_all['TRAVEL_START_YMD'])
travel_merge_all['TRAVEL_DAYS'] = (travel_merge_all['TRAVEL_END_YMD'] - travel_merge_all['TRAVEL_START_YMD']).dt.days

# Keep only the columns needed downstream and give each payment column a
# table-specific name so they stay distinguishable after merging.
TravelID = travel_merge_all[['TRAVEL_ID', 'TRAVEL_DAYS']]
accomm_con_sel = accomm_con_merge_all[['TRAVEL_ID', 'LODGING_TYPE_CD', 'PAYMENT_AMT_WON']].rename(columns={'PAYMENT_AMT_WON': '숙박결제금액'})
active_con_sel = active_con_merge_all[['TRAVEL_ID', 'PAYMENT_AMT_WON']].rename(columns={'PAYMENT_AMT_WON': '활동결제금액'})
trans_con_sel = trans_con_merge_all[['TRAVEL_ID', 'PAYMENT_AMT_WON']].rename(columns={'PAYMENT_AMT_WON': '교통결제금액'})
pre_con_sel = pre_con_merge_all[['TRAVEL_ID', 'PAYMENT_AMT_WON']].rename(columns={'PAYMENT_AMT_WON': '사전결제금액'})

# Lodging rows identical in trip, lodging type and amount are collapsed to one
# before summing; the other tables are summed per trip as-is.
accomm_drop = accomm_con_sel.drop_duplicates(subset=['TRAVEL_ID', 'LODGING_TYPE_CD', '숙박결제금액'])
accomm_sum = accomm_drop.groupby(['TRAVEL_ID', 'LODGING_TYPE_CD'], as_index=False)['숙박결제금액'].sum()
active_sum = active_con_sel.groupby(['TRAVEL_ID'], as_index=False)['활동결제금액'].sum()
trans_sum = trans_con_sel.groupby(['TRAVEL_ID'], as_index=False)['교통결제금액'].sum()
pre_con_sum = pre_con_sel.groupby(['TRAVEL_ID'], as_index=False)['사전결제금액'].sum()

# Left-join every spend table onto the trip list so trips without a given
# kind of spend are kept (with NaN in that column).
all_merge = TravelID
for spend_table in (accomm_sum, active_sum, trans_sum, pre_con_sum):
    all_merge = pd.merge(all_merge, spend_table, on='TRAVEL_ID', how='left')

# Total spend = activity + transport + advance payments (NaN counted as 0).
# The lodging amount is not part of this total.
all_merge['전체결제금액'] = all_merge[['활동결제금액', '교통결제금액', '사전결제금액']].sum(axis=1, skipna=True)

# Trips with no lodging record start out as day trips, the rest as lodging trips.
has_lodging = all_merge['LODGING_TYPE_CD'].notna()
day_trip = all_merge[~has_lodging].reset_index(drop=True)
lodgment = all_merge[has_lodging].reset_index(drop=True)

# Trips of 3+ days without a lodging record are moved to the lodging group.
long_trip = day_trip[day_trip['TRAVEL_DAYS'] >= 3]
lodgment = pd.concat([lodgment, long_trip], ignore_index=True)
day_trip = day_trip[day_trip['TRAVEL_DAYS'] < 3]

# Drop trips of 10+ days. The explicit .copy() makes both results independent
# frames, so the column assignments below neither raise SettingWithCopyWarning
# nor depend on view-vs-copy behaviour.
day_trip = day_trip[day_trip['TRAVEL_DAYS'] < 10].copy()
lodgment = lodgment[lodgment['TRAVEL_DAYS'] < 10]

# A trip with several lodging types has one row per type; keep the first.
lodgment = lodgment.drop_duplicates('TRAVEL_ID', keep='first').copy()

# NOTE(review): TRAVEL_DAYS is end minus start, so a trip that starts and ends
# on the same date has 0 days and these divisions give inf (or NaN for 0/0) —
# confirm whether such rows exist and how they should be treated.
day_trip['하루평균결제금액'] = day_trip['전체결제금액'] / day_trip['TRAVEL_DAYS']
lodgment['하루평균결제금액'] = lodgment['전체결제금액'] / lodgment['TRAVEL_DAYS']

display(day_trip)
display(lodgment)

Unnamed: 0,TRAVEL_ID,TRAVEL_DAYS,LODGING_TYPE_CD,숙박결제금액,활동결제금액,교통결제금액,사전결제금액,전체결제금액,하루평균결제금액
0,b_b015715,1,,,38900.0,50200.0,,89100.0,89100.0
1,b_b000419,2,,,183000.0,123000.0,,306000.0,153000.0
2,b_b013638,1,,,120400.0,52800.0,,173200.0,173200.0
3,b_b007836,1,,,118000.0,52900.0,,170900.0,170900.0
4,b_b010123,1,,,132400.0,94400.0,,226800.0,226800.0
...,...,...,...,...,...,...,...,...,...
7157,d_d003501,1,,,54200.0,15500.0,,69700.0,69700.0
7158,d_d002686,1,,,41300.0,10440.0,,51740.0,51740.0
7160,d_d012290,1,,,134500.0,30000.0,,164500.0,164500.0
7161,d_d002993,2,,,118400.0,30000.0,9200.0,157600.0,78800.0


Unnamed: 0,TRAVEL_ID,TRAVEL_DAYS,LODGING_TYPE_CD,숙박결제금액,활동결제금액,교통결제금액,사전결제금액,전체결제금액,하루평균결제금액
0,b_b004718,2,1.0,114950.0,74360.0,38000.0,,112360.0,56180.000000
1,b_b001944,3,1.0,338000.0,353400.0,113000.0,8900.0,475300.0,158433.333333
2,b_b005554,3,3.0,230000.0,78500.0,72360.0,45000.0,195860.0,65286.666667
3,b_b011374,2,11.0,60075.0,116930.0,57500.0,,174430.0,87215.000000
4,b_b004400,2,1.0,278070.0,115100.0,59200.0,,174300.0,87150.000000
...,...,...,...,...,...,...,...,...,...
8918,d_d001048,3,,,351000.0,19000.0,,370000.0,123333.333333
8919,c_c000890,3,,,215300.0,210000.0,112760.0,538060.0,179353.333333
8920,d_d001511,3,,,106910.0,30350.0,,137260.0,45753.333333
8921,d_d006862,3,,,63000.0,112600.0,,175600.0,58533.333333


In [7]:
## 당일치기 여행자들의 전체 결제금액에서 독립변수들은 얼마나 영향을 줄까?

import statsmodels.api as sm
import numpy as np

# Regress the day-trip total spend on its transport, activity and
# advance-payment components.
# NOTE(review): 전체결제금액 was built as the NaN-skipping sum of exactly these
# three columns, so once missing values are set to 0 the fit is an identity;
# the R-squared of 1.000 therefore says nothing about influence — confirm the
# intended target and regressors.
predictor_cols = ['교통결제금액', '활동결제금액', '사전결제금액']

# Turn +/-inf into NaN, then treat every missing amount as 0.
X = day_trip[predictor_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# Select the response on X's rows so both share the same index.
y = day_trip['전체결제금액'].loc[X.index]

# Add the intercept column, fit ordinary least squares and print the report.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                 전체결제금액   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.080e+34
Date:                Tue, 12 Aug 2025   Prob (F-statistic):               0.00
Time:                        01:53:38   Log-Likelihood:             1.4552e+05
No. Observations:                6623   AIC:                        -2.910e+05
Df Residuals:                    6619   BIC:                        -2.910e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -3.638e-12   1.17e-12     -3.112      0.0

In [8]:
import statsmodels.api as sm
import numpy as np

# Regress the lodging-trip total spend on its transport, activity and
# advance-payment components.
# NOTE(review): 전체결제금액 was built as the NaN-skipping sum of exactly these
# three columns, so once missing values are set to 0 the fit is an identity;
# the R-squared of 1.000 therefore says nothing about influence — confirm the
# intended target and regressors.
predictor_cols = ['교통결제금액', '활동결제금액', '사전결제금액']

# Turn +/-inf into NaN, then treat every missing amount as 0.
X = lodgment[predictor_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# Select the response on X's rows so both share the same index.
y = lodgment['전체결제금액'].loc[X.index]

# Add the intercept column, fit ordinary least squares and print the report.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                 전체결제금액   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 8.224e+33
Date:                Tue, 12 Aug 2025   Prob (F-statistic):               0.00
Time:                        01:53:45   Log-Likelihood:             1.5858e+05
No. Observations:                7762   AIC:                        -3.172e+05
Df Residuals:                    7758   BIC:                        -3.171e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -1.892e-10   4.92e-12    -38.441      0.0