In [1]:
from google.cloud import bigquery
from google.cloud.bigquery import job
import pandas as pd
import os
from tqdm import tqdm
import pandas as pd
PROJCECT = 'ballosodeuk'
bq = bigquery.Client(project=PROJCECT)
import statsmodels.api as sm 
import numpy as np

In [2]:
query = """
WITH hourly_sales AS (
  SELECT 
    FORMAT_DATETIME('%Y-%m-%d %H:00:00', orderYmdt) as datetime,
    EXTRACT(DAYOFWEEK FROM orderYmdt) as dayofweek,
    EXTRACT(HOUR FROM orderYmdt) as hour,
    SUM(lastMainPayAmt) as total_sales
  FROM `ballosodeuk.airbridge_warehouse.shopby_order`
  WHERE orderYmdt >= DATETIME_SUB(CURRENT_DATETIME(), INTERVAL 90 DAY)
  GROUP BY 1, 2, 3
)
SELECT * FROM hourly_sales
ORDER BY datetime DESC
"""

In [3]:
client = bigquery.Client()
job_config = bigquery.QueryJobConfig()
query_job = client.query(query, job_config=job_config)

In [4]:
df = query_job.to_dataframe()
df




Unnamed: 0,datetime,dayofweek,hour,total_sales
0,2024-11-28 23:00:00,5,23,100101
1,2024-11-28 22:00:00,5,22,198455
2,2024-11-28 21:00:00,5,21,85906
3,2024-11-28 20:00:00,5,20,44460
4,2024-11-28 19:00:00,5,19,231626
...,...,...,...,...
1189,2024-10-01 16:00:00,3,16,0
1190,2024-10-01 15:00:00,3,15,0
1191,2024-10-01 14:00:00,3,14,0
1192,2024-10-01 13:00:00,3,13,1000


In [8]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

class SalesPrediction:
    def __init__(self, df):
        self.df = df
        self.model = None
        
    def prepare_data(self):
        # datetime이 이미 있으므로 변환 과정 생략
        self.df['is_weekend'] = self.df['dayofweek'].isin([1, 7]).astype(int)
        self.df['is_business_hour'] = self.df['hour'].between(9, 18).astype(int)
        
        self.df['sales_ma_7d'] = self.df.groupby(['dayofweek', 'hour'])['total_sales'].transform(
            lambda x: x.rolling(7, min_periods=1).mean()
        )
        
    def train_model(self):
        features = ['dayofweek', 'hour', 'is_weekend', 'is_business_hour', 'sales_ma_7d']
        X = self.df[features]
        y = self.df['total_sales']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        self.model = RandomForestRegressor(n_estimators=100, random_state=42)
        self.model.fit(X_train, y_train)
        
        train_score = self.model.score(X_train, y_train)
        test_score = self.model.score(X_test, y_test)
        print(f"Train R² score: {train_score:.3f}")
        print(f"Test R² score: {test_score:.3f}")
        
    def predict_remaining_day(self, current_hour=17):
        today = datetime.now().date()
        future_hours = []
        
        for hour in range(current_hour, 24):
            future_hours.append({
                'hour': hour,
                'dayofweek': datetime.now().isoweekday(),
                'is_weekend': int(datetime.now().isoweekday() in [1, 7]),
                'is_business_hour': int(9 <= hour <= 18),
                'sales_ma_7d': self.df[
                    (self.df['dayofweek'] == datetime.now().isoweekday()) & 
                    (self.df['hour'] == hour)
                ]['total_sales'].mean()
            })
        
        future_df = pd.DataFrame(future_hours)
        predictions = self.model.predict(future_df[self.model.feature_names_in_])
        
        return pd.DataFrame({
            'hour': future_df['hour'],
            'predicted_sales': predictions
        })

# 바로 예측 실행
predictor = SalesPrediction(df)
predictor.prepare_data()
predictor.train_model()

predictions = predictor.predict_remaining_day(current_hour=17)

total_predicted = predictions['predicted_sales'].sum()
print(f"\n오늘 남은 시간 예상 매출: ₩{total_predicted:,.0f}")

print("\n시간대별 예측:")
for _, row in predictions.iterrows():
    print(f"{row['hour']:02d}:00 - ₩{row['predicted_sales']:,.0f}")

Train R² score: 0.904
Test R² score: 0.215

오늘 남은 시간 예상 매출: ₩798,728

시간대별 예측:


ValueError: Unknown format code 'd' for object of type 'float'

In [10]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

class SalesPrediction:
    def __init__(self, df):
        self.df = df
        self.model = None
        
    def prepare_data(self):
        # 기존 특성
        self.df['is_weekend'] = self.df['dayofweek'].isin([1, 7]).astype(int)
        self.df['is_business_hour'] = self.df['hour'].between(9, 18).astype(int)
        
        # 이동평균 기간 조정 및 추가 특성
        self.df['sales_ma_7d'] = self.df.groupby(['dayofweek', 'hour'])['total_sales'].transform(
            lambda x: x.rolling(7, min_periods=1).mean()
        )
        self.df['sales_ma_14d'] = self.df.groupby(['dayofweek', 'hour'])['total_sales'].transform(
            lambda x: x.rolling(14, min_periods=1).mean()
        )
        
        # 시간대 구분
        self.df['time_period'] = pd.cut(self.df['hour'], 
                                      bins=[-1, 6, 11, 14, 17, 21, 24],
                                      labels=['새벽', '오전', '점심', '오후', '저녁', '밤'])
        
        # 요일 그룹
        self.df['day_group'] = self.df['dayofweek'].map({
            1: '월', 2: '화수목', 3: '화수목', 4: '화수목', 
            5: '금', 6: '주말', 7: '주말'
        })
        
        # One-hot encoding
        self.df = pd.get_dummies(self.df, columns=['time_period', 'day_group'])
        
    def train_model(self):
        # 사용할 특성 선택
        feature_columns = [col for col in self.df.columns if col not in 
                         ['datetime', 'total_sales']]
        
        X = self.df[feature_columns]
        y = self.df['total_sales']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # 모델 파라미터 조정
        self.model = RandomForestRegressor(
            n_estimators=200,
            max_depth=10,
            min_samples_split=5,
            min_samples_leaf=4,
            random_state=42
        )
        self.model.fit(X_train, y_train)
        
        train_score = self.model.score(X_train, y_train)
        test_score = self.model.score(X_test, y_test)
        print(f"Train R² score: {train_score:.3f}")
        print(f"Test R² score: {test_score:.3f}")
        
        # 특성 중요도 출력
        feature_importance = pd.DataFrame({
            'feature': feature_columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\n상위 10개 중요 특성:")
        print(feature_importance.head(10))
    
    def predict_full_day(self, target_date=datetime(2024, 11, 29)):
        future_hours = []
        target_dayofweek = target_date.isoweekday()
        
        for hour in range(24):
            row = {
                'hour': hour,
                'dayofweek': target_dayofweek,
                'is_weekend': int(target_dayofweek in [1, 7]),
                'is_business_hour': int(9 <= hour <= 18)
            }
            
            # 이동평균 추가
            row['sales_ma_7d'] = self.df[
                (self.df['dayofweek'] == target_dayofweek) & 
                (self.df['hour'] == hour)
            ]['total_sales'].mean()
            
            row['sales_ma_14d'] = self.df[
                (self.df['dayofweek'] == target_dayofweek) & 
                (self.df['hour'] == hour)
            ]['total_sales'].mean()
            
            # 시간대 및 요일 그룹 추가
            time_period = pd.cut([hour], bins=[-1, 6, 11, 14, 17, 21, 24],
                               labels=['새벽', '오전', '점심', '오후', '저녕', '밤'])[0]
            day_group = {
                1: '월', 2: '화수목', 3: '화수목', 4: '화수목', 
                5: '금', 6: '주말', 7: '주말'
            }[target_dayofweek]
            
            # One-hot encoding 컬럼 추가
            for period in ['새벽', '오전', '점심', '오후', '저녁', '밤']:
                row[f'time_period_{period}'] = 1 if time_period == period else 0
            
            for group in ['월', '화수목', '금', '주말']:
                row[f'day_group_{group}'] = 1 if day_group == group else 0
            
            future_hours.append(row)
        
        future_df = pd.DataFrame(future_hours)
        # 원본 데이터와 동일한 컬럼 순서로 맞추기
        future_df = future_df[self.model.feature_names_in_]
        
        predictions = self.model.predict(future_df)
        
        return pd.DataFrame({
            'hour': future_df['hour'],
            'predicted_sales': predictions
        })

# 모델 실행
predictor = SalesPrediction(df)
predictor.prepare_data()
predictor.train_model()

# 29일 전체 매출 예측
print("\n[11월 29일 전체 매출 예측]")
full_day_predictions = predictor.predict_full_day()
total_full_day = full_day_predictions['predicted_sales'].sum()
print(f"29일 전체 예상 매출: ₩{total_full_day:,.0f}")

print("\n시간대별 예측:")
for _, row in full_day_predictions.iterrows():
    print(f"{row['hour']:02d}:00 - ₩{row['predicted_sales']:,.0f}")

Train R² score: 0.698
Test R² score: 0.363

상위 10개 중요 특성:
           feature  importance
4      sales_ma_7d    0.556535
5     sales_ma_14d    0.327622
1             hour    0.043490
0        dayofweek    0.022239
15   day_group_화수목    0.010749
10  time_period_저녁    0.006021
8   time_period_점심    0.005656
12     day_group_금    0.004812
2       is_weekend    0.004511
7   time_period_오전    0.004505

[11월 29일 전체 매출 예측]
29일 전체 예상 매출: ₩1,792,767

시간대별 예측:


ValueError: Unknown format code 'd' for object of type 'float'