## 1. Setup dan Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# Plot defaults for the whole notebook: seaborn's "whitegrid" theme and a
# wide 14x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

print("Library berhasil diimport")

## 2. Load Data yang Sudah Diproses

In [None]:
# Load the processed daily-trends table.
# Replace this with the path to your own processed file.
data_path = '../data/processed/daily_trends_processed_latest.csv'

# Read the CSV, parse the date column, and order rows chronologically within
# each (keyword, category) series. The per-group shift/rolling features built
# later in the notebook operate on row order, so this ordering matters.
df = (
    pd.read_csv(data_path)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['keyword', 'category', 'date'])
)

# Quick sanity summary of what was loaded.
print(f"Shape data: {df.shape}")
print(f"Rentang tanggal: {df['date'].min()} sampai {df['date'].max()}")
print(f"Jumlah keywords: {df['keyword'].nunique()}")
print(f"Jumlah categories: {df['category'].nunique()}")

df.head()

## 3. Feature Engineering untuk Prediksi 7 Hari

In [None]:
def create_features_7day(df):
    """
    Build calendar, lag, rolling and trend features plus the 7-day-ahead target.

    The input frame is not modified. Rows are ordered by
    (keyword, category, date) before any per-series computation, and the
    result is returned in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``date`` (datetime-like, usable with ``.dt``),
        ``keyword``, ``category`` and ``interest_value``.

    Returns
    -------
    pandas.DataFrame
        The input columns plus:
        ``day``, ``month``, ``year``, ``dayofweek``, ``quarter``,
        ``weekofyear``, ``is_weekend``; ``lag_{1,2,3,7,14,21,30}``;
        ``rolling_{mean,std,max,min}_{7,14,30}``; ``diff_1``, ``diff_7``;
        and ``target_7d``.

    Notes
    -----
    Lags, rolling windows and the target are shifts by *rows* within each
    (keyword, category) series, not by calendar days. They equal "N days
    ago / ahead" only if every series has exactly one row per day with no
    gaps -- the function does not check this.
    """
    group_keys = ['keyword', 'category']

    # shift/rolling/diff are positional, so guarantee chronological order
    # inside each series instead of relying on the caller having sorted.
    # sort_values returns a new frame, leaving the caller's df untouched.
    df = df.sort_values(group_keys + ['date'])

    # --- Calendar features -------------------------------------------------
    df['day'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    df['dayofweek'] = df['date'].dt.dayofweek
    df['quarter'] = df['date'].dt.quarter
    # isocalendar() returns the pandas nullable UInt32 dtype, unlike the plain
    # numpy dtypes of the other .dt accessors. Cast it so every feature column
    # has a numpy dtype (some LightGBM releases reject nullable extension
    # dtypes). Fall back to float only when missing dates produce <NA>.
    week = df['date'].dt.isocalendar().week
    df['weekofyear'] = week.astype('float64' if week.isna().any() else 'int64')
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

    # One grouped view of the signal, reused by every per-series feature.
    interest = df.groupby(group_keys)['interest_value']

    # --- Lag features (up to 30 rows back) ---------------------------------
    for lag in (1, 2, 3, 7, 14, 21, 30):
        df[f'lag_{lag}'] = interest.shift(lag)

    # --- Rolling statistics ------------------------------------------------
    # Windows include the current row. min_periods=1 yields a value from the
    # first row on, except std, which is NaN for a single observation.
    for window in (7, 14, 30):
        for stat in ('mean', 'std', 'max', 'min'):
            df[f'rolling_{stat}_{window}'] = interest.transform(
                lambda s, w=window, fn=stat: getattr(
                    s.rolling(window=w, min_periods=1), fn
                )()
            )

    # --- Trend features ----------------------------------------------------
    df['diff_1'] = interest.diff(1)
    df['diff_7'] = interest.diff(7)

    # --- Target: interest_value 7 rows ahead in the same series -------------
    # The last 7 rows of every series therefore have a NaN target.
    df['target_7d'] = interest.shift(-7)

    return df

# Build the feature table and keep only fully populated rows.
# dropna() removes every row with a NaN in ANY column, which includes the
# warm-up rows at the start of each series (missing lags/diffs) and the last
# 7 rows of each series (missing target_7d).
df = create_features_7day(df).dropna()

print(f"Features berhasil dibuat. Shape baru: {df.shape}")
# Note: this prints the total column count of df, not only model features.
print(f"Jumlah kolom features: {df.shape[1]}")

## 4. Persiapan Train/Test Split

In [None]:
# Numeric model inputs.
# 'dayofweek', 'month' and 'quarter' are deliberately NOT listed here: they
# live in categorical_features, and the model input is always built as
# feature_cols + categorical_features. Listing them in both places selects
# the same column label twice, producing duplicate columns / feature names
# in X_train and X_test.
feature_cols = [
    'day', 'year', 'weekofyear', 'is_weekend',
    'lag_1', 'lag_2', 'lag_3', 'lag_7', 'lag_14', 'lag_21', 'lag_30',
    'rolling_mean_7', 'rolling_mean_14', 'rolling_mean_30',
    'rolling_std_7', 'rolling_std_14', 'rolling_std_30',
    'rolling_max_7', 'rolling_max_14', 'rolling_max_30',
    'rolling_min_7', 'rolling_min_14', 'rolling_min_30',
    'diff_1', 'diff_7'
]

# Columns handed to LightGBM as categorical features.
categorical_features = ['keyword', 'category', 'dayofweek', 'month', 'quarter']

# Convert on the full frame (before splitting) so train and test share the
# same category sets.
for col in categorical_features:
    if col in df.columns:
        df[col] = df[col].astype('category')

# Chronological split: rows dated before the 80th percentile of 'date' go to
# train, the rest to test.
# NOTE(review): target_7d looks 7 rows ahead, so the labels of the last
# training days fall inside the test period. Consider leaving a 7-day gap
# between the two splits if strict out-of-time evaluation is required.
split_date = df['date'].quantile(0.8)

train_df = df[df['date'] < split_date]
test_df = df[df['date'] >= split_date]

# Final model input columns (no duplicates, see feature_cols above).
model_input_cols = feature_cols + categorical_features

X_train = train_df[model_input_cols]
y_train = train_df['target_7d']

X_test = test_df[model_input_cols]
y_test = test_df['target_7d']

print(f"Train set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Periode train: {train_df['date'].min()} sampai {train_df['date'].max()}")
print(f"Periode test: {test_df['date'].min()} sampai {test_df['date'].max()}")

## 5. Training Model LightGBM

In [None]:
# LightGBM hyper-parameters: gradient-boosted trees, regression objective,
# RMSE as the evaluation metric, fixed seed for repeatable runs.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    random_state=42,
)

# Wrap both splits as LightGBM datasets; the test set references the train
# set and both declare the same categorical columns.
shared_dataset_kwargs = {'categorical_feature': categorical_features}
train_data = lgb.Dataset(X_train, label=y_train, **shared_dataset_kwargs)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data, **shared_dataset_kwargs)

# Up to 1000 boosting rounds, stopping once the validation metric has not
# improved for 50 rounds; progress is logged every 100 rounds.
# NOTE(review): the test split is also the validation set used for early
# stopping, so the test scores reported afterwards are likely optimistic.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(period=100),
]

print("Training model LightGBM untuk prediksi 7 hari...")
model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'test'],
    callbacks=training_callbacks,
)

print("\nTraining selesai")
print(f"Best iteration: {model.best_iteration}")
print(f"Best RMSE: {model.best_score['test']['rmse']:.4f}")

## 6. Evaluasi Model

In [None]:
# Predict on both splits using the iteration selected by early stopping.
best_iter = model.best_iteration
y_pred_train = model.predict(X_train, num_iteration=best_iter)
y_pred_test = model.predict(X_test, num_iteration=best_iter)

# Compute and print regression metrics for one data split.
def calculate_metrics(y_true, y_pred, dataset_name):
    """
    Compute MAE, RMSE, R2 and MAPE, print them, and return them as a dict.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.
    dataset_name : str
        Label used in the printed header (e.g. "Train", "Test").

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'RMSE'``, ``'R2'``, ``'MAPE'``. MAPE is a
        percentage computed over non-zero actuals only; it is NaN when
        every actual value is zero.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    # Percentage error is undefined where the actual value is 0. Dividing by
    # (y_true + 1e-10) instead lets a single zero actual inflate the mean by
    # ~1e10, so those rows are excluded from MAPE.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    nonzero = actual != 0
    if nonzero.any():
        mape = float(
            np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
        )
    else:
        mape = float('nan')

    print(f"\nMetrik {dataset_name}:")
    print(f"  MAE:  {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R2:   {r2:.4f}")
    print(f"  MAPE: {mape:.2f}%")

    return {'MAE': mae, 'RMSE': rmse, 'R2': r2, 'MAPE': mape}

# Metrics for the training split first, then the held-out test split.
train_metrics = calculate_metrics(y_true=y_train, y_pred=y_pred_train, dataset_name="Train")
test_metrics = calculate_metrics(y_true=y_test, y_pred=y_pred_test, dataset_name="Test")

## 7. Feature Importance

In [None]:
# Rank model features by gain-based importance, highest first.
importance_df = pd.DataFrame({
    'feature': model.feature_name(),
    'importance': model.feature_importance(importance_type='gain'),
})
importance_df = importance_df.sort_values('importance', ascending=False)

# Horizontal bar chart of the 20 highest-ranked features.
top_features = importance_df.head(20)
plt.figure(figsize=(10, 8))
sns.barplot(data=top_features, x='importance', y='feature')
plt.title('Top 20 Feature Importance - Prediksi 7 Hari', fontsize=14, fontweight='bold')
plt.xlabel('Importance (Gain)')
plt.tight_layout()
plt.show()

# Same ranking as text, limited to the first 10 rows.
print("\nTop 10 Features Paling Penting:")
print(importance_df.head(10).to_string(index=False))

## 8. Visualisasi Prediksi

In [None]:
# Actual vs. predicted scatter plots, one panel per split, side by side.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# (axis, actual values, predictions, extra scatter styling, panel title)
panels = [
    (axes[0], y_train, y_pred_train, {},
     f'Train Set (R2 = {train_metrics["R2"]:.4f})'),
    (axes[1], y_test, y_pred_test, {'color': 'orange'},
     f'Test Set (R2 = {test_metrics["R2"]:.4f})'),
]

for ax, actual, predicted, point_style, panel_title in panels:
    ax.scatter(actual, predicted, alpha=0.3, s=10, **point_style)
    # Dashed red diagonal from (0, 0) to (100, 100): the perfect-prediction line.
    ax.plot([0, 100], [0, 100], 'r--', lw=2)
    ax.set_xlabel('Actual Interest Value')
    ax.set_ylabel('Predicted Interest Value (7 hari)')
    ax.set_title(panel_title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Time-series view for one example series: the (keyword, category) pair of
# the first row in the test split.
sample_keyword = test_df['keyword'].iloc[0]
sample_category = test_df['category'].iloc[0]

# First 30 test rows of that series, copied so a column can be added safely.
in_sample_series = (
    (test_df['keyword'] == sample_keyword)
    & (test_df['category'] == sample_category)
)
sample_data = test_df[in_sample_series].copy().head(30)

model_inputs = sample_data[feature_cols + categorical_features]
sample_data['predicted_7d'] = model.predict(
    model_inputs,
    num_iteration=model.best_iteration,
)

# Both curves are drawn against the feature date; each y value refers to the
# interest 7 rows later (target_7d and its prediction).
plt.figure(figsize=(14, 6))
curves = [
    ('target_7d', 'Actual (7 hari ke depan)', 'o', {}),
    ('predicted_7d', 'Predicted (7 hari ke depan)', 's', {'alpha': 0.7}),
]
for column, curve_label, marker_shape, extra_style in curves:
    plt.plot(sample_data['date'], sample_data[column], label=curve_label,
             marker=marker_shape, linewidth=2, **extra_style)
plt.title(f'Prediksi 7 Hari: "{sample_keyword}" ({sample_category})', fontsize=14, fontweight='bold')
plt.xlabel('Tanggal')
plt.ylabel('Interest Value')
plt.legend()
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 9. Identifikasi Hari Terbaik untuk Posting

In [None]:
# Helper that turns the model's forecast into a posting recommendation.
def get_best_days(df, model, feature_cols, categorical_features, keyword, category, n_days=7):
    """
    Predict interest for one (keyword, category) series and label the result.

    The most recent row (by ``date``) of the requested series is fed to the
    model, and the single predicted value is mapped to a recommendation.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing ``keyword``, ``category``, ``date`` and all
        columns named in ``feature_cols`` and ``categorical_features``.
    model : object
        Fitted model exposing ``predict(X, num_iteration=...)`` and
        ``best_iteration``.
    feature_cols, categorical_features : list of str
        Column names; the model input is ``feature_cols + categorical_features``.
    keyword, category : hashable
        Identify the series to predict for.
    n_days : int, default 7
        Forecast horizon in days, used only to compute ``predicted_date``.
        It does not change what the model predicts, so it should match the
        horizon the model was trained for.

    Returns
    -------
    dict or None
        ``None`` if the series has no rows; otherwise a dict with keys
        ``keyword``, ``category``, ``current_date``, ``predicted_date``,
        ``day_name``, ``predicted_interest`` and ``recommendation``
        (``'POSTING'`` if the prediction is > 60, ``'HINDARI'`` if < 40,
        otherwise ``'NETRAL'``).
    """
    series_rows = df[(df['keyword'] == keyword) & (df['category'] == category)]
    if series_rows.empty:
        return None

    # Select the latest observation by date rather than by row position, so
    # the result does not depend on how the caller ordered df. The stable sort
    # keeps the original order among equal dates; NaT dates sort first so they
    # are never picked as "latest".
    latest_data = series_rows.sort_values(
        'date', kind='stable', na_position='first'
    ).tail(1)
    current_date = latest_data['date'].iloc[0]

    predicted_value = model.predict(
        latest_data[feature_cols + categorical_features],
        num_iteration=model.best_iteration
    )[0]

    # Calendar date the forecast refers to.
    future_date = current_date + pd.Timedelta(days=n_days)
    day_name = future_date.strftime('%A, %d %B %Y')

    if predicted_value > 60:
        recommendation = 'POSTING'
    elif predicted_value < 40:
        recommendation = 'HINDARI'
    else:
        recommendation = 'NETRAL'

    return {
        'keyword': keyword,
        'category': category,
        'current_date': current_date,
        'predicted_date': future_date,
        'day_name': day_name,
        'predicted_interest': predicted_value,
        'recommendation': recommendation
    }

# Try the helper on the first five distinct (keyword, category) pairs of the
# test split.
# NOTE(review): test_df only holds rows that survived df.dropna(), i.e. rows
# with a known target_7d, so the last 7 rows of each series are absent. The
# "latest" row used here is therefore not the newest observation in the raw
# data -- confirm whether inference should run on rows without a target.
sample_keywords = test_df[['keyword', 'category']].drop_duplicates().head(5)

print("\nRekomendasi Waktu Posting (7 Hari ke Depan):")
print("="*80)

recommendations = []
for pair in sample_keywords.itertuples(index=False):
    rec = get_best_days(test_df, model, feature_cols, categorical_features,
                        pair.keyword, pair.category)
    if not rec:
        # No rows for this pair: nothing to report.
        continue
    recommendations.append(rec)
    print(f"\nKeyword: {rec['keyword']} | Category: {rec['category']}")
    print(f"  Tanggal Prediksi: {rec['day_name']}")
    print(f"  Predicted Interest: {rec['predicted_interest']:.2f}")
    print(f"  Rekomendasi: {rec['recommendation']}")

# Collect the results as a table (displayed as the cell output).
rec_df = pd.DataFrame(recommendations)
rec_df

## 10. Simpan Model

In [None]:
# Persist the trained model and its evaluation summary.
import json
from pathlib import Path

model_path = '../models/lightgbm_daily_7days.txt'
metrics_path = '../models/lightgbm_daily_7days_metrics.json'

# Create the output folder(s) up front so saving does not fail on a fresh
# checkout where '../models' does not exist yet.
for output_path in (model_path, metrics_path):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Save the model in LightGBM's text format.
model.save_model(model_path)

# Evaluation summary: split metrics plus the first 20 rows of the
# importance table.
metrics = {
    'model': 'LightGBM',
    'forecast_horizon': '7 days',
    'train': train_metrics,
    'test': test_metrics,
    'feature_importance': importance_df.head(20).to_dict('records')
}

with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)

print(f"Model disimpan ke: {model_path}")
print(f"Metrics disimpan ke: {metrics_path}")

## 11. Ringkasan

### Performa Model:
- Algoritma: LightGBM
- Forecast Horizon: 7 hari ke depan
- Test RMSE: Lihat output di atas
- Test R2: Lihat output di atas
- Waktu Training: Cepat (biasanya < 1 menit)

### Output Model:
- Prediksi interest value 7 hari ke depan
- Rekomendasi: POSTING (prediksi > 60) / HINDARI (prediksi < 40) / NETRAL (prediksi 40–60)
- Untuk ensemble dengan model lain (LSTM, Neural Prophet)

### Next Steps:
1. Bandingkan dengan LSTM dan Neural Prophet (forecast horizon sama: 7 hari)
2. Ensemble 3 model untuk prediksi lebih robust
3. Kombinasikan dengan hourly model untuk rekomendasi jam posting
4. Deploy untuk production