In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from xgboost import XGBRegressor
from prophet import Prophet
import optuna
from keras_tuner import RandomSearch

In [5]:
train = pd.read_csv(r"D:\Du An\Data FLow\v1\data\train.csv")
test = pd.read_csv(r"D:\Du An\Data FLow\v1\data\test.csv")

In [6]:
def ensure_stationarity(df, column, max_diff=2):
    df_adjusted = df.copy()
    series = df[column].dropna()
    diffs = 0
    p_value = adfuller(series)[1]

    print(f"\nChecking stationarity for {column}:")
    print(f"Initial p-value: {p_value:.4f}")

    while p_value > 0.05 and diffs < max_diff:
        series = series.diff().dropna()
        df_adjusted[column] = df_adjusted[column].diff()
        diffs += 1
        p_value = adfuller(series)[1] if len(series) > 0 else 1.0
        print(f"After diff {diffs}: p-value = {p_value:.4f}")

    if diffs == max_diff and p_value > 0.05:
        print(f"Warning: {column} still non-stationary after {max_diff} differences")
    else:
        print(f"{column} is stationary after {diffs} differences")

    df_adjusted = df_adjusted.dropna().fillna(method='ffill')
    return df_adjusted

In [7]:
def evaluate(y_true, y_pred, model_name):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{model_name} Results:")
    print(f"  R2: {r2:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    return y_pred

In [8]:
def plot_results(y_true, y_pred, model_name):
    plt.figure(figsize=(10, 6))
    plt.plot(y_true, label='Thực tế', color='blue')
    plt.plot(y_pred, label=model_name, linestyle='--', color='orange')
    plt.title(f'Dự báo Doanh thu với {model_name}')
    plt.xlabel('Thời gian')
    plt.ylabel('Doanh thu')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

# --- Model 1: LSTM ---

In [9]:
def run_lstm():
    from sklearn.preprocessing import MinMaxScaler
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Dropout
    from tensorflow.keras.callbacks import EarlyStopping

    print("\n=== LSTM Model ===")
    train_stationary = ensure_stationarity(train, 'Revenue')
    test_stationary = ensure_stationarity(test, 'Revenue')

    scaler = MinMaxScaler()
    train_scaled = scaler.fit_transform(train_stationary[['Revenue']])
    test_scaled = scaler.transform(test_stationary[['Revenue']])

    def create_sequences(data, seq_length=10):
        X, y = [], []
        for i in range(len(data) - seq_length):
            X.append(data[i:i + seq_length])
            y.append(data[i + seq_length, 0])
        return np.array(X), np.array(y)

    seq_length = 10
    X_train, y_train = create_sequences(train_scaled, seq_length)
    X_test, y_test = create_sequences(test_scaled, seq_length)

    model = Sequential([
        LSTM(50, return_sequences=True, input_shape=(seq_length, 1)),
        Dropout(0.2),
        LSTM(50),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, y_train, epochs=20, batch_size=32,
              callbacks=[EarlyStopping(patience=5, restore_best_weights=True)], verbose=1)

    y_pred_scaled = model.predict(X_test, verbose=0)
    y_pred = scaler.inverse_transform(y_pred_scaled)[:, 0]
    y_true = test_stationary['Revenue'][seq_length:]

    y_pred = evaluate(y_true, y_pred, "LSTM")
    plot_results(y_true, y_pred, "LSTM")

In [11]:
run_lstm()


=== LSTM Model ===


MemoryError: Unable to allocate 812. MiB for an array with shape (901677, 118) and data type float64

# --- Model 2: ARIMA ---

In [12]:
def run_arima():
    from statsmodels.tsa.arima.model import ARIMA

    print("\n=== ARIMA Model ===")
    train_stationary = ensure_stationarity(train, 'Revenue')
    test_stationary = ensure_stationarity(test, 'Revenue')

    model = ARIMA(train_stationary['Revenue'], order=(1, 1, 1)).fit()
    y_pred = model.forecast(steps=len(test_stationary))
    y_true = test_stationary['Revenue']

    y_pred = evaluate(y_true, y_pred, "ARIMA")
    plot_results(y_true, y_pred, "ARIMA")

In [13]:
run_arima()


=== ARIMA Model ===


MemoryError: Unable to allocate 812. MiB for an array with shape (901677, 118) and data type float64

# --- Model 3: XGBoost ---

In [None]:
def run_xgboost():
    from xgboost import XGBRegressor

    print("\n=== XGBoost Model ===")
    train_stationary = ensure_stationarity(train, 'Revenue')
    test_stationary = ensure_stationarity(test, 'Revenue')
    train_stationary_units = ensure_stationarity(train, 'Units')
    test_stationary_units = ensure_stationarity(test, 'Units')

    model = XGBRegressor(n_estimators=100, max_depth=3, learning_rate=0.1)
    model.fit(train_stationary_units[['Units']], train_stationary['Revenue'])
    y_pred = model.predict(test_stationary_units[['Units']])
    y_true = test_stationary['Revenue']

    y_pred = evaluate(y_true, y_pred, "XGBoost")
    plot_results(y_true, y_pred, "XGBoost")

In [None]:
run_xgboost()

# --- Model 4: PatchTST (Transformer) ---

In [None]:
def run_patchtst():
    from neuralforecast import NeuralForecast
    from neuralforecast.models import PatchTST

    print("\n=== PatchTST Model ===")
    train_stationary = ensure_stationarity(train, 'Revenue')
    test_stationary = ensure_stationarity(test, 'Revenue')

    train_df = pd.DataFrame({
        'ds': pd.date_range(start='2020-01-01', periods=len(train_stationary), freq='D'),  # Adjust start date
        'y': train_stationary['Revenue'],
        'unique_id': '1'
    })
    test_df = pd.DataFrame({
        'ds': pd.date_range(start=train_df['ds'].iloc[-1] + pd.Timedelta(days=1),
                           periods=len(test_stationary), freq='D'),
        'unique_id': '1'
    })

    nf = NeuralForecast(models=[PatchTST(input_size=10, h=1, max_steps=500)], freq='D')
    nf.fit(df=train_df)
    y_pred = nf.predict(df=test_df)['PatchTST'].values
    y_true = test_stationary['Revenue']

    y_pred = evaluate(y_true, y_pred, "PatchTST")
    plot_results(y_true, y_pred, "PatchTST")

In [None]:
run_patchtst()

# --- Model 5: Prophet ---

In [None]:
def run_prophet():
    from prophet import Prophet

    print("\n=== Prophet Model ===")
    train_stationary = ensure_stationarity(train, 'Revenue')
    test_stationary = ensure_stationarity(test, 'Revenue')

    train_df = pd.DataFrame({
        'ds': pd.date_range(start='2020-01-01', periods=len(train_stationary), freq='D'),  # Adjust start date
        'y': train_stationary['Revenue']
    })

    model = Prophet(yearly_seasonality=True, weekly_seasonality=True)
    model.fit(train_df)
    future = model.make_future_dataframe(periods=len(test_stationary), freq='D')
    forecast = model.predict(future)
    y_pred = forecast['yhat'][-len(test_stationary):].values
    y_true = test_stationary['Revenue']

    y_pred = evaluate(y_true, y_pred, "Prophet")
    plot_results(y_true, y_pred, "Prophet")

In [None]:
run_prophet()