In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# ƒê·ªçc d·ªØ li·ªáu t·ª´ file CSV
train_file = "D:/Du An/Data FLow/v1/data/train.csv"
test_file = "D:/Du An/Data FLow/v1/data/test.csv"

df_train = pd.read_csv(train_file)
df_test = pd.read_csv(test_file)

# Chuy·ªÉn ƒë·ªïi c·ªôt Date th√†nh ki·ªÉu datetime
df_train['Date'] = pd.to_datetime(df_train['Date'])
df_test['Date'] = pd.to_datetime(df_test['Date'])

# Th√™m th√¥ng tin ng√†y trong tu·∫ßn
df_train['DayOfWeek'] = df_train['Date'].dt.dayofweek
df_test['DayOfWeek'] = df_test['Date'].dt.dayofweek

# S·∫Øp x·∫øp theo th·ªùi gian
df_train.sort_values(by='Date', inplace=True)
df_test.sort_values(by='Date', inplace=True)

# Ch·ªçn d·ªØ li·ªáu doanh thu v√† s·ªë l∆∞·ª£ng b√°n theo ng√†y
sales_train = df_train[['Date', 'Revenue', 'Units', 'DayOfWeek']].groupby('Date').sum()
sales_test = df_test[['Date', 'Revenue', 'Units', 'DayOfWeek']].groupby('Date').sum()

# Chu·∫©n h√≥a d·ªØ li·ªáu
scaler = MinMaxScaler(feature_range=(0, 1))
sales_train_scaled = scaler.fit_transform(sales_train)
sales_test_scaled = scaler.transform(sales_test)

# T·∫°o t·∫≠p d·ªØ li·ªáu cho LSTM
def create_lstm_dataset(data, time_step=30):
    X, y = [], []
    for i in range(len(data) - time_step):
        X.append(data[i:(i + time_step), :])
        y.append(data[i + time_step, :])
    return np.array(X), np.array(y)

time_step = 30
X_train, y_train = create_lstm_dataset(sales_train_scaled, time_step)
X_test, y_test = create_lstm_dataset(sales_test_scaled, time_step)

# X√¢y d·ª±ng m√¥ h√¨nh Bi-Directional LSTM
model = Sequential([
    Bidirectional(LSTM(128, return_sequences=True, input_shape=(time_step, X_train.shape[2]))),
    Dropout(0.2),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(32, return_sequences=False)),
    Dropout(0.2),
    Dense(y_train.shape[1])
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Th√™m EarlyStopping v√† ReduceLROnPlateau
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5)

# Hu·∫•n luy·ªán m√¥ h√¨nh
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), verbose=1, callbacks=[early_stopping, reduce_lr])

# D·ª± b√°o
y_pred = model.predict(X_test)

# Chuy·ªÉn ƒë·ªïi d·ªØ li·ªáu v·ªÅ d·∫°ng g·ªëc
y_test_inv = scaler.inverse_transform(np.pad(y_test, ((0, 0), (0, 1)), mode='constant'))[:, :-1]
y_pred_inv = scaler.inverse_transform(np.pad(y_pred, ((0, 0), (0, 1)), mode='constant'))[:, :-1]

# V·∫Ω bi·ªÉu ƒë·ªì
dates = sales_test.index[time_step:]
plt.figure(figsize=(12, 6))
plt.plot(dates, y_test_inv[:, 0], label='Th·ª±c t·∫ø (Revenue)')
plt.plot(dates, y_pred_inv[:, 0], label='D·ª± b√°o (Revenue)')
plt.xlabel('Date')
plt.ylabel('Revenue')
plt.legend()
plt.show()

  super().__init__(**kwargs)


Epoch 1/100
[1m116/116[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m50s[0m 172ms/step - loss: 0.0049 - val_loss: 0.0015 - learning_rate: 0.0010
Epoch 2/100
[1m116/116[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m16s[0m 140ms/step - loss: 0.0042 - val_loss: 0.0015 - learning_rate: 0.0010
Epoch 3/100
[1m116/116[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m17s[0m 148ms/step - loss: 0.0042 - val_loss: 0.0015 - learning_rate: 0.0010
Epoch 4/100
[1m116/116[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m17s[0m 144ms/step - loss: 0.0048 - val_loss: 0.0014 - learning_rate: 0.0010
Epoch 5/100
[1m116/116[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m16s[0m 136ms/step - loss: 0.0041 - val_loss: 0.0014 - learning_rate: 0.0010
Epoch 6/100
[1m116/116[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

ValueError: operands could not be broadcast together with shapes (505,4) (3,) (505,4) 

In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# T·∫°o c√°c ƒë·∫∑c tr∆∞ng th·ªùi gian
def create_features(df):
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    return df

df_train = create_features(df_train)
df_test = create_features(df_test)

# Ch·ªçn ƒë·∫∑c tr∆∞ng v√† target
features = ['Year', 'Month', 'Day', 'DayOfWeek', 'Units']
target = 'Revenue'

# Chu·∫©n h√≥a d·ªØ li·ªáu ƒë·∫ßu v√†o
scaler = MinMaxScaler()
df_train[features] = scaler.fit_transform(df_train[features])
df_test[features] = scaler.transform(df_test[features])

# Chia t·∫≠p d·ªØ li·ªáu
X_train, X_val, y_train, y_val = train_test_split(df_train[features], df_train[target], test_size=0.2, random_state=42)

# Kh·ªüi t·∫°o m√¥ h√¨nh Random Forest
rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)

# Hu·∫•n luy·ªán
rf_model.fit(X_train, y_train)

# D·ª± b√°o
y_pred_rf = rf_model.predict(X_val)

# ƒê√°nh gi√° m√¥ h√¨nh
mae = mean_absolute_error(y_val, y_pred_rf)
mse = mean_squared_error(y_val, y_pred_rf)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred_rf)

print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")


In [None]:
# S·∫Øp x·∫øp d·ªØ li·ªáu theo th·ªùi gian
df_train.sort_values(by='Date', inplace=True)
df_test.sort_values(by='Date', inplace=True)

# T·∫°o ƒë·∫∑c tr∆∞ng th·ªùi gian
def create_features(df):
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    return df

df_train = create_features(df_train)
df_test = create_features(df_test)

# T·∫°o lag features (l·∫•y gi√° tr·ªã doanh thu c·ªßa ng√†y tr∆∞·ªõc ƒë√≥ l√†m input)
def create_lag_features(df, lag_days=[1, 7, 14]):
    for lag in lag_days:
        df[f'Revenue_Lag{lag}'] = df['Revenue'].shift(lag)
    return df

df_train = create_lag_features(df_train)
df_test = create_lag_features(df_test)

# Lo·∫°i b·ªè c√°c h√†ng c√≥ gi√° tr·ªã NaN do shift()
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

# Ch·ªçn c√°c c·ªôt ƒë·∫ßu v√†o v√† ƒë·∫ßu ra
features = ['Year', 'Month', 'Day', 'DayOfWeek', 'Units', 'Revenue_Lag1', 'Revenue_Lag7', 'Revenue_Lag14']
target = 'Revenue'

# Chu·∫©n h√≥a d·ªØ li·ªáu
scaler = MinMaxScaler()
df_train[features] = scaler.fit_transform(df_train[features])
df_test[features] = scaler.transform(df_test[features])

# Chia t·∫≠p d·ªØ li·ªáu train v√† validation
X_train, X_val, y_train, y_val = train_test_split(df_train[features], df_train[target], test_size=0.2, random_state=42, shuffle=False)

# Kh·ªüi t·∫°o m√¥ h√¨nh XGBoost
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', 
                             n_estimators=200, 
                             learning_rate=0.05, 
                             max_depth=6, 
                             subsample=0.8, 
                             colsample_bytree=0.8, 
                             random_state=42)

# Hu·∫•n luy·ªán m√¥ h√¨nh
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=True)

# D·ª± b√°o
y_pred_xgb = xgb_model.predict(X_val)

# ƒê√°nh gi√° m√¥ h√¨nh
mae = mean_absolute_error(y_val, y_pred_xgb)
mse = mean_squared_error(y_val, y_pred_xgb)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred_xgb)

print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")

In [None]:
# T·∫°o ƒë·∫∑c tr∆∞ng th·ªùi gian
def create_features(df):
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    return df

df_train = create_features(df_train)
df_test = create_features(df_test)

# T·∫°o lag features (gi√° tr·ªã doanh thu c·ªßa ng√†y tr∆∞·ªõc ƒë√≥ l√†m input)
def create_lag_features(df, lag_days=[1, 7, 14]):
    for lag in lag_days:
        df[f'Revenue_Lag{lag}'] = df['Revenue'].shift(lag)
    return df

df_train = create_lag_features(df_train)
df_test = create_lag_features(df_test)

# Lo·∫°i b·ªè c√°c h√†ng NaN do shift()
df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

# Ch·ªçn c√°c c·ªôt ƒë·∫∑c tr∆∞ng v√† nh√£n
features = ['Year', 'Month', 'Day', 'DayOfWeek', 'Units', 'Revenue_Lag1', 'Revenue_Lag7', 'Revenue_Lag14']
target = 'Revenue'

# Chu·∫©n h√≥a d·ªØ li·ªáu
scaler = MinMaxScaler()
df_train[features] = scaler.fit_transform(df_train[features])
df_test[features] = scaler.transform(df_test[features])

# Chia d·ªØ li·ªáu th√†nh t·∫≠p train v√† validation
X_train, X_val, y_train, y_val = train_test_split(df_train[features], df_train[target], test_size=0.2, random_state=42, shuffle=False)

# Kh·ªüi t·∫°o m√¥ h√¨nh LightGBM
lgb_model = lgb.LGBMRegressor(
    boosting_type='gbdt', 
    objective='regression', 
    n_estimators=500, 
    learning_rate=0.05, 
    max_depth=6, 
    num_leaves=31,
    subsample=0.8, 
    colsample_bytree=0.8,
    random_state=42
)

# Hu·∫•n luy·ªán m√¥ h√¨nh
lgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', early_stopping_rounds=20, verbose=10)

# D·ª± b√°o
y_pred_lgb = lgb_model.predict(X_val)

# ƒê√°nh gi√° m√¥ h√¨nh
mae = mean_absolute_error(y_val, y_pred_lgb)
mse = mean_squared_error(y_val, y_pred_lgb)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred_lgb)

print(f"üìå MAE: {mae}")
print(f"üìå RMSE: {rmse}")
print(f"üìå R-squared: {r2}")

# Tr·ª±c quan h√≥a k·∫øt qu·∫£
import matplotlib.pyplot as plt

dates = df_train['Date'].iloc[-len(y_val):]  # L·∫•y ng√†y t∆∞∆°ng ·ª©ng v·ªõi t·∫≠p validation
plt.figure(figsize=(12, 6))
plt.plot(dates, y_val.values, label='Th·ª±c t·∫ø (Revenue)', color='blue')
plt.plot(dates, y_pred_lgb, label='D·ª± b√°o (Revenue)', color='red')
plt.xlabel('Date')
plt.ylabel('Revenue')
plt.legend()
plt.show()