In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
import tensorflow.keras.backend as K

# Load and preprocess data
data = pd.read_csv(r"file_transposed_updated1 (2).csv")
revenue_columns = ['Tổng', 'Thương nghiệp', 'Khách sạn nhà hàng', 'Du lịch', 'Dịch vụ']

# Scale data
scaler = MinMaxScaler()
data[revenue_columns] = scaler.fit_transform(data[revenue_columns])

# Create LSTM dataset with look-back
look_back = 12
def create_lstm_dataset(data, look_back=12):
    X = []
    y = []
    for i in range(len(data) - look_back):
        X.append(data[i:(i + look_back)][revenue_columns].values)
        y.append(data.iloc[i + look_back][revenue_columns[0]])  # Use 'Tổng' as the target
    return np.array(X), np.array(y)

X, y = create_lstm_dataset(data, look_back=look_back)

# Define the Autoencoder model
timesteps = X.shape[1]
features = X.shape[2]
num_clusters = 3  # Number of clusters

input_layer = Input(shape=(timesteps, features))
encoded = LSTM(256, activation='relu', return_sequences=True)(input_layer)
encoded = LSTM(128, activation='relu', return_sequences=True)(encoded)
latent_space = LSTM(64, activation='relu', return_sequences=False)(encoded)

decoded = RepeatVector(timesteps)(latent_space)
decoded = LSTM(128, activation='relu', return_sequences=True)(decoded)
decoded = LSTM(256, activation='relu', return_sequences=True)(decoded)
output_layer = TimeDistributed(Dense(features))(decoded)

# Compile the Autoencoder model
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(optimizer='adam', loss='mse')

# Pre-train the Autoencoder
autoencoder.fit(X, X, epochs=50, batch_size=16, verbose=1)

# Extract latent space for clustering
encoder = Model(inputs=input_layer, outputs=latent_space)
latent_space_output = encoder.predict(X)

# Apply KMeans clustering in the latent space
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
cluster_labels = kmeans.fit_predict(latent_space_output)

# Prepare data for prediction
X_clustered = np.hstack([X[:, -1, :], cluster_labels.reshape(-1, 1)])  # Add cluster labels to the last timestep
X_clustered = pd.DataFrame(X_clustered, columns=revenue_columns + ['Cluster'])

# Prepare features (X) and target (y) for Random Forest prediction
X_rf = X_clustered.drop(columns=['Cluster']).values  # Features
y_rf = y  # Target


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_rf, y_rf, test_size=0.2, random_state=0)

# Define the Random Forest model for prediction
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train.reshape(X_train.shape[0], -1), y_train)  # Flatten for Random Forest

# Make predictions
y_pred_rf = model.predict(X_test.reshape(X_test.shape[0], -1))  # Flatten for predictions

# Calculate RMSE for the prediction model
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("RMSE for Random Forest with KMeans:", rmse_rf) 

Epoch 1/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 92ms/step - loss: 0.2500
Epoch 2/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step - loss: 0.0974
Epoch 3/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - loss: 0.0740
Epoch 4/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - loss: 0.0561
Epoch 5/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - loss: 0.0395
Epoch 6/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step - loss: 0.0303
Epoch 7/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 94ms/step - loss: 0.0252
Epoch 8/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 91ms/step - loss: 0.0227
Epoch 9/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 99ms/step - loss: 0.0154
Epoch 10/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 95ms/step - loss: 0.012



RMSE for Random Forest with KMeans: 0.04738743341596637


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K

# Load and preprocess data
data = pd.read_csv(r"file_transposed_updated1 (2).csv")
revenue_columns = ['Tổng', 'Thương nghiệp', 'Khách sạn nhà hàng', 'Du lịch', 'Dịch vụ']

# Scale data
scaler = MinMaxScaler()
data[revenue_columns] = scaler.fit_transform(data[revenue_columns])

# Create LSTM dataset with look-back
look_back = 12
def create_lstm_dataset(data, look_back=12):
    X = []
    y = []
    for i in range(len(data) - look_back):
        X.append(data[i:(i + look_back)][revenue_columns].values)
        y.append(data.iloc[i + look_back][revenue_columns[0]])  # Use 'Tổng' as the target
    return np.array(X), np.array(y)

X, y = create_lstm_dataset(data, look_back=look_back)

# Define the Autoencoder model
timesteps = X.shape[1]
features = X.shape[2]
num_clusters = 3  # Number of clusters

input_layer = Input(shape=(timesteps, features))
encoded = LSTM(256, activation='relu', return_sequences=True)(input_layer)
encoded = LSTM(128, activation='relu', return_sequences=True)(encoded)
latent_space = LSTM(64, activation='relu', return_sequences=False)(encoded)

decoded = RepeatVector(timesteps)(latent_space)
decoded = LSTM(128, activation='relu', return_sequences=True)(decoded)
decoded = LSTM(256, activation='relu', return_sequences=True)(decoded)
output_layer = TimeDistributed(Dense(features))(decoded)

# Define the Clustering Layer for DTC
class ClusteringLayer(tf.keras.layers.Layer):
    def __init__(self, num_clusters, **kwargs):
        super(ClusteringLayer, self).__init__(**kwargs)
        self.num_clusters = num_clusters

    def build(self, input_shape):
        self.clusters = self.add_weight(shape=(self.num_clusters, input_shape[-1]),
                                        initializer='he_normal',
                                        trainable=True)

    def call(self, inputs):
        q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2)))
        q = q ** ((1.0 + 1.0) / 2.0)
        q = K.transpose(K.transpose(q) / K.sum(q, axis=1))
        return q

# Instantiate and connect clustering layer
clustering_layer = ClusteringLayer(num_clusters, name='clustering')(latent_space)

# Compile the Autoencoder model with clustering
autoencoder = Model(inputs=input_layer, outputs=[output_layer, clustering_layer])
autoencoder.compile(optimizer='adam', loss=['mse', 'kld'], loss_weights=[1.0, 0.5])

# Pre-train the Autoencoder without clustering
autoencoder_pretrain = Model(inputs=input_layer, outputs=output_layer)
autoencoder_pretrain.compile(optimizer='adam', loss='mse')
autoencoder_pretrain.fit(X, X, epochs=50, batch_size=16, verbose=1)

# Fine-tune the model with clustering
autoencoder.fit(X, [X, np.zeros((X.shape[0], num_clusters))], epochs=100, batch_size=16, verbose=1)

# Extract latent space for clustering
encoder = Model(inputs=input_layer, outputs=latent_space)
latent_space_output = encoder.predict(X)

# Get clustering output from the model
q_values = autoencoder.predict(X)[1]  # Use the second output for clustering
cluster_labels = np.argmax(q_values, axis=1)

# Prepare data for prediction
X_clustered = np.hstack([X[:, -1, :], cluster_labels.reshape(-1, 1)])  # Add cluster labels to the last timestep
X_clustered = pd.DataFrame(X_clustered, columns=revenue_columns + ['Cluster'])

# Prepare features (X) and target (y) for LSTM prediction
X_lstm = X_clustered.drop(columns=['Cluster']).values  # Features
y_lstm = y  # Target

# Ensure the number of samples is divisible by look_back
num_samples = X_lstm.shape[0]
if num_samples % look_back != 0:
    X_lstm = X_lstm[:num_samples - (num_samples % look_back)]  # Truncate to make it divisible

# Reshape X for LSTM
X_lstm = X_lstm.reshape((-1, look_back, features))  # Reshape to (num_samples // look_back, look_back, features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_lstm, y_lstm[:X_lstm.shape[0]], test_size=0.2, random_state=0)

# Define the Random Forest model for prediction
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train.reshape(X_train.shape[0], -1), y_train)  # Flatten for Random Forest

# Make predictions
y_pred_rf = model.predict(X_test.reshape(X_test.shape[0], -1))  # Flatten for predictions

# Calculate RMSE for the prediction model
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("RMSE for Random Forest with DTC:", rmse_rf)




Epoch 1/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 49ms/step - loss: 0.2378
Epoch 2/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 0.0919
Epoch 3/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 0.0744
Epoch 4/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 0.0532
Epoch 5/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 0.0383
Epoch 6/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 0.0241
Epoch 7/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 0.0216
Epoch 8/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 0.0164
Epoch 9/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 0.0132
Epoch 10/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.01

ssss


In [None]:
import matplotlib.pyplot as plt

# Define a range of num_clusters to test (starting from 2, since 0 or 1 cluster doesn't make sense in clustering context)
num_clusters_list = list(range(2, 21))  # Example range from 2 to 20
rmse_dtc_list = []

for num_clusters in num_clusters_list:
    # Define and compile the clustering layer for the current num_clusters
    clustering_layer = ClusteringLayer(num_clusters, name='clustering')(latent_space)
    autoencoder = Model(inputs=input_layer, outputs=[output_layer, clustering_layer])
    autoencoder.compile(optimizer='adam', loss=['mse', 'kld'], loss_weights=[1.0, 0.5])
    
    # Fine-tune the model with clustering
    autoencoder.fit(X, [X, np.zeros((X.shape[0], num_clusters))], epochs=10, batch_size=16, verbose=0)  # Shorter training for illustration
    
    # Extract clustering output from the model
    q_values = autoencoder.predict(X)[1]
    cluster_labels = np.argmax(q_values, axis=1)

    # Prepare data for prediction with Random Forest
    X_clustered = np.hstack([X[:, -1, :], cluster_labels.reshape(-1, 1)])
    X_clustered = pd.DataFrame(X_clustered, columns=revenue_columns + ['Cluster'])
    X_rf_dtc = X_clustered.drop(columns=['Cluster']).values  # Features
    X_train, X_test, y_train, y_test = train_test_split(X_rf_dtc, y, test_size=0.2, random_state=0)
    
    # Train Random Forest model and calculate RMSE
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train.reshape(X_train.shape[0], -1), y_train)
    y_pred_rf = model.predict(X_test.reshape(X_test.shape[0], -1))
    rmse_dtc = np.sqrt(mean_squared_error(y_test, y_pred_rf))
    rmse_dtc_list.append(rmse_dtc)

# Plotting the results
plt.figure(figsize=(10, 5))
plt.plot(num_clusters_list, rmse_dtc_list, marker='o', linestyle='-', color='b', label='RMSE for Random Forest with DTC')
plt.xlabel('Number of Clusters (num_clusters)')
plt.ylabel('RMSE')
plt.title('RMSE for Random Forest with DTC vs. Number of Clusters')
plt.legend()
plt.grid(True)
plt.show()



KeyboardInterrupt: 

sss

In [10]:
rmse_dtc_list

[]

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.models import Model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K

# Load and preprocess data
data = pd.read_csv(r"file_transposed_updated1 (2).csv")
revenue_columns = ['Tổng', 'Thương nghiệp', 'Khách sạn nhà hàng', 'Du lịch', 'Dịch vụ']

# Scale data
scaler = MinMaxScaler()
data[revenue_columns] = scaler.fit_transform(data[revenue_columns])

# Create LSTM dataset with look-back
look_back = 12
def create_lstm_dataset(data, look_back=12):
    X = []
    y = []
    for i in range(len(data) - look_back):
        X.append(data[i:(i + look_back)][revenue_columns].values)
        y.append(data.iloc[i + look_back][revenue_columns].values)  # Use all columns as the target
    return np.array(X), np.array(y)

X, y = create_lstm_dataset(data, look_back=look_back)

# Define the Autoencoder model
timesteps = X.shape[1]
features = X.shape[2]
num_clusters = 3  # Number of clusters

input_layer = Input(shape=(timesteps, features))
encoded = LSTM(256, activation='relu', return_sequences=True)(input_layer)
encoded = LSTM(128, activation='relu', return_sequences=True)(encoded)
latent_space = LSTM(64, activation='relu', return_sequences=False)(encoded)

decoded = RepeatVector(timesteps)(latent_space)
decoded = LSTM(128, activation='relu', return_sequences=True)(decoded)
decoded = LSTM(256, activation='relu', return_sequences=True)(decoded)
output_layer = TimeDistributed(Dense(features))(decoded)

# Define the Clustering Layer for DTC
class ClusteringLayer(tf.keras.layers.Layer):
    def __init__(self, num_clusters, **kwargs):
        super(ClusteringLayer, self).__init__(**kwargs)
        self.num_clusters = num_clusters

    def build(self, input_shape):
        self.clusters = self.add_weight(shape=(self.num_clusters, input_shape[-1]),
                                        initializer='he_normal',
                                        trainable=True)

    def call(self, inputs):
        q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2)))
        q = q ** ((1.0 + 1.0) / 2.0)
        q = K.transpose(K.transpose(q) / K.sum(q, axis=1))
        return q

# Instantiate and connect clustering layer
clustering_layer = ClusteringLayer(num_clusters, name='clustering')(latent_space)

# Compile the Autoencoder model with clustering
autoencoder = Model(inputs=input_layer, outputs=[output_layer, clustering_layer])
autoencoder.compile(optimizer='adam', loss=['mse', 'kld'], loss_weights=[1.0, 0.5])

# Pre-train the Autoencoder without clustering
autoencoder_pretrain = Model(inputs=input_layer, outputs=output_layer)
autoencoder_pretrain.compile(optimizer='adam', loss='mse')
autoencoder_pretrain.fit(X, X, epochs=50, batch_size=16, verbose=1)

# Fine-tune the model with clustering
autoencoder.fit(X, [X, np.zeros((X.shape[0], num_clusters))], epochs=100, batch_size=16, verbose=1)

# Extract latent space for clustering
encoder = Model(inputs=input_layer, outputs=latent_space)
latent_space_output = encoder.predict(X)

# Get clustering output from the model
q_values = autoencoder.predict(X)[1]  # Use the second output for clustering
cluster_labels = np.argmax(q_values, axis=1)

# Prepare data for prediction
X_clustered = np.hstack([X[:, -1, :], cluster_labels.reshape(-1, 1)])  # Add cluster labels to the last timestep
X_clustered = pd.DataFrame(X_clustered, columns=revenue_columns + ['Cluster'])

# Prepare features (X) and target (y) for LSTM prediction
X_lstm = X_clustered.drop(columns=['Cluster']).values  # Features
y_lstm = y  # Target

# Ensure the number of samples is divisible by look_back
num_samples = X_lstm.shape[0]
if num_samples % look_back != 0:
    X_lstm = X_lstm[:num_samples - (num_samples % look_back)]  # Truncate to make it divisible

# Reshape X for LSTM
X_lstm = X_lstm.reshape((-1, look_back, features))  # Reshape to (num_samples // look_back, look_back, features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_lstm, y_lstm[:X_lstm.shape[0]], test_size=0.2, random_state=0)

# Define the Random Forest model for prediction
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train.reshape(X_train.shape[0], -1), y_train)  # Flatten for Random Forest

# Make predictions
y_pred_rf = model.predict(X_test.reshape(X_test.shape[0], -1))  # Flatten for predictions

# Calculate RMSE for each predicted column
for i, col in enumerate(revenue_columns):
    rmse = np.sqrt(mean_squared_error(y_test[:, i], y_pred_rf[:, i]))
    print(f"RMSE for {col}:", rmse)



Epoch 1/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - loss: 0.2372
Epoch 2/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.1085
Epoch 3/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0990
Epoch 4/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0714
Epoch 5/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0607
Epoch 6/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0374
Epoch 7/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.0268
Epoch 8/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.0213
Epoch 9/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 0.0157
Epoch 10/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 0.014

In [12]:
# Tính RMSE trên tập huấn luyện
y_train_pred = model.predict(X_train.reshape(X_train.shape[0], -1))  # Dự đoán trên tập huấn luyện
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("RMSE for Random Forest on Training Set:", rmse_train)

# Tính RMSE trên tập kiểm tra
y_test_pred = model.predict(X_test.reshape(X_test.shape[0], -1))  # Dự đoán trên tập kiểm tra
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print("RMSE for Random Forest on Testing Set:", rmse_test)


RMSE for Random Forest on Training Set: 0.004248759704764113
RMSE for Random Forest on Testing Set: 0.003977276856062791


So với việc sử dụng các phương pháp phân cụm truyền thống như Kmean, chỉ số RMSE của mô hình đã được cải thiện

Dự đoán doanh thu (Tổng) cho từng tháng năm 2025:
January: 146115.00000000012
February: 144104.52000000008
March: 143182.36000000007
April: 142320.66000000003
May: 141669.76
June: 140462.71000000002
July: 140641.88000000003
August: 140607.07000000004


  new_sequence[-1, 0] = pred  # Thêm dự đoán mới vào vị trí cuối cùng
  new_sequence[-1, 0] = pred  # Thêm dự đoán mới vào vị trí cuối cùng
  new_sequence[-1, 0] = pred  # Thêm dự đoán mới vào vị trí cuối cùng
  new_sequence[-1, 0] = pred  # Thêm dự đoán mới vào vị trí cuối cùng
  new_sequence[-1, 0] = pred  # Thêm dự đoán mới vào vị trí cuối cùng
  new_sequence[-1, 0] = pred  # Thêm dự đoán mới vào vị trí cuối cùng
  new_sequence[-1, 0] = pred  # Thêm dự đoán mới vào vị trí cuối cùng
  new_sequence[-1, 0] = pred  # Thêm dự đoán mới vào vị trí cuối cùng
