In [21]:
from sklearn.ensemble import RandomForestRegressor
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from math import radians
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from collections import Counter
import os
import random
import pandas as pd
import numpy as np
import tensorflow as tf
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [22]:
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)
os.environ['PYTHONHASHSEED'] = str(42)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [23]:
# Load training and test datasets
X_train = pd.read_csv('data/X_train.csv', low_memory=False).drop(columns=['ID'], errors='ignore')
Y_train = pd.read_csv('data/y_train.csv', low_memory=False)['TARGET']
X_test = pd.read_csv('data/X_test.csv', low_memory=False)
test_ids = X_test.pop('ID') if 'ID' in X_test.columns else np.arange(len(X_test))
test_ids

array([    0,     1,     2, ..., 63768, 63769, 63770])

## 'buildingType', 'communityAverage', 'elevator', 'fiveYearsProperty', 'subway', 'livingRoom', 'bathRoom'

In [24]:
valid_values_buildingType = [1, 2, 3, 4]
valid_values_bathRoom = [0, 1, 2, 3, 4, 5, 6, 7]
cols_to_process = ['buildingType', 'elevator', 'fiveYearsProperty', 'subway', 'livingRoom', 'bathRoom']
for df in [X_train, X_test]:
    for col in cols_to_process:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        mode = df[col].mode()[0]
        df[col] = df[col].fillna(mode)
        df[col] = df[col].astype(int)
    if 'buildingType' in df.columns:
        mode_value = df['buildingType'].mode()[0]
        df.loc[~df['buildingType'].isin(valid_values_buildingType), 'buildingType'] = mode_value
    if 'bathRoom' in df.columns:
        mode_value = df['bathRoom'].mode()[0]
        df.loc[~df['bathRoom'].isin(valid_values_bathRoom), 'bathRoom'] = mode_value
    if 'ladderRatio' in df.columns:
        median_ladder_ratio = df['ladderRatio'].median()
        df.loc[df['ladderRatio'] > 10, 'ladderRatio'] = median_ladder_ratio
    if 'communityAverage' in df.columns:
        df['communityAverage'] = pd.to_numeric(df['communityAverage'], errors='coerce')
        median_community_average = df['communityAverage'].median()
        df['communityAverage'] = df['communityAverage'].fillna(median_community_average)

In [25]:
capital_Lng = np.radians(116.4074)
capital_Lat = np.radians(39.9042)

for df in [X_train, X_test]:
    df['Lat'] = pd.to_numeric(df['Lat'], errors='coerce')
    df['Lng'] = pd.to_numeric(df['Lng'], errors='coerce')
    df['distanceToCapital'] = np.arccos(
        np.sin(np.radians(df['Lat'])) * np.sin(capital_Lat) +
        np.cos(np.radians(df['Lat'])) * np.cos(capital_Lat) *
        np.cos(capital_Lng - np.radians(df['Lng']))
    ) * 6371.0088

## convert tradeTime, constructionTime

In [26]:
for df in [X_train, X_test]:
    if 'tradeTime' in df.columns:
        df['tradeTime'] = pd.to_datetime(df['tradeTime'], errors='coerce')
    if 'constructionTime' in df.columns:
        df['constructionTime'] = pd.to_numeric(df['constructionTime'], errors='coerce')
        df['constructionTime'] = df['constructionTime'].apply(lambda x: np.nan if x < 1900 else x)
        mode_value = df['constructionTime'].mode()[0]
        df['constructionTime'] = df['constructionTime'].fillna(mode_value)
    if 'tradeTime' in df.columns and 'constructionTime' in df.columns:
        df['ageOfBuilding'] = df['tradeTime'].dt.year - df['constructionTime']
    df['year'] = df['tradeTime'].dt.year
    df['month'] = df['tradeTime'].dt.month
    df['day'] = df['tradeTime'].dt.day

In [27]:
for df in [X_train, X_test]:
    if 'ageOfBuilding' in df.columns:
        mode_age = df['ageOfBuilding'].mode()[0]
        df.loc[df['ageOfBuilding'] < 0, 'ageOfBuilding'] = mode_age

## handle column floor

In [28]:

# Floor Mapping
floor_mapping = {
    "顶": 5,  # "đỉnh"
    "高": 4,  # "cao"
    "中": 3,  # "trung"
    "低": 2,  # "thấp"
    "底": 1,  # "đáy"
    "未知": 0,  # "unknown"
    "钢混结构": 6,  # Reinforced Concrete Structure
    "混合结构": 7,  # Mixed Structure
}

for df in [X_train, X_test]:
    df['floor_numeric'] = df['floor'].str.split(" ").str[0].map(floor_mapping)
    df['actual_floor'] = df['floor'].str.extract(r'(\d+)$').fillna(0).astype(int)

## handle column drawingRoom

In [29]:
drawing_room_mapping = {
    "顶": 5,  # "đỉnh"
    "高": 4,  # "cao"
    "中": 3,  # "trung"
    "低": 2,  # "thấp"
    "底": 1,  # "đáy"
}
for df in [X_train, X_test]:
    df['drawingRoom_prefix'] = df['drawingRoom'].str.split(" ").str[0]  # Tách chữ
    df['drawingRoom_numeric'] = df['drawingRoom'].str.extract(r'(\d+)$')  # Tách số
    # Ánh xạ
    df['drawingRoom_prefix'] = df['drawingRoom_prefix'].map(drawing_room_mapping)
    # không có chữ => 0
    df['drawingRoom_prefix'] = df['drawingRoom_prefix'].fillna(0).astype(int)
    # Chuyển phần số thành kiểu int (nếu cần)
    df['drawingRoom_numeric'] = pd.to_numeric(df['drawingRoom_numeric'], errors='coerce')

In [30]:
X_train = X_train.drop(columns='tradeTime')
X_train = X_train.drop(columns='drawingRoom')
X_train = X_train.drop(columns='floor')

X_test = X_test.drop(columns='tradeTime')
X_test = X_test.drop(columns='drawingRoom')
X_test = X_test.drop(columns='floor')

In [31]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255080 entries, 0 to 255079
Data columns (total 26 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Lng                  255080 non-null  float64
 1   Lat                  255080 non-null  float64
 2   followers            255080 non-null  int64  
 3   square               255080 non-null  float64
 4   livingRoom           255080 non-null  int32  
 5   kitchen              255080 non-null  int64  
 6   bathRoom             255080 non-null  int32  
 7   buildingType         255080 non-null  int32  
 8   constructionTime     255080 non-null  float64
 9   renovationCondition  255080 non-null  int64  
 10  buildingStructure    255080 non-null  int64  
 11  ladderRatio          255080 non-null  float64
 12  elevator             255080 non-null  int32  
 13  fiveYearsProperty    255080 non-null  int32  
 14  subway               255080 non-null  int32  
 15  district         

In [32]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63771 entries, 0 to 63770
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Lng                  63771 non-null  float64
 1   Lat                  63771 non-null  float64
 2   followers            63771 non-null  int64  
 3   square               63771 non-null  float64
 4   livingRoom           63771 non-null  int32  
 5   kitchen              63771 non-null  int64  
 6   bathRoom             63771 non-null  int32  
 7   buildingType         63771 non-null  int32  
 8   constructionTime     63771 non-null  float64
 9   renovationCondition  63771 non-null  int64  
 10  buildingStructure    63771 non-null  int64  
 11  ladderRatio          63771 non-null  float64
 12  elevator             63771 non-null  int32  
 13  fiveYearsProperty    63771 non-null  int32  
 14  subway               63771 non-null  int32  
 15  district             63771 non-null 

In [33]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)

(255080, 26)
(63771, 26)
(255080,)


In [34]:
def print_rmse(y_true, y_pred, model_name="Model"):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{model_name} rmse is: {rmse}")
    return rmse

#### Build model MLP

In [35]:
def build_mlp_model(input_shape):
    model = Sequential([
        Dense(512, activation='relu', input_shape=(input_shape,)),
        BatchNormalization(),
        Dropout(0.4),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(1)
    ])
    return model

#### compile, train MLP

In [36]:
def compile_and_train_mlp(model, X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
    model.compile(optimizer='adam', loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4, min_lr=1e-6)
    
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping, reduce_lr],
        verbose=1
    )
    return model

#### Train model RandomForest

In [37]:
def train_random_forest(X_train, y_train, X_val, y_val):
    rf_model = RandomForestRegressor(
        n_estimators=900, 
        max_depth=20, 
        min_samples_split=10, 
        min_samples_leaf=2, 
        random_state=42, 
        n_jobs=-1
    )
    # Fit the model
    rf_model.fit(X_train, y_train)
    # Predict on validation set
    y_pred_val = rf_model.predict(X_val)
    # Print and return RMSE
    val_rmse = print_rmse(y_val, y_pred_val, model_name="RandomForest")    
    return rf_model, val_rmse

#### Lưu CSV

In [38]:
def save_submission(y_pred, test_ids, filename):
    submission = pd.DataFrame({
        "ID": test_ids,
        "TARGET": y_pred
    })
    submission.to_csv(filename, index=False)

#### Chia dữ liệu thành tập test và tập validation

In [39]:
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

#### Train model RF

In [40]:
rf_model, rf_rmse = train_random_forest(X_train_split, y_train_split, X_val_split, y_val_split)

RandomForest rmse is: 52.87643785394698


#### Dự đoán từ mô hình Random Forest để sử dụng như đặc trưng mới cho MLP

In [41]:
rf_train_predictions = rf_model.predict(X_train_split).reshape(-1, 1)
rf_val_predictions = rf_model.predict(X_val_split).reshape(-1, 1)
rf_test_predictions = rf_model.predict(X_test).reshape(-1, 1)

#### Ghép các dự đoán vào tập huấn luyện và validation

In [42]:
X_train_split_with_rf = np.hstack((X_train_split, rf_train_predictions))
X_val_split_with_rf = np.hstack((X_val_split, rf_val_predictions))
X_test_with_rf = np.hstack((X_test, rf_test_predictions))

#### scale data

In [43]:
scaler = StandardScaler()
X_train_split_with_rf = scaler.fit_transform(X_train_split_with_rf)
X_test_with_rf = scaler.transform(X_test_with_rf)
X_val_split_with_rf = scaler.transform(X_val_split_with_rf)

#### Train model MLP sử dụng dự đoán từ Random Forest

In [44]:
mlp_model = build_mlp_model(X_train_split_with_rf.shape[1])
mlp_model = compile_and_train_mlp(mlp_model, X_train_split_with_rf, y_train_split, X_val_split_with_rf, y_val_split)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m6377/6377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 3ms/step - loss: 11901.6172 - val_loss: 3562.1689 - learning_rate: 0.0010
Epoch 2/50
[1m6377/6377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 4984.6143 - val_loss: 3264.8691 - learning_rate: 0.0010
Epoch 3/50
[1m6377/6377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 4678.4834 - val_loss: 3365.9207 - learning_rate: 0.0010
Epoch 4/50
[1m6377/6377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 4572.3438 - val_loss: 3168.9092 - learning_rate: 0.0010
Epoch 5/50
[1m6377/6377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 4285.0425 - val_loss: 3782.2393 - learning_rate: 0.0010
Epoch 6/50
[1m6377/6377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 4195.8618 - val_loss: 3315.1504 - learning_rate: 0.0010
Epoch 7/50
[1m6377/6377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3

#### Lưu model, dự đoán X_test

In [45]:
mlp_model.save('weight/model_mlp_model_7.h5')
print("Model saved successfully!")



Model saved successfully!


#### Dự đoán trên tập validation bằng model MLP

In [46]:
# dự đoán dựa trên tập validation
y_pred_val_mlp = mlp_model.predict(X_val_split_with_rf).flatten()
mlp_rmse = print_rmse(y_val_split, y_pred_val_mlp, model_name="MLP with RF Feature")
# dự đoán data test
y_pred_test_mlp = mlp_model.predict(X_test_with_rf).flatten()

[1m1595/1595[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
MLP with RF Feature rmse is: 55.10913556630966
[1m1993/1993[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step


#### save CSV

In [47]:
save_submission(y_pred_test_mlp, test_ids, filename='Latest_submission.csv')

#### LP with RF Feature rmse is: 55.10913556630966 modle_mlp_h2
#### RandomForest rmse is: 52.87643785394698