In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

warnings.filterwarnings('ignore')


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [6]:
df = pd.read_csv('/content/phone_clean.csv')

In [7]:
df

Unnamed: 0,Brand,Name,Status,Guarantee,Color,Capacity,Origin,Location,Price
0,xiaomi,redmi turbo 3,moi,>12 thang,mau khac,512.0,trung quoc,binh duong,5690000
1,oppo,f11,moi,2 thang,xanh duong,256.0,viet nam,tp ho chi minh,1250000
2,apple,iphone 6 plus,da su dung chua sua chua,het bao hanh,vang,64.0,my,tp ho chi minh,750000
3,samsung,galaxy a6,da su dung chua sua chua,bao hanh hang,vang hong,32.0,viet nam,tp ho chi minh,650000
4,xiaomi,dong khac,da su dung chua sua chua,het bao hanh,xanh duong,128.0,viet nam,tp ho chi minh,1500000
...,...,...,...,...,...,...,...,...,...
2375,samsung,galaxy s21,da su dung chua sua chua,con bao hanh,trang,256.0,han quoc,tp ho chi minh,4990000
2376,samsung,galaxy note 10 plus,da su dung chua sua chua,con bao hanh,xanh duong,256.0,han quoc,tp ho chi minh,5500000
2377,apple,iphone 12 pro max,da su dung chua sua chua,>12 thang,vang,128.0,my,dong nai,11990000
2378,samsung,galaxy a71,moi,het bao hanh,xanh la,128.0,viet nam,binh duong,1900000


In [8]:
importance_column = df.columns.drop('Capacity')

In [9]:
df = df[importance_column]
df

Unnamed: 0,Brand,Name,Status,Guarantee,Color,Origin,Location,Price
0,xiaomi,redmi turbo 3,moi,>12 thang,mau khac,trung quoc,binh duong,5690000
1,oppo,f11,moi,2 thang,xanh duong,viet nam,tp ho chi minh,1250000
2,apple,iphone 6 plus,da su dung chua sua chua,het bao hanh,vang,my,tp ho chi minh,750000
3,samsung,galaxy a6,da su dung chua sua chua,bao hanh hang,vang hong,viet nam,tp ho chi minh,650000
4,xiaomi,dong khac,da su dung chua sua chua,het bao hanh,xanh duong,viet nam,tp ho chi minh,1500000
...,...,...,...,...,...,...,...,...
2375,samsung,galaxy s21,da su dung chua sua chua,con bao hanh,trang,han quoc,tp ho chi minh,4990000
2376,samsung,galaxy note 10 plus,da su dung chua sua chua,con bao hanh,xanh duong,han quoc,tp ho chi minh,5500000
2377,apple,iphone 12 pro max,da su dung chua sua chua,>12 thang,vang,my,dong nai,11990000
2378,samsung,galaxy a71,moi,het bao hanh,xanh la,viet nam,binh duong,1900000


In [10]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

print(f"Kích thước X: {X.shape}")
print(f"Kích thước y: {y.shape}")

Kích thước X: (2380, 7)
Kích thước y: (2380,)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Kích thước X_train: {X_train.shape}")
print(f"Kích thước y_train: {y_train.shape}\n")

print(f"Kích thước X_test: {X_test.shape}")
print(f"Kích thước y_test: {y_test.shape}")

Kích thước X_train: (1904, 7)
Kích thước y_train: (1904,)

Kích thước X_test: (476, 7)
Kích thước y_test: (476,)


In [None]:
def create_regression_pipeline_with_auto_preprocessing(model_name, X):

    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    regression_models = {
        'Catboost': CatBoostRegressor(silent=True),
        'XGBoost': XGBRegressor(random_state=42,verbose = -1),
        'LightGBM': LGBMRegressor(random_state=42,verbose = -1),
        'DecisionTree': DecisionTreeRegressor(random_state=42),
        'RandomForest': RandomForestRegressor(random_state=42),
        'LinearRegression': LinearRegression(),
    }

    if model_name not in regression_models:
        raise ValueError(
            f"Invalid model name. Supported models: {', '.join(regression_models.keys())}")

    regression_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', regression_models[model_name])
    ])

    pipeline_dict = {model_name: regression_pipeline}

    return pipeline_dict


def create_all_regression_pipelines_with_auto_preprocessing(X):
    regression_model_names = ['RandomForest', 'Catboost', 'LinearRegression',
                              'LightGBM', 'DecisionTree', 'XGBoost']

    all_pipelines = {}

    for model_name in regression_model_names:
        pipeline_dict = create_regression_pipeline_with_auto_preprocessing(
            model_name, X)

        all_pipelines.update(pipeline_dict)

    return all_pipelines


def train_evaluate_model_with_df(model_name, model, X_train, y_train, X_test, y_test, results_df):
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    model_results = pd.DataFrame(
        {'Mô hình': [model_name], 'MSE': mse, 'MAE': mae, 'R2': r2})
    results_df = pd.concat([results_df, model_results], ignore_index=True)

    return results_df


def run_pipelines_with_metrics_to_dataframe(all_pipelines, X_train, y_train, X_test, y_test):
    results_df = pd.DataFrame(columns=['Mô hình', 'MSE', 'MAE', 'R2'])

    for model_name, pipeline in all_pipelines.items():
        results_df = train_evaluate_model_with_df(
            model_name, pipeline, X_train, y_train, X_test, y_test, results_df)

    results_df = results_df.sort_values(by='R2', ascending=False).reset_index(drop=True)

    return results_df


In [None]:
all_pipelines = create_all_regression_pipelines_with_auto_preprocessing(X_train)

results_df = run_pipelines_with_metrics_to_dataframe(all_pipelines, X_train, y_train, X_test, y_test)

results_df

Unnamed: 0,Mô hình,MSE,MAE,R2
0,Catboost,5836750000000.0,1610563.0,0.878869
1,LightGBM,5895884000000.0,1631720.0,0.877642
2,RandomForest,6400315000000.0,1583279.0,0.867174
3,XGBoost,6701503000000.0,1690962.0,0.860923
4,DecisionTree,12788780000000.0,1916380.0,0.734593
5,LinearRegression,39199620000000.0,4622620.0,0.186486


In [12]:
encoder = OrdinalEncoder()
columns = df.columns.drop('Price')
df_encoded = encoder.fit_transform(df.drop('Price', axis=1))

df_encoded = pd.DataFrame(df_encoded, columns=columns)
df_encoded['Price'] = df['Price']
df_encoded

Unnamed: 0,Brand,Name,Status,Guarantee,Color,Origin,Location,Price
0,26.0,317.0,2.0,5.0,6.0,8.0,6.0,5690000
1,16.0,62.0,2.0,1.0,12.0,9.0,36.0,1250000
2,0.0,218.0,0.0,8.0,9.0,4.0,36.0,750000
3,19.0,110.0,0.0,6.0,10.0,9.0,36.0,650000
4,26.0,60.0,0.0,8.0,12.0,9.0,36.0,1500000
...,...,...,...,...,...,...,...,...
2375,19.0,155.0,0.0,7.0,8.0,3.0,36.0,4990000
2376,19.0,139.0,0.0,7.0,12.0,3.0,36.0,5500000
2377,0.0,196.0,0.0,5.0,9.0,4.0,14.0,11990000
2378,19.0,114.0,2.0,8.0,13.0,9.0,6.0,1900000


In [13]:
X = df_encoded.iloc[:, :-1]
y = df_encoded.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import KFold

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)
cat_model = CatBoostRegressor(silent =True)

mses = []
maes = []
r2s = []

for fold, (idx_tr, idx_va) in enumerate(kf.split(X, y)):
    X_tr = X.iloc[idx_tr]
    X_va = X.iloc[idx_va]
    y_tr = y[idx_tr]
    y_va = y[idx_va]

    cat_model.fit(X_tr, y_tr)
    y_pred = cat_model.predict(X_va)

    mse = mean_squared_error(y_va, y_pred)
    mae = mean_absolute_error(y_va, y_pred)
    r2 = r2_score(y_va, y_pred)

    print(f"# Fold {fold + 1}:")
    print(f"Mean Squared Error (MSE): {mse:.3f}")
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
    print(f"R-squared (R2) Score: {r2:.3f} ")

    mses.append(mse)
    maes.append(mae)
    r2s.append(r2)

mean_mse = np.mean(mses)
mean_mae = np.mean(maes)
mean_r2 = np.mean(r2s)

print(f"# Overall:")

print(f"# Mean MSE: {mean_mse:.3f}")
print(f"# Mean MAE: {mean_mae:.3f}")
print(f"# Mean R2: {mean_r2:.3f}")


# Fold 1:
Mean Squared Error (MSE): 5553481836839.782
Mean Absolute Error (MAE): 1563275.917
R-squared (R2) Score: 0.865 
# Fold 2:
Mean Squared Error (MSE): 6698083133108.219
Mean Absolute Error (MAE): 1592726.665
R-squared (R2) Score: 0.850 
# Fold 3:
Mean Squared Error (MSE): 7715994794599.301
Mean Absolute Error (MAE): 1636844.653
R-squared (R2) Score: 0.833 
# Fold 4:
Mean Squared Error (MSE): 6370673011891.082
Mean Absolute Error (MAE): 1531434.820
R-squared (R2) Score: 0.853 
# Fold 5:
Mean Squared Error (MSE): 7275346297228.509
Mean Absolute Error (MAE): 1716253.496
R-squared (R2) Score: 0.831 
# Overall:
# Mean MSE: 6722715814733.378
# Mean MAE: 1608107.110
# Mean R2: 0.846


In [None]:
params = {
    # "iterations": Số vòng lặp boosting (tương đương với số lượng cây trong mô hình)
    "iterations": range(100, 500, 50),  # Tăng dần từ 100 đến 500

    # "learning_rate": Tốc độ học (step size) điều chỉnh trọng số các cây
    # Giá trị nhỏ giúp cải thiện độ chính xác nhưng cần số lượng vòng lặp lớn hơn.
    "learning_rate": [0.0001, 0.001, 0.01, 0.1, 1],

    # "depth": Độ sâu tối đa của cây quyết định.
    # Cây sâu có khả năng học tốt hơn nhưng dễ overfitting nếu không điều chỉnh tốt.
    "depth": range(3, 21, 3),

    # "l2_leaf_reg": Chính quy L2 điều chỉnh các trọng số trong cây.
    # Phạt các giá trị lớn để giảm overfitting.
    "l2_leaf_reg": [1e-5, 1e-2, 0.1, 1, 10, 100],

    # "bagging_temperature": Điều chỉnh mức độ ngẫu nhiên trong việc lấy mẫu bootstrap.
    # Giá trị cao hơn giúp giảm overfitting bằng cách tạo ra sự ngẫu nhiên trong dữ liệu.
    "bagging_temperature": [0.0, 0.5, 1.0, 2.0, 5.0],

    # "random_strength": Mức độ ngẫu nhiên để giảm sự phức tạp của mô hình.
    # Giảm quá mức sự phân tách của cây dẫn đến overfitting.
    "random_strength": [0.1, 1, 10, 50],

    # "colsample_bylevel": Phần trăm cột được chọn cho mỗi mức cây.
    # Giảm overfitting bằng cách không sử dụng toàn bộ đặc trưng ở mỗi level.
    "colsample_bylevel": [0.5, 0.7, 0.9, 1.0],

    # "leaf_estimation_iterations": Số vòng lặp cho việc tối ưu hóa các lá trong cây.
    # Tăng giá trị này có thể cải thiện độ chính xác, nhưng cũng làm tăng chi phí tính toán.
    "leaf_estimation_iterations": [1, 5, 10],

    # "leaf_estimation_method": Phương pháp tối ưu hóa cây lá. "Newton" hoặc "Gradient".
    # Phương pháp "Newton" có thể mang lại kết quả tốt hơn trong nhiều trường hợp.
    "leaf_estimation_method": ['Newton', 'Gradient'],

    # "min_data_in_leaf": Số lượng mẫu tối thiểu trong mỗi lá cây.
    # Điều chỉnh tham số này giúp ngăn cây quá sâu và tránh overfitting.
    "min_data_in_leaf": [1, 5, 10, 50],

}

rs = RandomizedSearchCV(CatBoostRegressor(),
                  params,
                  cv=10,
                  verbose=3,
                  n_jobs=-1,
                  n_iter=25)

rs.fit(X_train, y_train)

print(f"Best parameters: {rs.best_params_}")
print(f"Best score: {rs.best_score_}")

Fitting 10 folds for each of 25 candidates, totalling 250 fits
0:	learn: 6335939.8584264	total: 1.72ms	remaining: 428ms
1:	learn: 6227448.0319654	total: 2.86ms	remaining: 355ms
2:	learn: 6071074.4631907	total: 3.81ms	remaining: 314ms
3:	learn: 5766550.2255013	total: 4.73ms	remaining: 291ms
4:	learn: 5498125.1327410	total: 6.82ms	remaining: 334ms
5:	learn: 5302552.5230841	total: 8.37ms	remaining: 340ms
6:	learn: 5105727.8575685	total: 10.3ms	remaining: 356ms
7:	learn: 5041832.4505193	total: 12.2ms	remaining: 370ms
8:	learn: 4921593.3976536	total: 13.9ms	remaining: 372ms
9:	learn: 4826575.1657165	total: 15.8ms	remaining: 378ms
10:	learn: 4701365.2118389	total: 17.4ms	remaining: 378ms
11:	learn: 4577705.1109566	total: 18.9ms	remaining: 375ms
12:	learn: 4526973.8242141	total: 20.5ms	remaining: 374ms
13:	learn: 4425821.4742422	total: 22.3ms	remaining: 376ms
14:	learn: 4396839.2051095	total: 23.9ms	remaining: 374ms
15:	learn: 4358008.9069463	total: 25.6ms	remaining: 374ms
16:	learn: 4270237.

In [None]:
print(f'Điểm số tốt nhất: {rs.best_score_:.3f}')

best_params_df = pd.DataFrame([rs.best_params_]).transpose()

best_params_df.columns = ['Value']

print('Các tham số tốt nhất:')
best_params_df

Điểm số tốt nhất: 0.822
Các tham số tốt nhất:


Unnamed: 0,Value
random_strength,50
min_data_in_leaf,50
learning_rate,1
leaf_estimation_method,Newton
leaf_estimation_iterations,1
l2_leaf_reg,100
iterations,400
depth,3
colsample_bylevel,1.0
bagging_temperature,1.0


In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=1)
best_params = rs.best_params_
cat_best = CatBoostRegressor(**best_params, silent =True)

mses = []
maes = []
r2s = []

for fold, (idx_tr, idx_va) in enumerate(kf.split(X, y)):
    X_tr = X.iloc[idx_tr]
    X_va = X.iloc[idx_va]
    y_tr = y[idx_tr]
    y_va = y[idx_va]

    cat_best.fit(X_tr, y_tr)
    y_pred = cat_best.predict(X_va)

    mse = mean_squared_error(y_va, y_pred)
    mae = mean_absolute_error(y_va, y_pred)
    r2 = r2_score(y_va, y_pred)

    print(f"# Fold {fold + 1}:")
    print(f"Mean Squared Error (MSE): {mse:.3f}")
    print(f"Mean Absolute Error (MAE): {mae:.3f}")
    print(f"R-squared (R2) Score: {r2:.3f} ")

    mses.append(mse)
    maes.append(mae)
    r2s.append(r2)

mean_mse = np.mean(mses)
mean_mae = np.mean(maes)
mean_r2 = np.mean(r2s)

print(f"# Overall:")

print(f"# Mean MSE: {mean_mse:.3f}")
print(f"# Mean MAE: {mean_mae:.3f}")
print(f"# Mean R2: {mean_r2:.3f}")


# Fold 1:
Mean Squared Error (MSE): 6388936683066.950
Mean Absolute Error (MAE): 1671477.466
R-squared (R2) Score: 0.844 
# Fold 2:
Mean Squared Error (MSE): 6738811908062.370
Mean Absolute Error (MAE): 1673375.278
R-squared (R2) Score: 0.849 
# Fold 3:
Mean Squared Error (MSE): 6810880318454.369
Mean Absolute Error (MAE): 1665632.110
R-squared (R2) Score: 0.853 
# Fold 4:
Mean Squared Error (MSE): 6429455198583.385
Mean Absolute Error (MAE): 1617179.556
R-squared (R2) Score: 0.852 
# Fold 5:
Mean Squared Error (MSE): 9161183899734.025
Mean Absolute Error (MAE): 1903395.722
R-squared (R2) Score: 0.788 
# Overall:
# Mean MSE: 7105853601580.219
# Mean MAE: 1706212.027
# Mean R2: 0.837


In [14]:
y_pred_best = cat_model.predict(X_test)
mse_best = mean_squared_error(y_test, y_pred_best)
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Mean Squared Error (MSE): {mse_best:.3f}")
print(f"Mean Absolute Error (MAE): {mae_best:.3f}")
print(f"R-squared (R2) Score: {r2_best:.3f} ")

Mean Squared Error (MSE): 3022349234576.937
Mean Absolute Error (MAE): 1161522.247
R-squared (R2) Score: 0.937 


In [None]:
import joblib

best_model_filename = 'cat_best.pkl'
joblib.dump(cat_model, best_model_filename)

['cat_best.pkl']