In [1]:
import pandas as pd
import numpy as np
import warnings

# Tắt tất cả các cảnh báo
warnings.filterwarnings("ignore")

In [2]:
# Load the product data from its CSV file and preview the first rows.
RAW_CSV_PATH = 'smarthub/spiders/smarthub_products.csv'
df = pd.read_csv(RAW_CSV_PATH)
df.head()

Unnamed: 0,product_name,brand,price,ram,storage,screen_size,resolution,chipset,os,rear_camera,front_camera,battery,dimensions,weight,rating_value,rating_count,image_url
0,Xiaomi Redmi Note 15 Pro 5G,Xiaomi,9349000,12,256.0,6.83,1280x2772,Dimensity 7400-Ultra 8 nhân,Xiaomi HyperOS 2.2,Chính 200MP & Phụ 8 MP,20 MP,6580,163.61x78.09x7.96,204,5,1,https://www.duchuymobile.com/images/detailed/8...
1,Realme P4 Power 5G,Realme,7919000,8,128.0,6.8,1280x2800,Dimensity 7400 Ultra (4 nm),"Android 16, Realme UI 7.0",50 + 8 MP,"16 MP, f/2.4",10001,162.3x76.2x9.1,219,5,1,https://www.duchuymobile.com/images/detailed/8...
2,OPPO A6T 5G,OPPO,4999000,4,128.0,6.75,720x1570,Dimensity 6300 (6 nm),Android 15,"50 MP, Auxiliary lens",5MP,6500,166.6x78.5x8.6,212,5,1,https://www.duchuymobile.com/images/detailed/8...
3,Nubia V80 Max,Nubia,2669000,8,128.0,6.9,729x1640,Unisoc T7250,Android 16,50 MP + 2 MP,16MP,6000,171.8x78.4x8.5,191,5,1,https://www.duchuymobile.com/images/detailed/8...
4,Nubia V80 Max,Nubia,2939000,8,256.0,6.9,729x1640,Unisoc T7250,Android 16,50 MP + 2 MP,16MP,6000,171.8x78.4x8.5,191,5,1,https://www.duchuymobile.com/images/detailed/8...


In [3]:
# Count the missing (NaN) entries in each column of the loaded data.
missing_per_column = df.isna().sum()
missing_per_column

product_name     2
brand            2
price            0
ram              6
storage          6
screen_size      4
resolution      17
chipset          5
os               5
rear_camera      4
front_camera     4
battery          5
dimensions      46
weight          63
rating_value     0
rating_count     0
image_url        0
dtype: int64

In [4]:
# Drop every row that has at least one missing value.
# .copy() yields an independent frame, so the column assignments further down
# modify `df` itself instead of a slice of the frame that was read from disk.
df = df.dropna().copy()

# Keep only the first row for each product_name.
# NOTE(review): rows that share a name but differ in storage/price
# (e.g. the two "Nubia V80 Max" rows in the preview) are collapsed into one
# by this step - confirm that dropping such variants is intended.
df = df.drop_duplicates(subset=["product_name"])

# Remove products that are not phones (e.g. phone cases, "Ốp Lưng").
df = df[~df["product_name"].str.contains("Ốp Lưng", case=False, na=False)].copy()

# battery: convert to a numeric column.
# For text values the "." characters are removed before parsing (presumably
# thousands separators such as "5.000" - TODO confirm against the raw data).
# A column that pandas already parsed as numeric is left alone: casting a
# float such as 6580.0 to "6580.0" and stripping the dot would give 65800.
battery = df["battery"]
if not pd.api.types.is_numeric_dtype(battery):
    battery = battery.astype(str).str.replace(".", "", regex=False)
df["battery"] = pd.to_numeric(battery, errors="coerce")

# Fill values that could not be parsed with the column median.
# Plain assignment is used instead of `df["battery"].fillna(..., inplace=True)`:
# the chained in-place form acts on a temporary object and does not update
# `df` when pandas Copy-on-Write is active.
df["battery"] = df["battery"].fillna(df["battery"].median())

# weight / ram: keep only the rows where both values parse as numbers.
# The columns themselves are not converted; only the rows are filtered.
weight_is_numeric = pd.to_numeric(df["weight"], errors="coerce").notna()
ram_is_numeric = pd.to_numeric(df["ram"], errors="coerce").notna()
df = df[weight_is_numeric & ram_is_numeric].copy()

In [5]:
# Re-count the missing (NaN) entries per column after the cleaning step.
missing_after_cleaning = df.isna().sum()
missing_after_cleaning

product_name    0
brand           0
price           0
ram             0
storage         0
screen_size     0
resolution      0
chipset         0
os              0
rear_camera     0
front_camera    0
battery         0
dimensions      0
weight          0
rating_value    0
rating_count    0
image_url       0
dtype: int64

In [6]:
# Fix the seed so that re-running the cell reproduces the same values.
np.random.seed(42)

# Work on an independent frame so the new columns are added to `df` itself
# even if `df` was produced by filtering another frame.
df = df.copy()

# 1. Draw a random discount rate for every row (8% - 30%).
discount_rate = np.random.uniform(0.08, 0.30, len(df))

# 2. Derive original_price from price, rounded to the nearest 10,000.
estimated_price = (df["price"] / (1 - discount_rate)).round(-4)

# Guarantee original_price > price.
# Rounding to the nearest 10,000 can land on (or below) price when price is
# small. In that case fall back to price * 1.1 rounded UP to the next 10,000;
# rounding to nearest, as before, could round straight back down to price.
# Integer arithmetic (price * 11 / 100,000, ceiling) avoids float error.
# Every positive price therefore gets a strictly larger original_price.
fallback_price = ((df["price"] * 11 + 99_999) // 100_000) * 10_000
df["original_price"] = np.where(
    estimated_price <= df["price"],
    fallback_price,
    estimated_price,
)

# 3. Draw a random stock level for every row (0 - 120 inclusive).
df["stock"] = np.random.randint(0, 121, len(df))

df.head()

Unnamed: 0,product_name,brand,price,ram,storage,screen_size,resolution,chipset,os,rear_camera,front_camera,battery,dimensions,weight,rating_value,rating_count,image_url,original_price,stock
0,Xiaomi Redmi Note 15 Pro 5G,Xiaomi,9349000,12,256.0,6.83,1280x2772,Dimensity 7400-Ultra 8 nhân,Xiaomi HyperOS 2.2,Chính 200MP & Phụ 8 MP,20 MP,6580.0,163.61x78.09x7.96,204,5,1,https://www.duchuymobile.com/images/detailed/8...,11160000.0,76
1,Realme P4 Power 5G,Realme,7919000,8,128.0,6.8,1280x2800,Dimensity 7400 Ultra (4 nm),"Android 16, Realme UI 7.0",50 + 8 MP,"16 MP, f/2.4",10001.0,162.3x76.2x9.1,219,5,1,https://www.duchuymobile.com/images/detailed/8...,11140000.0,50
2,OPPO A6T 5G,OPPO,4999000,4,128.0,6.75,720x1570,Dimensity 6300 (6 nm),Android 15,"50 MP, Auxiliary lens",5MP,6500.0,166.6x78.5x8.6,212,5,1,https://www.duchuymobile.com/images/detailed/8...,6590000.0,62
3,Nubia V80 Max,Nubia,2669000,8,128.0,6.9,729x1640,Unisoc T7250,Android 16,50 MP + 2 MP,16MP,6000.0,171.8x78.4x8.5,191,5,1,https://www.duchuymobile.com/images/detailed/8...,3390000.0,95
5,Nubia V80 Design,Nubia,2939000,8,256.0,6.75,900x1940,Unisoc T7280 (12 nm),Android 16,50 MP + 2 MP,16MP,5000.0,166x79x7.7,191,5,1,https://www.duchuymobile.com/images/detailed/8...,3320000.0,102


In [7]:
# Fix the column order: original_price, stock and image_url go last,
# every other column keeps its current relative position.
tail_cols = ["original_price", "stock", "image_url"]
cols = [name for name in df.columns if name not in tail_cols]
df = df[cols + tail_cols]

In [8]:
# Write the resulting table to a new CSV file, without the index column.
FINAL_CSV_PATH = "smarthub_products_final.csv"
df.to_csv(FINAL_CSV_PATH, encoding="utf-8-sig", index=False)

In [9]:
# Number of products per brand, most frequent brand first.
brand_counts = df["brand"].value_counts()
brand_counts

brand
Xiaomi      23
HONOR       12
Motorola    11
Realme      11
OPPO        10
Vivo         7
Apple        7
Nubia        5
Samsung      4
Tecno        3
Huawei       1
Google       1
Nothing      1
Infinix      1
OnePlus      1
RedMagic     1
Name: count, dtype: int64