In [27]:
import pandas as pd
import numpy as np

In [28]:
# File names used by this notebook: the CSV that is read in, and the CSV the
# processed frame is written to at the end.
name_input, name_output = "giaydep.csv", "giaydep_pre.csv"

In [29]:
# Load the CSV named by `name_input` and preview its first three rows.
data = pd.read_csv(filepath_or_buffer=name_input)
data.head(n=3)

FileNotFoundError: [Errno 2] No such file or directory: 'quanao.csv'

In [None]:
# Count the missing values in every column, largest count first
(
    data
    .isna()
    .sum()
    .sort_values(ascending=False)
)

Ship_price        423
Price_original    184
Four_star         124
Three_star        124
Two_star          124
One_star          124
Five_star         124
Sale_off          107
Preview            97
Return             85
Sale_rating        85
Ship_on_time       83
Chat_response      83
Total_sold         70
Location            0
Title               0
Price_sale          0
Link                0
Type                0
dtype: int64

- Remove the columns **Title** & **Link** because they are not necessary.
- Remove the column **Return** because:
    * it has too many NULL values, nearly 50% of the total data.
    * it is not relevant to the hypothesis being used.


In [None]:
# Drop Title, Link and Return; `drop` returns a new frame, so `data` keeps all columns
columns_to_drop = ['Title', 'Link', 'Return']
new_data = data.drop(columns=columns_to_drop)
new_data.head(3)

Unnamed: 0,Type,Price_sale,Sale_off,Total_sold,Preview,Location,Price_original,Ship_price,Sale_rating,Ship_on_time,Chat_response,One_star,Two_star,Three_star,Four_star,Five_star
0,giày dép,174 ₫,,494 Đã bán,,Nước ngoài,,,Không đủ thông tin,Không đủ thông tin,Không đủ thông tin,0.0,0.0,0.0,0.0,0.0
1,giày dép,371 ₫,40% Off,319 Đã bán,,Nước ngoài,618 ₫,17.000 ₫,Không đủ thông tin,Không đủ thông tin,Không đủ thông tin,0.0,0.0,0.0,0.0,0.0
2,giày dép,32.000 ₫,54% Off,83 Đã bán,(20),Hồ Chí Minh,69.000 ₫,16.500 ₫,93%,92%,100%,0.0,0.0,1.0,0.0,19.0


In [None]:
# Display the unique values of the columns Sale_rating, Ship_on_time and Chat_response
unique_sale_rating, unique_ship_on_time, unique_chat_response = (
    new_data[column_name].unique()
    for column_name in ('Sale_rating', 'Ship_on_time', 'Chat_response')
)

# One print per column, each followed by a blank line
for distinct_values in (unique_sale_rating, unique_ship_on_time, unique_chat_response):
    print(distinct_values, "\n")

['Không đủ thông tin' '93%' '90%' '94%' '95%' '99%' '96%' '98%' '100%'
 '97%' '91%' '85%' '81%' '88%' '92%' 'Nhà bán hàng mới' '87%' nan '73%'
 '77%' '89%' '60%'] 

['Không đủ thông tin' '92%' '95%' '93%' '99%' '100%' '88%' '85%' '91%'
 '96%' '98%' '94%' '3%' '79%' '82%' '89%' nan '97%' '74%' '33%' '90%'
 '41%' '81%' '84%' '76%' '29%' '71%' '66%' '25%' '87%' '13%' '50%' '72%'
 '75%' '44%' '83%'] 

['Không đủ thông tin' '100%' '92%' '99%' '50%' '97%' '87%' '78%' '80%'
 '46%' '66%' '34%' '94%' '91%' '90%' '83%' '86%' '71%' nan '88%' '76%'
 '93%' '98%' '33%' '84%' '44%' '95%' '75%' '72%' '40%' '70%' '85%' '96%'] 



- We will replace the values **Không đủ thông tin** ("insufficient information") and **Nhà bán hàng mới** ("new seller") with NULL.

In [None]:
# Turn the seller-metric placeholder strings into missing values.
#
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `new_data[col]`: that form mutates an
# intermediate Series obtained by indexing, which pandas flags as chained
# in-place assignment and which leaves `new_data` unchanged once
# Copy-on-Write is active.
new_data['Sale_rating'] = new_data['Sale_rating'].replace(
    ['Nhà bán hàng mới', 'Không đủ thông tin'], np.nan
)
new_data['Ship_on_time'] = new_data['Ship_on_time'].replace(['Không đủ thông tin'], np.nan)
new_data['Chat_response'] = new_data['Chat_response'].replace(['Không đủ thông tin'], np.nan)

In [None]:
# Check the columns Sale_rating, Ship_on_time and Chat_response again
unique_sale_rating = new_data['Sale_rating'].unique()
unique_ship_on_time = new_data['Ship_on_time'].unique()
unique_chat_response = new_data['Chat_response'].unique()

# Emit the three arrays in one call; every array is followed by " \n" and a
# newline, which is the same text three separate print(x, "\n") calls produce.
print(
    *(f"{values} \n" for values in (unique_sale_rating, unique_ship_on_time, unique_chat_response)),
    sep="\n",
)

[nan '93%' '90%' '94%' '95%' '99%' '96%' '98%' '100%' '97%' '91%' '85%'
 '81%' '88%' '92%' '87%' '73%' '77%' '89%' '60%'] 

[nan '92%' '95%' '93%' '99%' '100%' '88%' '85%' '91%' '96%' '98%' '94%'
 '3%' '79%' '82%' '89%' '97%' '74%' '33%' '90%' '41%' '81%' '84%' '76%'
 '29%' '71%' '66%' '25%' '87%' '13%' '50%' '72%' '75%' '44%' '83%'] 

[nan '100%' '92%' '99%' '50%' '97%' '87%' '78%' '80%' '46%' '66%' '34%'
 '94%' '91%' '90%' '83%' '86%' '71%' '88%' '76%' '93%' '98%' '33%' '84%'
 '44%' '95%' '75%' '72%' '40%' '70%' '85%' '96%'] 



In [None]:
# NOTE (original author, translated): whoever handles the jewelry, laptop,
# phone or TV data should add this part too, so the 1-to-5-star columns are
# turned into numbers and stay consistent with the steps above.

def extract_number(val):
    """Strip list-literal wrapper characters from a string cell.

    Leading and trailing ``[``, ``]`` and ``'`` characters are removed from
    string inputs. The result is still a string; no numeric cast happens here.
    Any non-string input (floats, NaN, None, ...) is returned unchanged.
    """
    return val.strip("[]'") if isinstance(val, str) else val

# Apply extract_number to every star-rating column.
#
# The columns are addressed by name rather than through the positional slice
# `new_data.columns[11:16]`: a positional slice silently cleans the wrong
# columns as soon as a column is added, dropped or reordered upstream.
star_columns = ['One_star', 'Two_star', 'Three_star', 'Four_star', 'Five_star']
for column in star_columns:
    new_data[column] = new_data[column].apply(extract_number)

In [None]:
# Replace the literal strings 'None' and 'none' with a missing-value marker first
new_data.replace(to_replace=['None', 'none'], value=pd.NA, inplace=True)

In [None]:
# Fill missing values in the star-rating columns with 0.
#
# `new_data.iloc[:, i].fillna(0, inplace=True)` fills a temporary Series
# produced by the indexing step, so the frame itself is not guaranteed to be
# updated (and is not under Copy-on-Write). Selecting the columns by name and
# assigning the filled result back always updates `new_data`, and does not
# depend on the columns sitting at positions 11-15.
star_columns = ['One_star', 'Two_star', 'Three_star', 'Four_star', 'Five_star']
new_data[star_columns] = new_data[star_columns].fillna(0)

In [None]:
# Show the dtype pandas currently holds for each column
column_dtypes = new_data.dtypes
print(column_dtypes)

Type               object
Price_sale         object
Sale_off           object
Total_sold         object
Preview            object
Location           object
Price_original     object
Ship_price         object
Sale_rating        object
Ship_on_time       object
Chat_response      object
One_star          float64
Two_star          float64
Three_star        float64
Four_star         float64
Five_star         float64
dtype: object


In [None]:
# Display the frame in its current state (pandas abbreviates long frames when rendering)
new_data

Unnamed: 0,Type,Price_sale,Sale_off,Total_sold,Preview,Location,Price_original,Ship_price,Sale_rating,Ship_on_time,Chat_response,One_star,Two_star,Three_star,Four_star,Five_star
0,giày dép,174 ₫,,494 Đã bán,,Nước ngoài,,,,,,0.0,0.0,0.0,0.0,0.0
1,giày dép,371 ₫,40% Off,319 Đã bán,,Nước ngoài,618 ₫,17.000 ₫,,,,0.0,0.0,0.0,0.0,0.0
2,giày dép,32.000 ₫,54% Off,83 Đã bán,(20),Hồ Chí Minh,69.000 ₫,16.500 ₫,93%,92%,100%,0.0,0.0,1.0,0.0,19.0
3,giày dép,12.000 ₫,73% Off,640 Đã bán,(173),Nước ngoài,45.000 ₫,,90%,,100%,4.0,2.0,3.0,5.0,159.0
4,giày dép,383 ₫,40% Off,250 Đã bán,,Nước ngoài,638 ₫,17.000 ₫,,,,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,giày dép,89.000 ₫,26% Off,3.2K Đã bán,(872),Hà Nội,120.000 ₫,38.000 ₫,95%,100%,100%,11.0,10.0,25.0,54.0,772.0
1196,giày dép,118.505 ₫,65% Off,,(1),Nước ngoài,343.400 ₫,,99%,,100%,0.0,0.0,0.0,0.0,1.0
1197,giày dép,134.100 ₫,58% Off,18 Đã bán,(9),Hồ Chí Minh,320.000 ₫,17.000 ₫,95%,94%,100%,0.0,0.0,0.0,0.0,9.0
1198,giày dép,56.500 ₫,50% Off,,,Nước ngoài,113.000 ₫,17.000 ₫,96%,94%,98%,0.0,0.0,0.0,0.0,0.0


In [None]:
# Normalise the category labels: 'quần áo nam' becomes 'quần áo', 'giày' becomes 'giày dép'
type_aliases = {'quần áo nam': 'quần áo', 'giày': 'giày dép'}
new_data['Type'] = new_data['Type'].replace(type_aliases)
new_data

Unnamed: 0,Type,Price_sale,Sale_off,Total_sold,Preview,Location,Price_original,Ship_price,Sale_rating,Ship_on_time,Chat_response,One_star,Two_star,Three_star,Four_star,Five_star
0,giày dép,174 ₫,,494 Đã bán,,Nước ngoài,,,,,,0.0,0.0,0.0,0.0,0.0
1,giày dép,371 ₫,40% Off,319 Đã bán,,Nước ngoài,618 ₫,17.000 ₫,,,,0.0,0.0,0.0,0.0,0.0
2,giày dép,32.000 ₫,54% Off,83 Đã bán,(20),Hồ Chí Minh,69.000 ₫,16.500 ₫,93%,92%,100%,0.0,0.0,1.0,0.0,19.0
3,giày dép,12.000 ₫,73% Off,640 Đã bán,(173),Nước ngoài,45.000 ₫,,90%,,100%,4.0,2.0,3.0,5.0,159.0
4,giày dép,383 ₫,40% Off,250 Đã bán,,Nước ngoài,638 ₫,17.000 ₫,,,,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,giày dép,89.000 ₫,26% Off,3.2K Đã bán,(872),Hà Nội,120.000 ₫,38.000 ₫,95%,100%,100%,11.0,10.0,25.0,54.0,772.0
1196,giày dép,118.505 ₫,65% Off,,(1),Nước ngoài,343.400 ₫,,99%,,100%,0.0,0.0,0.0,0.0,1.0
1197,giày dép,134.100 ₫,58% Off,18 Đã bán,(9),Hồ Chí Minh,320.000 ₫,17.000 ₫,95%,94%,100%,0.0,0.0,0.0,0.0,9.0
1198,giày dép,56.500 ₫,50% Off,,,Nước ngoài,113.000 ₫,17.000 ₫,96%,94%,98%,0.0,0.0,0.0,0.0,0.0


In [None]:
# Helper that pulls the discount text out of a promotion string
def extract_discount(promo):
    """Return the first line of ``promo`` when it mentions a discount.

    A value counts as a discount only if it is a string containing ``'% Off'``.
    In that case everything before the first ``'\\r\\n'`` is returned (the whole
    string when there is no line break). Every other input yields ``None``.
    """
    # Non-strings (NaN, None, numbers) and strings without a discount marker
    if not isinstance(promo, str) or '% Off' not in promo:
        return None
    first_line, _, _ = promo.partition('\r\n')
    return first_line

In [None]:
# Run extract_discount over every value of the 'Sale_off' column and store the result back
sale_off_cleaned = new_data['Sale_off'].map(extract_discount)
new_data['Sale_off'] = sale_off_cleaned
new_data

Unnamed: 0,Type,Price_sale,Sale_off,Total_sold,Preview,Location,Price_original,Ship_price,Sale_rating,Ship_on_time,Chat_response,One_star,Two_star,Three_star,Four_star,Five_star
0,giày dép,174 ₫,,494 Đã bán,,Nước ngoài,,,,,,0.0,0.0,0.0,0.0,0.0
1,giày dép,371 ₫,40% Off,319 Đã bán,,Nước ngoài,618 ₫,17.000 ₫,,,,0.0,0.0,0.0,0.0,0.0
2,giày dép,32.000 ₫,54% Off,83 Đã bán,(20),Hồ Chí Minh,69.000 ₫,16.500 ₫,93%,92%,100%,0.0,0.0,1.0,0.0,19.0
3,giày dép,12.000 ₫,73% Off,640 Đã bán,(173),Nước ngoài,45.000 ₫,,90%,,100%,4.0,2.0,3.0,5.0,159.0
4,giày dép,383 ₫,40% Off,250 Đã bán,,Nước ngoài,638 ₫,17.000 ₫,,,,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,giày dép,89.000 ₫,26% Off,3.2K Đã bán,(872),Hà Nội,120.000 ₫,38.000 ₫,95%,100%,100%,11.0,10.0,25.0,54.0,772.0
1196,giày dép,118.505 ₫,65% Off,,(1),Nước ngoài,343.400 ₫,,99%,,100%,0.0,0.0,0.0,0.0,1.0
1197,giày dép,134.100 ₫,58% Off,18 Đã bán,(9),Hồ Chí Minh,320.000 ₫,17.000 ₫,95%,94%,100%,0.0,0.0,0.0,0.0,9.0
1198,giày dép,56.500 ₫,50% Off,,,Nước ngoài,113.000 ₫,17.000 ₫,96%,94%,98%,0.0,0.0,0.0,0.0,0.0


In [None]:
# Replace 'Miễn phí' (free shipping) with '0 ₫' so it matches the other price strings
free_shipping_alias = {'Miễn phí': '0 ₫'}
new_data['Ship_price'] = new_data['Ship_price'].replace(free_shipping_alias)
new_data

Unnamed: 0,Type,Price_sale,Sale_off,Total_sold,Preview,Location,Price_original,Ship_price,Sale_rating,Ship_on_time,Chat_response,One_star,Two_star,Three_star,Four_star,Five_star
0,giày dép,174 ₫,,494 Đã bán,,Nước ngoài,,,,,,0.0,0.0,0.0,0.0,0.0
1,giày dép,371 ₫,40% Off,319 Đã bán,,Nước ngoài,618 ₫,17.000 ₫,,,,0.0,0.0,0.0,0.0,0.0
2,giày dép,32.000 ₫,54% Off,83 Đã bán,(20),Hồ Chí Minh,69.000 ₫,16.500 ₫,93%,92%,100%,0.0,0.0,1.0,0.0,19.0
3,giày dép,12.000 ₫,73% Off,640 Đã bán,(173),Nước ngoài,45.000 ₫,,90%,,100%,4.0,2.0,3.0,5.0,159.0
4,giày dép,383 ₫,40% Off,250 Đã bán,,Nước ngoài,638 ₫,17.000 ₫,,,,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,giày dép,89.000 ₫,26% Off,3.2K Đã bán,(872),Hà Nội,120.000 ₫,38.000 ₫,95%,100%,100%,11.0,10.0,25.0,54.0,772.0
1196,giày dép,118.505 ₫,65% Off,,(1),Nước ngoài,343.400 ₫,,99%,,100%,0.0,0.0,0.0,0.0,1.0
1197,giày dép,134.100 ₫,58% Off,18 Đã bán,(9),Hồ Chí Minh,320.000 ₫,17.000 ₫,95%,94%,100%,0.0,0.0,0.0,0.0,9.0
1198,giày dép,56.500 ₫,50% Off,,,Nước ngoài,113.000 ₫,17.000 ₫,96%,94%,98%,0.0,0.0,0.0,0.0,0.0


In [None]:
# Summarise the frame: per-column non-null counts and dtypes, plus memory usage
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Type            1200 non-null   object 
 1   Price_sale      1200 non-null   object 
 2   Sale_off        1093 non-null   object 
 3   Total_sold      1130 non-null   object 
 4   Preview         1103 non-null   object 
 5   Location        1200 non-null   object 
 6   Price_original  1016 non-null   object 
 7   Ship_price      777 non-null    object 
 8   Sale_rating     1046 non-null   object 
 9   Ship_on_time    741 non-null    object 
 10  Chat_response   953 non-null    object 
 11  One_star        1200 non-null   float64
 12  Two_star        1200 non-null   float64
 13  Three_star      1200 non-null   float64
 14  Four_star       1200 non-null   float64
 15  Five_star       1200 non-null   float64
dtypes: float64(5), object(11)
memory usage: 150.1+ KB


In [None]:
# Write the processed frame to `name_output`, without the index column.
# 'utf-8-sig' prepends a BOM to the UTF-8 output.
new_data.to_csv(
    path_or_buf=name_output,
    index=False,
    encoding='utf-8-sig',
)