In [34]:
import pandas as pd
import re

In [35]:
df_camera_back = pd.read_csv('camera_back.csv')
print(df_camera_back.head())

   product_id                                        camera_back
0       59254  Camera chính: 48MP, f/1.6, 26mm, Focus Pixels ...
1       76439  Camera chính: 50MP, 23mm, ƒ/1 7 Cảm biến hình ...
2       59294  Camera chính: 200MP, Laser AF, OISCamera: 50MP...
3       59257  Camera chính: 48MP, f/1.78, 24mm, chống rung q...
4       59258  Camera chính: 48MP, f/1.78, 24mm, 2µm, chống r...


In [36]:
# Điều chỉnh quá trình trích xuất để xử lý các loại camera dựa trên các từ khóa tiếng Việt cụ thể
def extract_camera_by_vn_type(camera_string):
    camera_info = {
        'main_camera_mp': None,
        'wide_camera_mp': None,
        'tele_camera_mp': None,
        'macro_camera_mp': None,
        'main_camera_focal_length': None,
        'wide_camera_focal_length': None,
        'tele_camera_focal_length': None,
        'macro_camera_focal_length': None,
        'main_camera_aperture': None,
        'wide_camera_aperture': None,
        'tele_camera_aperture': None,
        'macro_camera_aperture': None,
    }

    # Tách chuỗi camera dựa trên dấu phẩy để xử lý từng phần riêng lẻ
    parts = camera_string.split(',')
    
    # Mẫu để khớp megapixel, tiêu cự và khẩu độ
    megapixel_pattern = r"(\d+MP)"
    focal_length_pattern = r"(\d+mm)"
    aperture_pattern = r"(f/[\d.]+|ƒ/[\d.]+)"

    # Duyệt qua từng phần để phát hiện loại camera và dữ liệu liên quan
    for part in parts:
        megapixel_match = re.search(megapixel_pattern, part)
        focal_length_match = re.search(focal_length_pattern, part)
        aperture_match = re.search(aperture_pattern, part)
        
        if 'góc siêu rộng' in part.lower() or 'góc rộng' in part.lower():
            camera_info['wide_camera_mp'] = megapixel_match.group(1) if megapixel_match else None
            camera_info['wide_camera_focal_length'] = focal_length_match.group(1) if focal_length_match else None
            camera_info['wide_camera_aperture'] = aperture_match.group(1) if aperture_match else None
        elif 'tele' in part.lower():
            camera_info['tele_camera_mp'] = megapixel_match.group(1) if megapixel_match else None
            camera_info['tele_camera_focal_length'] = focal_length_match.group(1) if focal_length_match else None
            camera_info['tele_camera_aperture'] = aperture_match.group(1) if aperture_match else None
        elif 'macro' in part.lower():
            camera_info['macro_camera_mp'] = megapixel_match.group(1) if megapixel_match else None
            camera_info['macro_camera_focal_length'] = focal_length_match.group(1) if focal_length_match else None
            camera_info['macro_camera_aperture'] = aperture_match.group(1) if aperture_match else None
        elif 'chính' in part.lower():
            camera_info['main_camera_mp'] = megapixel_match.group(1) if megapixel_match else None
            camera_info['main_camera_focal_length'] = focal_length_match.group(1) if focal_length_match else None
            camera_info['main_camera_aperture'] = aperture_match.group(1) if aperture_match else None

    return camera_info

# Áp dụng hàm đã cập nhật vào cột camera_primary
df_camera_back['camera_back'] = df_camera_back['camera_back'].fillna('').astype(str)
camera_details = df_camera_back['camera_back'].apply(extract_camera_by_vn_type)

# Chuyển đổi các chi tiết đã trích xuất thành DataFrame và nối nó với dữ liệu gốc
camera_details_df = pd.DataFrame(list(camera_details))
camera_data_split_corrected_vn = pd.concat([df_camera_back, camera_details_df], axis=1)

# Hiển thị dữ liệu đã được chỉnh sửa với trích xuất từ khóa tiếng Việt
camera_data_split_corrected_vn.head()


Unnamed: 0,product_id,camera_back,main_camera_mp,wide_camera_mp,tele_camera_mp,macro_camera_mp,main_camera_focal_length,wide_camera_focal_length,tele_camera_focal_length,macro_camera_focal_length,main_camera_aperture,wide_camera_aperture,tele_camera_aperture,macro_camera_aperture
0,59254,"Camera chính: 48MP, f/1.6, 26mm, Focus Pixels ...",48MP,12MP,,,,,,,,,,
1,76439,"Camera chính: 50MP, 23mm, ƒ/1 7 Cảm biến hình ...",50MP,12MP,50MP,,,,,,,ƒ/1,ƒ/1,
2,59294,"Camera chính: 200MP, Laser AF, OISCamera: 50MP...",200MP,10MP,,,,,,,,,,
3,59257,"Camera chính: 48MP, f/1.78, 24mm, chống rung q...",48MP,48MP,12MP,,,,,,,ƒ/1.6,,
4,59258,"Camera chính: 48MP, f/1.78, 24mm, 2µm, chống r...",48MP,48MP,12MP,,,,,,,ƒ/1.6,,


In [37]:
# main_camera_mp
camera_data_split_corrected_vn['main_camera_mp'] = camera_data_split_corrected_vn['main_camera_mp'].str.replace('MP', '', regex=False)

# wide_camera_mp
camera_data_split_corrected_vn['wide_camera_mp'] = camera_data_split_corrected_vn['wide_camera_mp'].str.replace('MP', '', regex=False)

# macro_camera_mp
camera_data_split_corrected_vn['macro_camera_mp'] = camera_data_split_corrected_vn['macro_camera_mp'].str.replace('MP', '', regex=False)

# tele_camera_mp
camera_data_split_corrected_vn['tele_camera_mp'] = camera_data_split_corrected_vn['tele_camera_mp'].str.replace('MP', '', regex=False)

# main_camera_focal_length
camera_data_split_corrected_vn['main_camera_focal_length'] = camera_data_split_corrected_vn['main_camera_focal_length'].str.replace('mm', '', regex=False)

# wide_camera_focal_length
camera_data_split_corrected_vn['wide_camera_focal_length'] = camera_data_split_corrected_vn['wide_camera_focal_length'].str.replace('mm', '', regex=False)

# tele_camera_focal_length
camera_data_split_corrected_vn['tele_camera_focal_length'] = camera_data_split_corrected_vn['tele_camera_focal_length'].str.replace('mm', '', regex=False)

# macro_camera_focal_length
camera_data_split_corrected_vn['macro_camera_focal_length'] = camera_data_split_corrected_vn['macro_camera_focal_length'].str.replace('mm', '', regex=False)

# main_camera_aperture
camera_data_split_corrected_vn['main_camera_aperture'] = camera_data_split_corrected_vn['main_camera_aperture'].str.replace('f/', '', regex=False).str.replace('ƒ/', '', regex=False)

# wide_camera_aperture
camera_data_split_corrected_vn['wide_camera_aperture'] = camera_data_split_corrected_vn['wide_camera_aperture'].str.replace('f/', '', regex=False).str.replace('ƒ/', '', regex=False)

# tele_camera_aperture
camera_data_split_corrected_vn['tele_camera_aperture'] = camera_data_split_corrected_vn['tele_camera_aperture'].str.replace('f/', '', regex=False).str.replace('ƒ/', '', regex=False)

# macro_camera_aperture
camera_data_split_corrected_vn['macro_camera_aperture'] = camera_data_split_corrected_vn['macro_camera_aperture'].str.replace('f/', '', regex=False).str.replace('ƒ/', '', regex=False)

# Remove leading single quotes from all columns
camera_data_split = camera_data_split_corrected_vn.applymap(lambda x: x.lstrip("'") if isinstance(x, str) else x)

# Display the cleaned data
camera_data_split.head()

  camera_data_split = camera_data_split_corrected_vn.applymap(lambda x: x.lstrip("'") if isinstance(x, str) else x)


Unnamed: 0,product_id,camera_back,main_camera_mp,wide_camera_mp,tele_camera_mp,macro_camera_mp,main_camera_focal_length,wide_camera_focal_length,tele_camera_focal_length,macro_camera_focal_length,main_camera_aperture,wide_camera_aperture,tele_camera_aperture,macro_camera_aperture
0,59254,"Camera chính: 48MP, f/1.6, 26mm, Focus Pixels ...",48,12,,,,,,,,,,
1,76439,"Camera chính: 50MP, 23mm, ƒ/1 7 Cảm biến hình ...",50,12,50.0,,,,,,,1.0,1.0,
2,59294,"Camera chính: 200MP, Laser AF, OISCamera: 50MP...",200,10,,,,,,,,,,
3,59257,"Camera chính: 48MP, f/1.78, 24mm, chống rung q...",48,48,12.0,,,,,,,1.6,,
4,59258,"Camera chính: 48MP, f/1.78, 24mm, 2µm, chống r...",48,48,12.0,,,,,,,1.6,,


In [38]:
camera_data_split = camera_data_split.drop(columns=['camera_back'])
camera_data_split.head()

Unnamed: 0,product_id,main_camera_mp,wide_camera_mp,tele_camera_mp,macro_camera_mp,main_camera_focal_length,wide_camera_focal_length,tele_camera_focal_length,macro_camera_focal_length,main_camera_aperture,wide_camera_aperture,tele_camera_aperture,macro_camera_aperture
0,59254,48,12,,,,,,,,,,
1,76439,50,12,50.0,,,,,,,1.0,1.0,
2,59294,200,10,,,,,,,,,,
3,59257,48,48,12.0,,,,,,,1.6,,
4,59258,48,48,12.0,,,,,,,1.6,,


In [39]:
# Đổi tên các cột
camera_data_split.rename(columns={
    'main_camera_mp': 'main_camera_mp (MP)',
    'wide_camera_mp': 'wide_camera_mp (MP)',
    'macro_camera_mp': 'macro_camera_mp (MP)',
    'tele_camera_mp': 'tele_camera_mp (MP)',
    'main_camera_focal_length': 'main_camera_focal_length (mm)',
    'wide_camera_focal_length': 'wide_camera_focal_length (mm)',
    'tele_camera_focal_length': 'tele_camera_focal_length (mm)',
    'macro_camera_focal_length': 'macro_camera_focal_length (mm)',
    'main_camera_aperture': 'main_camera_aperture (f/)',
    'wide_camera_aperture': 'wide_camera_aperture (f/)',
    'tele_camera_aperture': 'tele_camera_aperture (f/) ',
    'macro_camera_aperture': 'macro_camera_aperture (f/)',
}, inplace=True)


In [40]:
camera_data_split.fillna('N/A', inplace=True)
camera_data_split.head()

Unnamed: 0,product_id,main_camera_mp (MP),wide_camera_mp (MP),tele_camera_mp (MP),macro_camera_mp (MP),main_camera_focal_length (mm),wide_camera_focal_length (mm),tele_camera_focal_length (mm),macro_camera_focal_length (mm),main_camera_aperture (f/),wide_camera_aperture (f/),tele_camera_aperture (f/),macro_camera_aperture (f/)
0,59254,48,12,,,,,,,,,,
1,76439,50,12,50.0,,,,,,,1.0,1.0,
2,59294,200,10,,,,,,,,,,
3,59257,48,48,12.0,,,,,,,1.6,,
4,59258,48,48,12.0,,,,,,,1.6,,


In [41]:
# Save the modified DataFrame to a new CSV file
camera_data_split.to_csv('camera_back_processed.csv', index=False, encoding='utf-8-sig')