### 使用流程
1. 爬取資料
2. 按類別清理資料

In [21]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import warnings
import sqlite3  
output_path='./output/'
db_name = 'data/sqlchain.db'
conn = sqlite3.connect(db_name)  

In [22]:
# 基本設定
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format',lambda x: '%.2f' %x)
warnings.filterwarnings("ignore")

In [23]:
# 初始化空的DataFrame
df_output = pd.DataFrame(columns=['class', 'name', 'price'])

### 1.爬取資料

In [26]:
# 獲取今天日期
today_date = datetime.today().strftime('%Y-%m-%d')

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'}

#爬蟲
def item_crawler(df, value, class_string):
    res = requests.get("https://coolpc.com.tw/evaluate.php", headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    
    data_list = []  # 用於存儲清單

    for item in soup.select('#tbdy > tr:nth-child('+str(value)+')'):
        for opt in item.select('td:nth-child(3) > select'):
            for opt_item in opt.find_all(value=True, disabled=False):
                total_result = re.sub(r"共有.*\n", "", opt_item.text, 0, re.MULTILINE)
                blank_result = re.sub(r"^\s*\n", "", total_result, 0, re.MULTILINE)
                if len(blank_result) != 0:
                    name_string = blank_result.split(',')[0]
                    price_string = blank_result.split("$").pop().split(" ")[0]
                    data_list.append({'class': class_string, 'name': name_string, 'price': price_string})
    
    # 轉換DataFrame
    if data_list:
        df_temp = pd.DataFrame(data_list)
        return pd.concat([df, df_temp], ignore_index=True)
    else:
        return df  

# 爬取不同品類資料
df_output_1=item_crawler(df_output, 4, "處理器 CPU")
df_output_2=item_crawler(df_output, 5, "主機板 MB")
df_output_3=item_crawler(df_output, 6, "記憶體 RAM")
df_output_4=item_crawler(df_output, 7, "固態硬碟 M.2｜SSD")
df_output_5=item_crawler(df_output, 8, "傳統內接硬碟 HDD")
df_output_6=item_crawler(df_output, 12, "顯示卡 VGA")
df_output_7=item_crawler(df_output,10, "散熱器｜散熱墊｜散熱膏")
df_output_8=item_crawler(df_output,15, "電源供應器")
df_output_9=item_crawler(df_output,14, "機殼 CASE")
df_output_10=item_crawler(df_output,13, "螢幕｜投影機｜壁掛")
#合併資料
df=pd.concat([df_output_1,df_output_2,df_output_3,df_output_4,df_output_5,df_output_6,df_output_7,df_output_8,df_output_9,df_output_10])
df['etl_date']=today_date


## 2.清理資料

### 處理CPU資料

In [38]:
df_cpu=df.loc[df['class']=='處理器 CPU']
import re

# Function to extract CPU model, brand, and processing performance
def extract_cpu_details(row):
    # Define regex patterns for extracting the details
    model_pattern = r'(Intel\s+[\w-]+|AMD\s+[\w-]+)'  # Example: Intel i3-14100F or AMD Ryzen 5 5600X
    brand_pattern = r'(微星|華碩|技嘉|藍寶|七彩虹|Intel|AMD)'  # Example brands in Chinese
    performance_pattern_general = r'\d+核/\d+緒\s*[\d.]*GHz|\d+核/\d+緒'  # Generalized to capture core/thread and frequency

    
    model = re.search(model_pattern, row['name'])
    brand = re.search(brand_pattern, row['name'])
    performance = re.search(performance_pattern_general, row['name'])

    model = model.group(0) if model else None
    brand = brand.group(0) if brand else None
    performance = performance.group(0).replace(' ', '') if performance else None
    return pd.Series([model, brand, performance])

# Apply the function to the dataframe
df_cpu[['model','brand','performance']]=df_cpu.apply(extract_cpu_details, axis=1)
print('筆數',df_cpu.shape[0])
df_cpu.head()

筆數 80


Unnamed: 0,class,name,price,etl_date,model,brand,performance
0,處理器 CPU,酷碼 MasterLiquid 240L Core ARGB 水冷/厚:5.2/S型雙腔冷頭...,2190,2024-03-17,,,
1,處理器 CPU,酷碼 MasterLiquid 360L Core ARGB 水冷/厚:5.2/S型雙腔冷頭...,2490,2024-03-17,,,
2,處理器 CPU,微星 MAG CoreLiquid P240 /鏡面烤漆冷頭/厚:5.2(註冊3+2年) 任...,1790,2024-03-17,,微星,
3,處理器 CPU,Intel Processor 300【2核/4緒】3.9GHz/6M/UHD710/46W...,2990,2024-03-17,Intel Processor,Intel,2核/4緒
4,處理器 CPU,Intel i3-14100F【4核/8緒】3.5GHz(↑4.7GHz)/12M/無內顯/...,3990,2024-03-17,Intel i3-14100F,Intel,4核/8緒


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [39]:
#清理時脈的資料
def extract_clock_speeds_refined(name):
    # Update regular expression to more accurately capture clock speeds, attempting to exclude model numbers
    # This includes looking for patterns that are more likely to represent clock speeds, such as numbers followed by "GHz" or "G" not directly preceded by other digits
    matches = re.findall(r"(?:[^0-9]|^)(\d+(?:\.\d+)?)(GHz|G)(?:\(↑(\d+(?:\.\d+)?)G\))?", name)
    base_clock, boost_clock = None, None

    for match in matches:
        clock_speed, unit, boost = match
        clock_speed = float(clock_speed)

        if unit in ["GHz", "G"]:  # Handle both "GHz" and simplified "G" units
            if base_clock is None:
                base_clock = clock_speed
            # Update boost clock if explicitly mentioned or if the found clock speed is higher than the current boost clock
            if boost:
                boost_clock_val = float(boost)
                if boost_clock is None or boost_clock_val > boost_clock:
                    boost_clock = boost_clock_val
            elif clock_speed > base_clock:
                boost_clock = clock_speed

    # Ensure boost_clock is at least equal to base_clock if defined
    if boost_clock is None and base_clock is not None:
        boost_clock = base_clock

    return pd.Series([base_clock, boost_clock])

df_cpu[['base_clock', 'boost_clock']] = df_cpu['name'].apply(extract_clock_speeds_refined)
df_cpu[['name', 'base_clock', 'boost_clock']].head()

Unnamed: 0,name,base_clock,boost_clock
0,酷碼 MasterLiquid 240L Core ARGB 水冷/厚:5.2/S型雙腔冷頭...,,
1,酷碼 MasterLiquid 360L Core ARGB 水冷/厚:5.2/S型雙腔冷頭...,,
2,微星 MAG CoreLiquid P240 /鏡面烤漆冷頭/厚:5.2(註冊3+2年) 任...,,
3,Intel Processor 300【2核/4緒】3.9GHz/6M/UHD710/46W...,3.9,3.9
4,Intel i3-14100F【4核/8緒】3.5GHz(↑4.7GHz)/12M/無內顯/...,3.5,4.7


### 資料清理

In [40]:
# 定義需要移除的關鍵字列表
print('資料清理前筆數',df_cpu.shape[0])
keywords_to_remove = ['【代理盒】', '【搭K版CPU專案】','限與K版CPU同發票','任搭CPU','【任搭版支持價】','【搭K版CPU專案】','代理盒裝','(需跟主機板在同一張發票開立)','【搭7950X3D省$300】','【搭7950X3D省$300】','(須跟CPU在同一張發票開立)','【搭7950X3D省$2000】','【搭AMD 8系列省$500】']

# 使用正則表達式匹配任何一個關鍵字
pattern = '|'.join(keywords_to_remove)

# 在`name`欄位中將匹配到的關鍵字替換為空字串
df_cpu['name'] = df_cpu['name'].str.replace(pattern, '', regex=True)

資料清理前筆數 80


In [41]:
# 定義需要移除的關鍵字列表
keywords_to_remove = ['狂專案', '代理盒','CPU專案','發票','任搭','特價','省','搭']

# 建立正則表達式，匹配包含任一指定關鍵字的【】大括號及其內容
# 這裡使用了正則表達式的非貪婪匹配和前瞻（lookahead）來確保只匹配包含指定關鍵字的大括號內容
pattern = r'\【.*?(' + '|'.join(keywords_to_remove) + r').*?\】'

# 在`name`欄位中將匹配到的部分替換為空字串
df_cpu['name'] = df_cpu['name'].str.replace(pattern, '', regex=True)

In [42]:
# 定義需要移除的關鍵字列表
keywords_to_remove = ['任搭CPU', '優惠價','限與K版U同發票']

# 使用正則表達式匹配任何一個關鍵字
pattern = '|'.join(keywords_to_remove)

# 在`name`欄位中將匹配到的關鍵字替換為空字串
df_cpu['name'] = df_cpu['name'].str.replace(pattern, '', regex=True)
#清除model資料為空的row data
df_cpu=df_cpu.loc[~df_cpu['model'].isna()]
print('資料清理後筆數',df_cpu.shape[0])

資料清理後筆數 55


In [43]:
df_cpu.head()

Unnamed: 0,class,name,price,etl_date,model,brand,performance,base_clock,boost_clock
3,處理器 CPU,Intel Processor 300【2核/4緒】3.9GHz/6M/UHD710/46W,2990,2024-03-17,Intel Processor,Intel,2核/4緒,3.9,3.9
4,處理器 CPU,Intel i3-14100F【4核/8緒】3.5GHz(↑4.7GHz)/12M/無內顯/...,3990,2024-03-17,Intel i3-14100F,Intel,4核/8緒,3.5,4.7
5,處理器 CPU,Intel i3-14100【4核/8緒】3.5GHz(↑4.7GHz)/12M/UHD73...,4800,2024-03-17,Intel i3-14100,Intel,4核/8緒,3.5,4.7
6,處理器 CPU,Intel i5-14400F【10核/16緒】2.5GHz(↑4.7G)/20M/無內顯/...,6800,2024-03-17,Intel i5-14400F,Intel,10核/16緒,2.5,4.7
7,處理器 CPU,Intel i5-14400【10核/16緒】2.5GHz(↑4.7G)/20M/UHD73...,7600,2024-03-17,Intel i5-14400,Intel,10核/16緒,2.5,4.7


In [44]:
#匯出資料
df_cpu.to_csv(output_path+'cpu.csv',index=False,encoding='utf-8-sig')

### 處理GPU資料

In [57]:
# Filter the dataframe for GPU entries
df_gpu = df.loc[df['class'] == '顯示卡 VGA']
df_gpu=df_gpu.loc[(~df_gpu['name'].str.contains('支撐架'))&(~df_gpu['name'].str.contains('顯示盒'))&(~df_gpu['name'].str.contains('ARC'))&(~df_gpu['name'].str.contains('Arc'))&(~df_gpu['name'].str.contains('Phantom'))]
# Re-define the regex patterns with adjustments to improve model capture
gpu_model_pattern = r'(NVIDIA\s*(RTX|GTX|GT)\s*\d+|AMD\s*(RX|Vega|Navi)\s*\d+|RTX\s*\d+|RX\s*\d+|GTX\s*\d+|GT\s*\d+)' # Adjusted for potential spacing variations
gpu_brand_pattern = r'(華碩|微星|技嘉|藍寶|七彩虹|ZOTAC|EVGA|麗臺|蓋酷)'  # Capture popular GPU brands
memory_pattern = r'\b[O0C]*\d+G(B)?'  # Memory size, e.g., "8GB"
core_frequency_pattern = r'\b\d+MHz|\b\d+GHz'  # Core frequency, e.g., "2595MHz"
cuda_cores_pattern = r'\bCUDA:\d+'  # CUDA cores, specifically for NVIDIA GPUs, e.g., "CUDA:3328"

# Re-define the function to extract and normalize GPU details with improved pattern
def normalize_gpu_details(row):
    model = re.search(gpu_model_pattern, row['name'])
    brand = re.search(gpu_brand_pattern, row['name'])
    memory = re.search(memory_pattern, row['name'])
    core_frequency = re.search(core_frequency_pattern, row['name'])
    cuda_cores = re.search(cuda_cores_pattern, row['name'])

    # Normalize extracted data
    id=model.group(0) if model else 'Unknown'
    model = model.group(0) if model else 'Unknown'
    brand = brand.group(0) if brand else 'Unknown'
    memory = memory.group(0) if memory else '0GB'
    core_frequency = core_frequency.group(0) if core_frequency else '0MHz'
    cuda_cores = cuda_cores.group(0) if cuda_cores else 'CUDA:0'
    
    return pd.Series([model, brand, memory, core_frequency, cuda_cores], index=['model', 'brand', 'memory', 'core_frequency', 'CUDA cores'])

# Apply the improved extraction function to the GPU dataframe
df_gpu[['model', 'brand', 'memory', 'corefrequency', 'cudacores']] = df_gpu.apply(normalize_gpu_details, axis=1)
print('清理完後資料筆數',df_gpu.shape[0])
df_gpu.head()

清理完後資料筆數 270


Unnamed: 0,class,name,price,etl_date,model,brand,memory,corefrequency,cudacores
13,顯示卡 VGA,微星 N210-MD1G/D3(589MHz/1G DDR3/風扇版/14.5cm/三年保),1250,2024-03-17,Unknown,微星,1G,589MHz,CUDA:0
14,顯示卡 VGA,華碩 GT710-SL-2GD3-BRK-EVO(954MHz/2G DDR3/17cm/註...,1690,2024-03-17,GT710,華碩,2G,954MHz,CUDA:0
15,顯示卡 VGA,華碩 GT710-SL-2GD5-BRK-EVO(954MHz/2G DDR5/17cm/註...,1790,2024-03-17,GT710,華碩,2G,954MHz,CUDA:0
16,顯示卡 VGA,技嘉 N710D3-2GL(954MHz/2G DDR3/14.4cm/三年保),1790,2024-03-17,Unknown,技嘉,2G,954MHz,CUDA:0
17,顯示卡 VGA,微星 GT710 1GD3H LP(954MHz/1G DDR3/靜音版/14.6cm/三年保),1450,2024-03-17,GT710,微星,1G,954MHz,CUDA:0


In [58]:
# Extract numeric values from the 'memory' and 'corefrequency' columns
# Also, preserve the units in separate columns
# Extract numbers and units
df_gpu['memory_value'] = df_gpu['memory'].str.extract('(\d+)').astype(float) # Extract numerical values
df_gpu['memory_unit'] = df_gpu['memory'].str.extract('([a-zA-Z]+)') # Extract units

df_gpu['corefrequency_value'] = df_gpu['corefrequency'].str.extract('(\d+)').astype(float) # Extract numerical values
df_gpu['corefrequency_unit'] = df_gpu['corefrequency'].str.extract('([a-zA-Z]+)') # Extract units

# Function to extract DDR number
def extract_ddr_number_safe(name):
    try:
        # Attempt to find DDR number as before
        matches = re.findall("ddr(\d+)", name, re.IGNORECASE)
        if matches:
            return matches[0]
    except TypeError as e:
        # If any error occurs, return None and effectively skip this entry
        return None
    return None


# Extract DDR information

df_gpu['ddr'] = df_gpu['name'].apply(extract_ddr_number_safe)
# Display the modified DataFrame to verify the changes
df_gpu.drop(columns=['corefrequency'],inplace=True)
df_gpu.head()

Unnamed: 0,class,name,price,etl_date,model,brand,memory,cudacores,memory_value,memory_unit,corefrequency_value,corefrequency_unit,ddr
13,顯示卡 VGA,微星 N210-MD1G/D3(589MHz/1G DDR3/風扇版/14.5cm/三年保),1250,2024-03-17,Unknown,微星,1G,CUDA:0,1.0,G,589.0,MHz,3
14,顯示卡 VGA,華碩 GT710-SL-2GD3-BRK-EVO(954MHz/2G DDR3/17cm/註...,1690,2024-03-17,GT710,華碩,2G,CUDA:0,2.0,G,954.0,MHz,3
15,顯示卡 VGA,華碩 GT710-SL-2GD5-BRK-EVO(954MHz/2G DDR5/17cm/註...,1790,2024-03-17,GT710,華碩,2G,CUDA:0,2.0,G,954.0,MHz,5
16,顯示卡 VGA,技嘉 N710D3-2GL(954MHz/2G DDR3/14.4cm/三年保),1790,2024-03-17,Unknown,技嘉,2G,CUDA:0,2.0,G,954.0,MHz,3
17,顯示卡 VGA,微星 GT710 1GD3H LP(954MHz/1G DDR3/靜音版/14.6cm/三年保),1450,2024-03-17,GT710,微星,1G,CUDA:0,1.0,G,954.0,MHz,3


In [59]:
#匯出gpu資料
df_gpu.to_csv(output_path+'gpu.csv',index=False,encoding='utf-8-sig')

### 處理硬碟資料

In [6]:
df_hdd=df.loc[(df['class'].str.contains('固態硬碟 M.2｜SSD'))|(df['class'].str.contains('傳統內接硬碟 HDD'))]
# Define regex patterns for GPU details extraction
# Define regex patterns for extracting the desired information
brand_pattern = r"^\D+"
capacity_pattern = r"(\d+(?:GB|TB))"
read_speed_pattern_adjusted = r"讀:(\d+MB|\d+M)"
write_speed_pattern_adjusted = r"寫:(\d+MB|\d+M)"

# Update the extraction function to include drive type based on the "Class" column and the "Name" column
def extract_info_with_type(row):
    name = row['name']
    class_info = row['class']
    drive_type = "SSD" if "SSD" in class_info or "固態硬碟" in name else "HDD" if "HDD" in class_info or "傳統硬碟" in name else "Unknown"

    brand = re.match(brand_pattern, name)
    capacity = re.search(capacity_pattern, name)
    read_speed = re.search(read_speed_pattern_adjusted, name)
    write_speed = re.search(write_speed_pattern_adjusted, name)
    return {
        "brand": brand.group(0).strip() if brand else None,
        "capacity": capacity.group(1) if capacity else None,
#        "Read_Speed": read_speed.group(1).replace('M', 'MB') if read_speed and 'M' in read_speed.group(1) else read_speed.group(1) if read_speed else None,
#        "Write_Speed":  write_speed.group(1).replace('M', 'MB') if write_speed and 'M' in write_speed.group(1) else write_speed.group(1) if write_speed else None,
        "drive_type": drive_type
    }

# Apply updated extraction to the DataFrame including 'Class' information
extracted_info_with_type = df_hdd.apply(extract_info_with_type, axis=1)

# Convert the extracted info into a DataFrame
df_extracted_info_with_type = pd.DataFrame(list(extracted_info_with_type))
df_hdd.reset_index(inplace=True)
df_extracted_info_with_type.reset_index(inplace=True)
df_hdd=pd.concat([df_hdd,df_extracted_info_with_type],axis=1)
print('資料筆數',df_hdd.shape[0])
df_hdd.head()

資料筆數 242


Unnamed: 0,index,class,name,price,etl_date,index.1,brand,capacity,drive_type
0,0,固態硬碟 M.2｜SSD,UMAX S330 240GB /2.5吋 讀:520MB寫:450MB/3D NAND F...,490,2024-03-12,0,UMAX S,240GB,SSD
1,1,固態硬碟 M.2｜SSD,UMAX S330 480GB /2.5吋 讀:560MB寫:450MB/3D NAND F...,880,2024-03-12,1,UMAX S,480GB,SSD
2,2,固態硬碟 M.2｜SSD,UMAX S330 960GB /2.5吋 讀:560MB寫:500MB/3D NAND F...,1600,2024-03-12,2,UMAX S,960GB,SSD
3,3,固態硬碟 M.2｜SSD,威剛 Ultimate SU650 120G/2.5吋/讀:520M/寫:320M/TLC顆...,350,2024-03-12,3,威剛 Ultimate SU,,SSD
4,4,固態硬碟 M.2｜SSD,威剛 Ultimate SU650 240G/2.5吋/讀:520M/寫:450M/TLC顆...,599,2024-03-12,4,威剛 Ultimate SU,,SSD


In [7]:
def extract_read_write_speeds(name):
    # Pattern to match read and write speeds like "讀:7400/寫:6800" or "讀:520M/寫:450M"
    pattern = r'讀:(\d+M?)/寫:(\d+M?)'
    match = re.search(pattern, name)
    if match:
        read_speed, write_speed = match.groups()
        # Remove 'M' if present and convert to integer
        return read_speed, write_speed
    return None, None
read_write_speeds = df_hdd['name'].apply(lambda x: pd.Series(extract_read_write_speeds(x), index=['read_speed', 'write_speed']))
df_hdd=pd.concat([df_hdd,read_write_speeds],axis=1)

In [8]:
read_speed_pattern_final = r"讀[:：](\d+)(MB|M)"
write_speed_pattern_final = r"寫[:：](\d+)(MB|M)"
# function
def extract_final_read_write_speed(row):
    read_speed = re.search(read_speed_pattern_final, row['name'])
    write_speed = re.search(write_speed_pattern_final, row['name'])
    
    if read_speed:
        row['read_speed'] = int(read_speed.group(1))  
    if write_speed:
        row['write_speed'] = int(write_speed.group(1)) 
    
    return row
df_hdd = df_hdd.apply(extract_final_read_write_speed, axis=1)
df_hdd.drop(columns=['index'],inplace=True)
df_hdd.head()

Unnamed: 0,class,name,price,etl_date,brand,capacity,drive_type,read_speed,write_speed
0,固態硬碟 M.2｜SSD,UMAX S330 240GB /2.5吋 讀:520MB寫:450MB/3D NAND F...,490,2024-03-12,UMAX S,240GB,SSD,520,450
1,固態硬碟 M.2｜SSD,UMAX S330 480GB /2.5吋 讀:560MB寫:450MB/3D NAND F...,880,2024-03-12,UMAX S,480GB,SSD,560,450
2,固態硬碟 M.2｜SSD,UMAX S330 960GB /2.5吋 讀:560MB寫:500MB/3D NAND F...,1600,2024-03-12,UMAX S,960GB,SSD,560,500
3,固態硬碟 M.2｜SSD,威剛 Ultimate SU650 120G/2.5吋/讀:520M/寫:320M/TLC顆...,350,2024-03-12,威剛 Ultimate SU,,SSD,520,320
4,固態硬碟 M.2｜SSD,威剛 Ultimate SU650 240G/2.5吋/讀:520M/寫:450M/TLC顆...,599,2024-03-12,威剛 Ultimate SU,,SSD,520,450


In [9]:
# Define a function to extract model information based on patterns observed in the 'name' column
def extract_model(name):
    # Attempt to extract model based on observed patterns
    # The pattern is designed to capture a wide range of model formats
    pattern = re.compile(r'(\b[A-Za-z]+[\s-]?[A-Za-z]*\s?\d+[A-Za-z]*\b)')
    match = pattern.search(name)
    if match:
        return match.group(0).strip()
    return "Unknown"  # Return "Unknown" if no pattern matches

# Apply the function to the 'name' column to extract the model
df_hdd['model'] = df_hdd['name'].apply(extract_model)
df_hdd['id']=df_hdd['model']
df_hdd.head()

Unnamed: 0,class,name,price,etl_date,brand,capacity,drive_type,read_speed,write_speed,model,id
0,固態硬碟 M.2｜SSD,UMAX S330 240GB /2.5吋 讀:520MB寫:450MB/3D NAND F...,490,2024-03-12,UMAX S,240GB,SSD,520,450,UMAX S330,UMAX S330
1,固態硬碟 M.2｜SSD,UMAX S330 480GB /2.5吋 讀:560MB寫:450MB/3D NAND F...,880,2024-03-12,UMAX S,480GB,SSD,560,450,UMAX S330,UMAX S330
2,固態硬碟 M.2｜SSD,UMAX S330 960GB /2.5吋 讀:560MB寫:500MB/3D NAND F...,1600,2024-03-12,UMAX S,960GB,SSD,560,500,UMAX S330,UMAX S330
3,固態硬碟 M.2｜SSD,威剛 Ultimate SU650 120G/2.5吋/讀:520M/寫:320M/TLC顆...,350,2024-03-12,威剛 Ultimate SU,,SSD,520,320,Ultimate SU650,Ultimate SU650
4,固態硬碟 M.2｜SSD,威剛 Ultimate SU650 240G/2.5吋/讀:520M/寫:450M/TLC顆...,599,2024-03-12,威剛 Ultimate SU,,SSD,520,450,Ultimate SU650,Ultimate SU650


In [13]:
# 使用正則表達式提取容量的數字部分和單位
df_hdd['capacity_value'] = df_hdd['capacity'].str.extract(r'(\d+)').astype(float)
df_hdd['capacity_unit'] = df_hdd['capacity'].str.extract(r'([A-Za-z]+)')
# 將TB單位轉換為GB
df_hdd.loc[df_hdd['capacity_unit'] == 'TB', 'capacity_value'] *= 1000
df_hdd.loc[df_hdd['capacity_unit'] == 'TB', 'capacity_unit'] = 'GB'

# 再次檢視修改後的數據
df_hdd[['capacity', 'capacity_value', 'capacity_unit']].head()



Unnamed: 0,capacity,capacity_value,capacity_unit
0,240GB,240.0,GB
1,480GB,480.0,GB
2,960GB,960.0,GB
3,,,
4,,,


In [14]:
# 匯入到csv
df_hdd.to_csv(output_path+'hdd.csv',encoding='utf-8-sig',index=False)

### 處理散熱器資料

In [45]:
df_cool=df.loc[df['class']=='散熱器｜散熱墊｜散熱膏']
# Define a function to extract brand, TDP, RPM, and type from the name field
def extract_cooler_info(name):
    # Attempt to extract brand by known brands or the first word if not listed
    brands = ["酷碼", "微星", "Intel", "AMD"]
    brand = next((b for b in brands if b in name), name.split()[0])
    
    # TDP extraction - Looking for patterns like numbers followed by W (e.g., "46W")
    tdp_match = re.search(r'\d+W', name)
    tdp = tdp_match.group(0) if tdp_match else "Unknown"
    
    # RPM extraction - Typically not directly stated, might not be extractable without clear indicators
    rpm = "Unknown" # Placeholder as RPM might not be clearly stated
    
    # Type extraction based on known types
    if "水冷" in name:
        type_ = "水冷式"
    elif "氣冷" in name or "風扇" in name:
        type_ = "氣冷式"
    elif "塔散" in name:
        type_ = "塔散"
    else:
        type_ = "Unknown"
    
    return pd.Series([brand, tdp, rpm, type_], index=['Brand', 'TDP', 'RPM', 'Type'])

# Apply the function to the dataframe
df_cool[['brand', 'tdp', 'rpm', 'type']] = df_cool['name'].apply(extract_cooler_info)
df_cool['model']=df_cool['name']
print('資料筆數',df_cool.shape[0])
df_cool.head()

資料筆數 146


Unnamed: 0,class,name,price,etl_date,brand,tdp,rpm,type,model
0,散熱器｜散熱墊｜散熱膏,利民 M.2 2280 TYPE A B SSD 固態硬碟散熱片/鋁合金/單雙面皆適用,199,2024-03-17,利民,Unknown,Unknown,Unknown,利民 M.2 2280 TYPE A B SSD 固態硬碟散熱片/鋁合金/單雙面皆適用
1,散熱器｜散熱墊｜散熱膏,利民 M.2 2280 SSD 固態硬碟散熱片/鋁合金/單雙面皆適用,350,2024-03-17,利民,Unknown,Unknown,Unknown,利民 M.2 2280 SSD 固態硬碟散熱片/鋁合金/單雙面皆適用
2,散熱器｜散熱墊｜散熱膏,利民 M.2 2280 PRO SSD 固態硬碟散熱片/鋁合金+8 mm純銅導管/單雙面皆適用,400,2024-03-17,利民,Unknown,Unknown,Unknown,利民 M.2 2280 PRO SSD 固態硬碟散熱片/鋁合金+8 mm純銅導管/單雙面皆適用
3,散熱器｜散熱墊｜散熱膏,利民 HR-09 2280 SSD 固態硬碟散熱器/6 mm熱導管/電鍍鰭片/單雙面皆適用,550,2024-03-17,利民,Unknown,Unknown,Unknown,利民 HR-09 2280 SSD 固態硬碟散熱器/6 mm熱導管/電鍍鰭片/單雙面皆適用
4,散熱器｜散熱墊｜散熱膏,利民 HR-10 2280 PRO SSD 固態硬碟散熱器/4導管/3CM PWM風扇/單雙...,690,2024-03-17,利民,Unknown,Unknown,氣冷式,利民 HR-10 2280 PRO SSD 固態硬碟散熱器/4導管/3CM PWM風扇/單雙...


In [46]:
# 匯入到csv
df_cool.to_csv(output_path+'cool.csv',encoding='utf-8-sig',index=False)

### 處理記憶體資料

In [52]:
df_ram=df.loc[df['class']=='記憶體 RAM']
def extract_info_comprehensive(name):
    # Define a comprehensive list of brands, removing capacity indications from the brand field
    brands = ["UMAX", "金士頓", "美光Micron", "威剛", "KLEVV", "十銓", "十銓ELITE", "十銓T", "芝奇G", "美光", "芝奇"]
    brand = "Unknown"
    for b in brands:
        if b in name:
            # Handle special cases where the brand may be followed by capacity info
            brand = b.replace("32GB", "").replace("16GB", "").replace("64GB", "").strip()
            break
    
    # Extract module specification (e.g., DDR4, DDR5)
    module_spec_match = re.search(r"DDR\d+", name)
    module_spec = module_spec_match.group(0) if module_spec_match else "Unknown"
    
    # Extract capacity
    capacity_match = re.search(r"\d+GB", name)
    capacity = capacity_match.group(0) if capacity_match else "Unknown"
    
    return brand, module_spec, capacity

# Re-apply the updated function to the dataframe to handle all brands correctly
df_ram[['brand', 'module_Spec', 'capacity']] = df_ram.apply(lambda row: pd.Series(extract_info_comprehensive(row['name'])), axis=1)
print('資料筆數',df_ram.shape[0])
df_ram.head()

Unnamed: 0,class,name,price,etl_date,brand,module_Spec,capacity
0,記憶體 RAM,UMAX 單條8GB DDR5-5600/CL46【具XMP、EXPO參數】,749,2024-03-03,UMAX,DDR5,8GB
1,記憶體 RAM,UMAX 單條16GB DDR5-5600/CL46【具XMP、EXPO參數】,1299,2024-03-03,UMAX,DDR5,16GB
2,記憶體 RAM,UMAX 單條32GB DDR5-5600/CL46【具XMP、EXPO參數】,2599,2024-03-03,UMAX,DDR5,32GB
3,記憶體 RAM,金士頓 單條16GB DDR5-5600(CL36) FURY Beast (獸獵者)【具X...,1580,2024-03-03,金士頓,DDR5,16GB
4,記憶體 RAM,金士頓 單條16GB DDR5-5600(CL36) FURY Beast RGB (獸獵者...,1750,2024-03-03,金士頓,DDR5,16GB


In [177]:
df_ram['capacity_value'] = df_ram['capacity'].str.extract(r'(\d+)')

# Fill NaN values in 'capacity_value' with a placeholder (e.g., -1) before converting to int
df_ram['capacity_value'] = df_ram['capacity_value'].fillna(-1).astype(int)

# Extract unit from 'capacity' column
df_ram['capacity_unit'] = df_ram['capacity'].str.extract(r'([A-Za-z]+)')

In [180]:
df_ram.head()

Unnamed: 0,class,name,price,etl_date,brand,module_Spec,capacity,capacity_value,capacity_unit
0,記憶體 RAM,UMAX 單條8GB DDR5-5600/CL46【具XMP、EXPO參數】,749,2024-03-03,UMAX,DDR5,8GB,8,GB
1,記憶體 RAM,UMAX 單條16GB DDR5-5600/CL46【具XMP、EXPO參數】,1299,2024-03-03,UMAX,DDR5,16GB,16,GB
2,記憶體 RAM,UMAX 單條32GB DDR5-5600/CL46【具XMP、EXPO參數】,2599,2024-03-03,UMAX,DDR5,32GB,32,GB
3,記憶體 RAM,金士頓 單條16GB DDR5-5600(CL36) FURY Beast (獸獵者)【具X...,1580,2024-03-03,金士頓,DDR5,16GB,16,GB
4,記憶體 RAM,金士頓 單條16GB DDR5-5600(CL36) FURY Beast RGB (獸獵者...,1750,2024-03-03,金士頓,DDR5,16GB,16,GB


In [181]:
df_ram.to_csv(output_path+'ram.csv',encoding='utf-8-sig',index=False)

### 處理機殼資料

In [28]:
df_case=df.loc[df['class']=='機殼 CASE']
# Define a function to extract the desired information from the 'Name' column
def extract_case_info(name):
    # Split the name by spaces and slashes to attempt to isolate components
    parts = name.split('/')
    brand = parts[0].split(' ')[0]  # Assuming the first word is the brand
    
    # Initialize placeholders for the extracted information
    case_type = None
    size = None
    material = None
    
    # Attempt to identify case type, size, and material from the parts
    for part in parts:
        if '顯卡長' in part or 'U高' in part:
            size = part  # This part likely contains size information
        if '玻璃' in part:
            material = '玻璃'  # Identifying glass as a material
        if 'ATX' in part or 'E-ATX' in part or 'ITX' in part:
            case_type = part  # Identifying the case type
    
    return pd.Series([brand, case_type, size, material], index=['Brand', 'Case Type', 'Size', 'Material'])

# Apply the function to the 'Name' column and create new columns for the extracted information
df_case[['brand', 'casetype', 'size', 'material']] = df_case['name'].apply(extract_case_info)
print('資料筆數',df_case.shape[0])
#df_case.head(10)

資料筆數 678


In [30]:
df_case.to_csv(output_path+'chassis.csv',encoding='utf-8-sig',index=False)

In [31]:
df_case.head()

Unnamed: 0,class,name,price,etl_date,brand,casetype,size,material
0,機殼 CASE,Fractal Design Vector RS Dark TG /深色玻璃/顯卡長44/C...,3290,2024-03-17,Fractal,E-ATX 原價$6690,CPU高18.5,玻璃
1,機殼 CASE,COUGAR Duoface RGB 黑 卡33/U高19/雙面板/Logo燈效/玻璃透側/...,1990,2024-03-17,COUGAR,E-ATX 原價$2490！,U高19,玻璃
2,機殼 CASE,COUGAR Duoface RGB 白 卡33/U高19/雙面板/Logo燈效/玻璃透側/...,2090,2024-03-17,COUGAR,E-ATX 原價$2590！,U高19,玻璃
3,機殼 CASE,COUGAR CONQUER(5LMR) 顯卡長35/CPU高19/玻璃透側/ATX 特價原...,5990,2024-03-17,COUGAR,ATX 特價原價$8990！,CPU高19,玻璃
4,機殼 CASE,Apexgaming X魔幻 X1 顯卡長43.2/U高17.2/玻璃透側/E-ATX 下殺...,999,2024-03-17,Apexgaming,E-ATX 下殺原價$1990！,U高17.2,玻璃


### 處理主機板資料

In [60]:
df_motherboard=df.loc[df['class']=='主機板 MB']
# Define regex patterns for extracting the required information
model_pattern = r'([A-Z]+-\w+)'  # Generic pattern for motherboard model
size_pattern = r'(E-ATX|ATX|mATX|ITX)'  # Motherboard sizes
brand_pattern = r'(華碩|微星|技嘉|藍寶|七彩虹|ASRock|EVGA|MSI|GIGABYTE|ASUS)'  # Common motherboard brands
# Company might be the same as brand in this context, or we need specific mapping if different

# Function to extract and normalize motherboard details
def normalize_motherboard_details(row):
    model_match = re.search(model_pattern, row['name'])
    size_match = re.search(size_pattern, row['name'])
    brand_match = re.search(brand_pattern, row['name'])
    
    model = model_match.group(0) if model_match else 'Unknown'
    size = size_match.group(0) if size_match else 'Unknown'
    brand = brand_match.group(0) if brand_match else 'Unknown'
    company = brand  # Assuming company is the same as brand for simplicity
    
    return pd.Series([model, size, brand, company], index=['Model', 'Size', 'Brand', 'Company'])

df_motherboard[['model', 'size', 'brand', 'company']] = df_motherboard.apply(normalize_motherboard_details, axis=1)
print('資料筆數',df_motherboard.shape[0])
df_motherboard.head()

Unnamed: 0,class,name,price,etl_date,model,size,brand,company
0,主機板 MB,華碩 Tinker Fanless Aluminum Case 樹莓派鋁質機殼 (2/2S ...,399,2024-03-03,Unknown,Unknown,華碩,華碩
1,主機板 MB,華碩 PRO WS W680M-ACE SE(M-ATX/1A1H1P/雙Intel 2.5...,12990,2024-03-03,M-ACE,ATX,華碩,華碩
2,主機板 MB,華碩 PRO WS W680-ACE(ATX/1A1H1P/雙Intel 2.5Gb/註四年...,9990,2024-03-03,Unknown,ATX,華碩,華碩
3,主機板 MB,華碩 Pro WS W790E-SAGE SE(EEB/8*DDR5/2*Intel 10G...,39990,2024-03-03,E-SAGE,Unknown,華碩,華碩
4,主機板 MB,華擎 W790 WS(EEB/8*DDR5/2*Marvell 10G+Intel 2.5G...,34990,2024-03-03,Unknown,Unknown,Unknown,Unknown


In [62]:
df_motherboard.to_csv(output_path+'motherboard.csv',encoding='utf-8-sig',index=False)

# 處理電源的資料

In [193]:
df_battery=df.loc[df['class']=='電源供應器']
df_battery.head()

Unnamed: 0,class,name,price,etl_date
0,電源供應器,華碩 ROG STRIX 850W AURA Edition 雙8/金牌/全模組/ATX3....,4990,2024-03-03
1,電源供應器,美洲獅 COUGAR GEX 750W 雙8/金牌/全模組/主日系/5年保▼下殺到 3/31...,2590,2024-03-03
2,電源供應器,美洲獅 COUGAR GEX 1050W 雙8/金牌/全模組/主日系/7年保 原價$5490！,3990,2024-03-03
3,電源供應器,銀欣 750W 金牌/半模組/主日系/扁平線材/5年保(ET750-HG) 下殺原價$3090,2390,2024-03-03
4,電源供應器,台達 超實在 1100W 雙8/白金/全模組/ATX3.0(PCIe 5.0)/10年保 限...,4990,2024-03-03


In [202]:
# Define a function to extract power (in watts), cooling method, and model from the name field
df_battery=df.loc[df['class']=='電源供應器']
def extract_info(name):
    # Initialize a dictionary to store the extracted information
    info = {
        'power_watts': None,
        'cooling_method': None,  # Placeholder, as cooling method may not be explicit in the name
        'model': None  # Placeholder, extracting model might require specific patterns per brand
    }
    
    # Extract power in watts
    power_search = re.search(r'(\d+)W', name)
    if power_search:
        info['power_watts'] = power_search.group(1)
    
    # Example for extracting cooling method, though this may not apply directly without specific indicators
    # if '水冷' in name:
    #     info['cooling_method'] = '水冷'
    # elif '風冷' in name:
    #     info['cooling_method'] = '風冷'
    
    # Model extraction would require more specific patterns, which are not implemented here due to the variability
    
    return info
# Apply the function to the 'name' column of the dataframe
df_battery['extracted_info'] = df_battery['name'].apply(extract_info)
# Split the 'extracted_info' dictionary into separate columns in the dataframe
df_battery[['tdp', 'cooling_method', 'model']] = df_battery['extracted_info'].apply(pd.Series)
df_battery.drop(columns=['extracted_info'],inplace=True)
# Display the updated dataframe structure to confirm the changes
df_battery.head()


Unnamed: 0,class,name,price,etl_date,tdp,cooling_method,model
0,電源供應器,華碩 ROG STRIX 850W AURA Edition 雙8/金牌/全模組/ATX3....,4990,2024-03-03,850,,
1,電源供應器,美洲獅 COUGAR GEX 750W 雙8/金牌/全模組/主日系/5年保▼下殺到 3/31...,2590,2024-03-03,750,,
2,電源供應器,美洲獅 COUGAR GEX 1050W 雙8/金牌/全模組/主日系/7年保 原價$5490！,3990,2024-03-03,1050,,
3,電源供應器,銀欣 750W 金牌/半模組/主日系/扁平線材/5年保(ET750-HG) 下殺原價$3090,2390,2024-03-03,750,,
4,電源供應器,台達 超實在 1100W 雙8/白金/全模組/ATX3.0(PCIe 5.0)/10年保 限...,4990,2024-03-03,1100,,


In [204]:
df_battery.to_csv(output_path+'battery.csv',encoding='utf-8-sig',index=False)

# 處理螢幕的資料

In [33]:
df_monitor=df.loc[df['class']=='螢幕｜投影機｜壁掛']

In [34]:
# Define a function to extract brand, model, and screen resolution from the name field
def extract_monitor_info(name):
    # Initialize a dictionary to store the extracted information
    info = {
        'brand': None,
        'model': None,
        'resolution': None
    }
    
    # Attempt to extract brand and model
    # Assuming brand and model are at the beginning, separated by a space or specific characters
    brand_model_search = re.search(r'(\w+)\s+(\w+)', name)
    if brand_model_search:
        info['brand'] = brand_model_search.group(1)
        info['model'] = brand_model_search.group(2)
    
    # Attempt to extract screen resolution (e.g., 1080p, 1440p, 4K)
    resolution_search = re.search(r'(\d{3,4}p|\d+Hz)', name)
    if resolution_search:
        info['resolution'] = resolution_search.group(1)
    
    return info

# Apply the function to the 'name' column of the dataframe
df_monitor['extracted_info'] = df_monitor['name'].apply(extract_monitor_info)

# Split the 'extracted_info' dictionary into separate columns in the dataframe
df_monitor[['brand', 'model', 'hz']] = df_monitor['extracted_info'].apply(pd.Series)

# Drop the 'extracted_info' column as its data has been expanded into separate columns
df_monitor.drop(columns=['extracted_info'], inplace=True)

# Display the dataframe with the new columns to check the results
df_monitor[['name', 'brand', 'model', 'hz']].head()

Unnamed: 0,name,brand,model,hz
0,【主機搭購】BenQ GW2475H(1A2H/5ms/IPS/無喇叭)不閃屏.低藍光.護眼...,BenQ,GW2475H,
1,【主機搭購】BenQ BL2480(Plus) (1A1H1P/5ms/IPS/含喇叭)三介...,BenQ,BL2480,
2,【主機搭購】BenQ MOBIUZ EX2710S(2H1P/IPS/165Hz/含喇叭/F...,BenQ,MOBIUZ,165Hz
3,【主機搭購】BenQ MOBIUZ EX2710R(2H1P/1ms/VA曲面/165Hz/...,BenQ,MOBIUZ,165Hz
4,【主機搭購】ACER EK241Y E(1A1H/1ms/IPS/100Hz/無喇叭/Fre...,ACER,EK241Y,100Hz


In [35]:
df_monitor.to_csv(output_path+'monitor.csv',encoding='utf-8-sig',index=False)