In [10]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
import warnings
import sqlite3  
# Directory holding the scraped CSV files; it is concatenated directly with
# file names, so the trailing slash is required.
output_path = './output/'

# SQLite database file that receives the cleaned CPU / GPU records.
db_name = 'data/cpu_and_gpu.db'

# Initial connection handle. NOTE(review): the cells that write to or query
# the database call sqlite3.connect(db_name) again before using `conn`.
conn = sqlite3.connect(db_name)

In [11]:
# Basic display settings for the notebook.
# Show every column and every row when a DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# Render floats with two decimal places.
pd.options.display.float_format = lambda value: '%.2f' % value
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

### Insert CPU 的資料

In [12]:
# Load the scraped CPU listings and preview the first rows.
cpu_csv_path = f'{output_path}cpu.csv'
df_cpu = pd.read_csv(cpu_csv_path)
df_cpu.head()

Unnamed: 0,class,name,price,etl_date,model,brand,performance,base_clock,boost_clock
0,處理器 CPU,Intel Processor 300【2核/4緒】3.9GHz/6M/UHD710/46W,2990,2024-03-03,Intel Processor,Intel,2核/4緒,3.9,3.9
1,處理器 CPU,Intel i3-14100F【4核/8緒】3.5GHz(↑4.7GHz)/12M/無內顯/...,3990,2024-03-03,Intel i3-14100F,Intel,4核/8緒,3.5,4.7
2,處理器 CPU,Intel i3-14100【4核/8緒】3.5GHz(↑4.7GHz)/12M/UHD73...,4800,2024-03-03,Intel i3-14100,Intel,4核/8緒,3.5,4.7
3,處理器 CPU,Intel i5-14400F【10核/16緒】2.5GHz(↑4.7G)/20M/無內顯/...,6800,2024-03-03,Intel i5-14400F,Intel,10核/16緒,2.5,4.7
4,處理器 CPU,Intel i5-14400【10核/16緒】2.5GHz(↑4.7G)/20M/UHD73...,7650,2024-03-03,Intel i5-14400,Intel,10核/16緒,2.5,4.7


In [20]:
# Data filtering
import re

def cpu_remove_rules(input_str):
    """Decide whether a scraped CPU listing should be dropped.

    A listing is dropped when any of the following holds:

    * the name is not a string (for example the NaN that pandas produces for
      an empty CSV cell), so there is nothing to parse;
    * the name contains '含風扇' (bundled with a fan), because the price is
      then not the price of a single CPU;
    * the name has no 【...】 segment. This removes, among others, listings
      that only bundle a motherboard; occasional false positives are accepted.

    Args:
        input_str: Raw product name taken from the listing.

    Returns:
        bool: True when the listing must be skipped, False when it is kept.
    """
    # Guard first: the substring and regex checks below require a string.
    if not isinstance(input_str, str):
        return True

    if '含風扇' in input_str:
        return True

    # Keep only names that carry a 【...】 segment.
    return re.search(r'【.*?】', input_str) is None

In [14]:
# Build the rows to insert: one (id, name, price, model, brand) tuple for every
# CPU listing that survives the filter rules.
multiple_records = []
names = []

for index, row in df_cpu.iterrows():
    name = row['name']
    # The filter defined in this notebook is `cpu_remove_rules`.
    if cpu_remove_rules(name):
        continue

    # Everything before the first '【' is the product name.
    name = name.split('【')[0].strip()
    # Drop the '盒' marker only when it really is the last character;
    # a '盒' elsewhere in the name must not cost the name its final character.
    if name.endswith('盒'):
        name = name[:-1].rstrip()

    names.append(name)
    price = row['price']
    model = row['model']
    brand = row['brand']
    # The model string doubles as the record id.
    record_id = model
    multiple_records.append((record_id, name, price, model, brand))

new_df = pd.DataFrame(multiple_records, columns=['id', 'name', 'price', 'model', 'brand'])

# Preview the cleaned frame when debugging:
# new_df.head(20)

In [15]:
# Replace the contents of the `cpu` table with the freshly cleaned records.
conn = sqlite3.connect(db_name)
table_name = 'cpu'

# `with conn` runs DELETE + INSERT as one transaction: it commits when the
# block succeeds and rolls back on any exception, so a failed insert can no
# longer leave the table empty.
with conn:
    conn.execute(f'DELETE FROM {table_name};')
    conn.executemany(
        f'INSERT INTO {table_name} (id,name,price,model,brand) VALUES (?,?,?,?,?)',
        multiple_records,
    )

# The connection is intentionally left open: the verification cell that
# follows reads through `conn` and closes it.

In [19]:
# Verification query: read every row of the `cpu` table back through the
# connection that is still open, then close that connection.
sql1 = (
    "\n"
    "select *\n"
    "from cpu\n"
)
test = pd.read_sql(sql=sql1, con=conn)
conn.close()
test.head()

Unnamed: 0,ID,NAME,PRICE,MODEL,BRAND
0,Intel Processor,Intel Processor 300,2990,Intel Processor,Intel
1,Intel i3-14100F,Intel i3-14100F,3990,Intel i3-14100F,Intel
2,Intel i3-14100,Intel i3-14100,4800,Intel i3-14100,Intel
3,Intel i5-14400F,Intel i5-14400F,6800,Intel i5-14400F,Intel
4,Intel i5-14400,Intel i5-14400,7650,Intel i5-14400,Intel


### Insert GPU 的資料

In [44]:
def gpu_remain_rules(input_str, brands=('華碩', '技嘉', '微星')):
    """Decide whether a scraped GPU listing should be kept.

    Only listings whose name mentions one of the accepted brands are kept.
    By default these are the three brands 華碩, 技嘉 and 微星.

    Args:
        input_str: Raw product name taken from the listing.
        brands: Brand keywords to look for anywhere in the name. Defaults to
            the three brands listed above.

    Returns:
        bool: True when the listing is kept, False otherwise. A name that is
        not a string (for example the NaN pandas produces for an empty CSV
        cell) is never kept.
    """
    # Guard first: the substring check below requires a string.
    if not isinstance(input_str, str):
        return False

    return any(brand in input_str for brand in brands)
    
# Load the scraped GPU listings into their own frame so the CPU frame
# (`df_cpu`) is not overwritten.
df_gpu = pd.read_csv(output_path + 'gpu.csv')

# Build the rows to insert: one (id, name, price, model, brand) tuple for every
# GPU listing that survives the filter rule.
multiple_records = []
names = []

# Brand prefixes removed from the front of the product name.
gpu_brand_prefixes = ('華碩', '技嘉', '微星')

for index, row in df_gpu.iterrows():
    name = row['name']
    # The filter defined in this notebook is `gpu_remain_rules`.
    if not gpu_remain_rules(name):
        continue

    # Everything before the first '(' is the product name.
    name = name.split('(')[0]
    # Remove the leading brand only when the name actually starts with it;
    # slicing a fixed three characters would mangle any other name.
    for prefix in gpu_brand_prefixes:
        if name.startswith(prefix):
            name = name[len(prefix):]
            break
    name = name.strip()

    names.append(name)
    price = row['price']
    model = row['model']
    brand = row['brand']
    # The model string doubles as the record id.
    record_id = model
    multiple_records.append((record_id, name, price, model, brand))

new_df = pd.DataFrame(multiple_records, columns=['id', 'name', 'price', 'model', 'brand'])


In [54]:
# Refresh the `gpu` table: the existing rows are cleared first so that
# re-running the scraper cannot leave duplicates, then the cleaned records are
# inserted and read back for verification.
table_name = 'gpu'

conn = sqlite3.connect(db_name)
try:
    # One transaction for DELETE + INSERT: committed on success, rolled back
    # on any exception, so a failed insert cannot leave the table empty.
    with conn:
        conn.execute(f'DELETE FROM {table_name};')
        conn.executemany(
            f'INSERT INTO {table_name} (id,name,price,model,brand) VALUES (?,?,?,?,?)',
            multiple_records,
        )

    # Read the table back on the same connection.
    sql1 = '''
select *
from gpu
'''
    test = pd.read_sql(sql1, conn)
finally:
    # A single connection is used for the whole cell and is always closed.
    conn.close()

test.head()

Unnamed: 0,ID,NAME,PRICE,MODEL,BRAND
0,Unknown,N210-MD1G/D3,1250,Unknown,微星
1,GT710,GT710-SL-2GD3-BRK-EVO,1690,GT710,華碩
2,GT710,GT710-SL-2GD5-BRK-EVO,1790,GT710,華碩
3,Unknown,N710D3-2GL,1790,Unknown,技嘉
4,GT710,GT710 1GD3H LP,1450,GT710,微星


In [21]:
# Query test: read every row of the `cpu` table on a fresh connection.
sql1 = '''
select *
from cpu
'''
conn = sqlite3.connect(db_name)
try:
    test = pd.read_sql(sql1, conn)
finally:
    # Close the connection even when the query fails.
    conn.close()
test.head()

Unnamed: 0,ID,NAME,PRICE,MODEL,BRAND
0,Intel Processor,Intel Processor 300,2990,Intel Processor,Intel
1,Intel i3-14100F,Intel i3-14100F,3990,Intel i3-14100F,Intel
2,Intel i3-14100,Intel i3-14100,4800,Intel i3-14100,Intel
3,Intel i5-14400F,Intel i5-14400F,6800,Intel i5-14400F,Intel
4,Intel i5-14400,Intel i5-14400,7650,Intel i5-14400,Intel
