In [1]:
import pandas as pd
import numpy as np
import re
import io

In [None]:
CPU_SCORES = {
    #APPLE SILICON
    'M4 MAX': 38000, 'M4 PRO': 32000, 'M4': 26000,
    'M3 MAX': 36000, 'M3 PRO': 30000, 'M3': 22000,
    'M2 MAX': 30000, 'M2 PRO': 26000, 'M2': 19000,
    'M1 MAX': 28000, 'M1 PRO': 24000, 'M1': 14500,

    '14900HX': 46000, '13980HX': 45000, '14700HX': 38000, '14650HX': 35000,
    '13900H': 44000,  '13700HX': 34000, '7945HX': 55000, '9955HX': 50000, 
    'ULTRA 9': 32000, '285H': 32000,    '185H': 30000,
    'RYZEN AI 9': 34000, 'HX 370': 35000, '365': 33000,

    '13650HX': 32000, '13620H': 26000, '12650H': 23000, '14450HX': 28000,
    'ULTRA 7': 25000, '155H': 25000,   '258V': 26000,   '255H': 25000, '265U': 24000,
    'RYZEN 9': 28000, '8945HS': 29000, 
    'RYZEN 7': 22000, '8845HS': 26000, '7840HS': 26000, '7735HS': 21000, '6800H': 20000,
    'RYZEN AI 7': 29000, '350': 28000, 
    'SNAPDRAGON X ELITE': 24000, 'X1E': 24000,

    'ULTRA 5': 20000, '125H': 20000, '226V': 21000, '225H': 20000,
    '13500H': 22000, '12600H': 22000, '13450HX': 24000, '13420H': 18500, '12450H': 17500,
    'RYZEN 5': 16000, '8645HS': 22000, '7535HS': 18500, '6600H': 17000,
    '7430U': 16000, '5825U': 16000, '5625U': 14000, '5500U': 13000,
    '150U': 16000, '120U': 14000, '210H': 15000, '240H': 15000,
    'SNAPDRAGON X PLUS': 18000, '240': 14000, '220': 12000,

    '1355U': 15000, '1255U': 14000, '1335U': 14500, '1235U': 13500, 
    '1334U': 13000, '1235': 13500, '1215': 9500, #variasi tanpa U
    '7520U': 9500, '7320U': 9000, '1315U': 10000, '1215U': 9500,
    'N355': 6000, 'N305': 9000, 'N300': 7000, 
    'CORE I7': 15000, 'CORE I5': 12000, #fallback jika model g ketemu
    'RYZEN 7': 14000, 'RYZEN 5': 11000,

    'N100': 5500, 'N200': 6000, 'N4500': 3000, 'N4020': 2000, 'CELERON': 2500
}

GPU_SCORES = {
    'RTX 5090': 45000, 'RTX 5080': 38000, 'RTX 5070': 32000, 'RTX 5060': 26000,
    'RTX 4090': 36000, 'RTX 4080': 30000, 'RTX 4070': 26000,
    'RTX 4060': 22000, 'RTX 4050': 18000, 'RTX 3060': 19000, 'RTX 3050': 12000,
    'RTX 2050': 9000, 'GTX 1650': 8000, 'ARC': 7000, 
    'RADEON 780M': 7000, 'RADEON 680M': 6000,
    'IRIS': 3000, 'UHD': 2000, 'INTEGRATED': 2500}

In [None]:
def load_and_merge_data():
    files = ['laptop_tokopedia_offline.csv', 'laptop_tokopedia_offline2.csv']
    dfs = []
    for f in files:
        try:
            df = pd.read_csv(f, encoding='utf-8')
            dfs.append(df)
        except:
            print(f"[FAIL] Gagal membaca {f}")
    if dfs:
        return pd.concat(dfs, ignore_index=True).drop_duplicates()
    return None

In [None]:
def advanced_extraction(row):
    text = str(row['Nama_Produk']).upper().replace("  ", " ")
    
    blacklist_keywords = [
        "MONITOR", "TV ", "PROJECTOR", "PRINTER", "DESKJET", "INK ", 
        "SCANNER", "SPEAKER", "HEADSET", "MOUSE", "KEYBOARD", "WEBCAM", 
        "GLASSES", "WATCH", "ROUTER", "WIFI", "CABLE", 
        "ALL IN ONE", "AIO", "PC DESKTOP", "TOWER", "MINI PC", 
        "IPAD", "TABLET", "CONSOLE", "PLAYSTATION", "PS5", "ROG ALLY", "LEGION GO",
        "MOTOPAD", "COOLER", "FAN "
    ]
    if text.startswith("UPGRADE") or text.startswith("RAM ") or text.startswith("SSD "):
        return pd.Series([None]*6)
    if any(k in text for k in blacklist_keywords): return pd.Series([None]*6)

    #EKSTRAKSI CPU (ULTIMATE REGEX)
    cpu_brand, cpu_series, cpu_model = "UNKNOWN", "UNKNOWN", "UNKNOWN"
    
    #APPLE SILICON
    match_apple = re.search(r'\b(M[1-4])\s*(PRO|MAX|ULTRA)?', text)
    
    #INTEL ULTRA / CORE (Menangkap "Ultra 7 155H", "Core 5 120U", "Ultra 135U")
    #Cari kata (Core/Ultra) > Optional Angka (3/5/7/9) > Wajib Angka Model (155H, 135U)
    match_intel_modern = re.search(r'(?:CORE)?\s*(ULTRA|I)?\s*(\d)?\s*[-]?\s*(\d{3,4}[A-Z]{1,2})', text)
    
    #INTEL CORE LAMA (i7-13650HX)
    match_intel_old = re.search(r'\bI([3579])\s*[-]?\s*(\d{4,5}[HKIUQMGP]*)', text)

    #AMD RYZEN (Menangkap "Ryzen AI 9 HX 370", "Ryzen 7 7735HS")
    #Cari Ryzen > Optional AI > Optional Angka Seri (9) > Optional Kode Huruf (HX) > Angka Model (370)
    match_amd = re.search(r'RYZEN\s*(AI)?\s*(\d)?\s*(HX|HS|H|U)?\s*(\d{3,4}[HSU]*)?', text)
    
    if match_apple:
        cpu_brand = "APPLE"
        cpu_model = match_apple.group(0)
        cpu_series = "APPLE SILICON"

    elif match_intel_modern and ("ULTRA" in text or "CORE" in text or "INTEL" in text):
        cpu_brand = "INTEL"
        #prioritas model: group 3 (misal 155H, 135U)
        cpu_model = match_intel_modern.group(3)
        if match_intel_modern.group(1) == "ULTRA":
            cpu_series = f"INTEL ULTRA {match_intel_modern.group(2) if match_intel_modern.group(2) else ''}"
        else:
            cpu_series = f"INTEL CORE {match_intel_modern.group(2) if match_intel_modern.group(2) else ''}"

    elif match_intel_old:
        cpu_brand = "INTEL"
        cpu_model = match_intel_old.group(2) 
        cpu_series = f"CORE I{match_intel_old.group(1)}"
        
    elif "RYZEN" in text:
        cpu_brand = "AMD"
        #cari angka 3-4 digit di deket kata Ryzen cth: Ryzen AI 9 HX 370 > Ambil "370" dan "HX 370"
        
        #cek seri (3/5/7/9)
        seri_match = re.search(r'RYZEN\s*(AI)?\s*(\d)', text)
        seri = seri_match.group(2) if seri_match else ""
        is_ai = "AI" in text
        
        #cek model (angka 3-4 digit diikuti huruf opsional)
        #cari angka yang BUKAN angka seri (3,5,7,9) dan BUKAN angka RAM/SSD
        model_candidates = re.findall(r'\b(\d{3,4}[HSU]?[X]?)\b', text)
        
        #filter kandidat (buang angka umum seperti 512, 3050, 4060)
        valid_models = []
        blacklist_nums = ['512', '256', '128', '3050', '4050', '4060', '4070', '3060', '165', '144']
        for m in model_candidates:
            if not any(b in m for b in blacklist_nums):
                valid_models.append(m)
        
        cpu_model = valid_models[0] if valid_models else "UNKNOWN"
        
        if is_ai: cpu_series = f"RYZEN AI {seri}"
        else: cpu_series = f"RYZEN {seri}"

    elif "CELERON" in text or "N40" in text or "N50" in text or "N100" in text:
        cpu_brand = "INTEL"
        cpu_series = "CELERON/ENTRY"
        match_n = re.search(r'(N\d{3,4})', text)
        cpu_model = match_n.group(1) if match_n else "CELERON"

    #LAYAR (SMART DETECTOR)
    screen = 0.0
    standard_sizes = ['13.3', '13.4', '13.5', '13.6', '14.0', '14.5', '15.6', '16.0', '16.1', '17.3', '11.6']
    for size in standard_sizes:
        if size in text:
            screen = float(size)
            break
    if screen == 0.0:
        match_gen = re.search(r'(\d{2}(?:\.\d)?)\s*("|INCH|FHD|WUXGA|OLED)', text)
        if match_gen:
            temp = float(match_gen.group(1))
            if 10.0 <= temp <= 18.5: screen = temp

    #RAM & GPU
    ram = 0
    ram_match = re.findall(r'(\d+)\s*(?:GB)', text)
    if ram_match: 
        valid = [int(x) for x in ram_match if int(x) in [4,8,12,16,20,24,32,48,64]]
        if valid: ram = max(valid)

    gpu = "INTEGRATED"
    if "RTX" in text:
        m = re.search(r'RTX\s*(\d{4})', text)
        gpu = f"RTX {m.group(1)}" if m else "RTX UNKNOWN"
    elif "GTX" in text: gpu = "GTX SERIES"

    return pd.Series([cpu_brand, cpu_series, cpu_model, ram, gpu, screen])

In [None]:
def clean_and_score(df):
    blacklist = ['IPHONE', 'ANDROID', 'REDMI', 'INFINIX NOTE', 'GALAXY', 'PS5', 'NINTENDO']
    mask = df['Nama_Produk'].str.upper().str.contains('|'.join(blacklist))
    df_clean = df[~mask].copy()

    def get_score(val, dictionary, default_val):
        val = str(val).upper()
        best_score = default_val
        best_key_len = 0
        for key, score in dictionary.items():
            if key in val:
                if len(key) > best_key_len:
                    best_score = score
                    best_key_len = len(key)
        return best_score

    #gabungkan text CPU untuk scoring
    df_clean['Full_CPU_Text'] = df_clean['Seri_CPU'].astype(str) + " " + df_clean['Model_CPU'].astype(str)
    
    df_clean['Skor_CPU'] = df_clean['Full_CPU_Text'].apply(lambda x: get_score(x, CPU_SCORES, 5000))
    df_clean['Skor_GPU'] = df_clean['GPU'].apply(lambda x: get_score(x, GPU_SCORES, 2500))

    def label_kelas(row):
        if row['Skor_GPU'] >= 18000: return "GAMING/RENDERING BERAT"
        elif row['Skor_CPU'] >= 25000: return "PROFESSIONAL/CODING BERAT"
        elif row['Skor_CPU'] >= 15000: return "MENENGAH/OFFICE MULTITASKING"
        elif row['Skor_CPU'] < 6000: return "BASIC/ENTRY"
        else: return "PELAJAR/ADMIN"
    
    df_clean['Kelas_Laptop'] = df_clean.apply(label_kelas, axis=1)
    return df_clean

In [None]:
print("1. Load Data...")
df_main = load_and_merge_data()

if df_main is not None:
    print("2. Ekstraksi Fitur (Regex Agresif)...")
    cols = ['Merk_CPU', 'Seri_CPU', 'Model_CPU', 'RAM', 'GPU', 'Layar']
    df_main[cols] = df_main.apply(advanced_extraction, axis=1)
    
    #hapus yang bukan laptop (Merk_CPU kosong)
    df_laptop = df_main.dropna(subset=['Merk_CPU'])
    print(f"   -> Berhasil mendeteksi {len(df_laptop)} Komputer/Laptop Valid.")

    print("3. Membersihkan & Memberi Skor...")
    df_final = clean_and_score(df_laptop)
    
    print("4. Simpan Data...")
    df_final.to_csv('laptop_dataset_cleaned.csv', index=False)
    
    print("\n" + "="*50)
    print(f"SUKSES! {len(df_final)} Data tersimpan di: laptop_dataset_cleaned.csv")
    print("="*50)
    print(df_final[['Nama_Produk', 'Model_CPU', 'Skor_CPU', 'Kelas_Laptop']].head(15))

1. Memuat Data...
2. Ekstraksi Fitur (Regex Agresif)...
   -> Berhasil mendeteksi 377 Komputer/Laptop Valid.
3. Membersihkan & Memberi Skor...
4. Menyimpan Data...

SUKSES! 374 Data tersimpan di: DATABASE_FINAL_SISTEM_PAKAR.csv
                                          Nama_Produk Model_CPU  Skor_CPU  \
0   Laptop Acer Aspire Go 14 Intel Core 5 120U RAm...      120U     14000   
1   Laptop Advan 360 Stylus Pro AMD Ryzen 5 7430U ...       360     11000   
2   Laptop Advan Workplus Air OLED AMD Ryzen 5 753...   UNKNOWN      5000   
3   Laptop Acer Aspire 7 Pro A715 Intel COre i5 13...     3420H      5000   
18  Laptop Axioo Pongo 765 Intel Core I7 13620H Ge...     3620H      5000   
20  Laptop Axioo Hype-R X8 OLED AMD Ryzen 7 6850 R...      6850     14000   
23  Laptop MSI Katana 15HX Intel Core i7 14650HX G...    4650HX      5000   
26  ASUS ROG XBOX ALLY AMD RYZEN Z2 A 16GB 512GB W...   UNKNOWN      5000   
27  Laptop Acer Aspire Lite 14 AL14 Intel Celeron ...      N150      2500   
29