In [1]:
import pandas as pd
import re
import os
import io

# =============================================================================
# TAHAP 1: CUCI BERSIH CSV (HAPUS KUTIP & SKIP ERROR)
# =============================================================================
input_filename = 'final_scrap.csv'
output_filename = 'dataset_final_super_lengkap.csv'

print("1. Memperbaiki format CSV...")

if not os.path.exists(input_filename):
    print(f"❌ File '{input_filename}' tidak ditemukan!")
    # `exit()` is a helper the `site` module installs for interactive shells;
    # raising SystemExit works everywhere and reports a non-zero status.
    raise SystemExit(1)

# Read the file as plain text so every row can be repaired before parsing.
with open(input_filename, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# An empty file has no header row; stop here instead of failing on lines[0].
if not lines:
    print(f"❌ File '{input_filename}' kosong!")
    raise SystemExit(1)

clean_data = []
# Header: drop every double quote.
header = lines[0].strip().replace('"', '')
clean_data.append(header)

# Data rows.
for line in lines[1:]:
    line = line.strip()
    if not line:
        continue

    # A row wrapped in one outer pair of quotes would be parsed as a single
    # field, so strip that outer pair.
    if line.startswith('"') and line.endswith('"'):
        line = line[1:-1]

    # Collapse doubled quotes back into single quotes.
    line = line.replace('""', '"')

    clean_data.append(line)

# Re-join the repaired rows into one CSV document.
csv_string = "\n".join(clean_data)

# on_bad_lines='skip' drops rows that have too many fields; engine='python'
# is the parser used here for the messy input.
try:
    df = pd.read_csv(io.StringIO(csv_string), on_bad_lines='skip', engine='python')
    print(f"    Berhasil memuat {len(df)} baris data bersih (Baris error sudah dibuang otomatis).")
except Exception as e:
    print(f"   ❌ Masih error fatal: {e}")
    raise SystemExit(1)

# =============================================================================
# TAHAP 2: LOAD REFERENSI CPU
# =============================================================================
# Maps a lower-cased, stripped CPU name to its 'CPU Mark' value. Stays empty
# when the reference file cannot be used, so scoring relies on heuristics only.
cpu_dictionary = {}
try:
    df_ref = pd.read_csv('skor_cpu.csv')
    # Rows without a name or a score cannot be matched; a NaN name would also
    # become a non-string dictionary key.
    df_ref = df_ref.dropna(subset=['CPU Name', 'CPU Mark'])
    cpu_dictionary = dict(zip(df_ref['CPU Name'].str.lower().str.strip(), df_ref['CPU Mark']))
    print(f"    Referensi CPU dimuat: {len(cpu_dictionary)} item.")
except FileNotFoundError:
    print("   ⚠️ File skor_cpu.csv tidak ada. Menggunakan mode tebakan.")
except (OSError, KeyError, ValueError, AttributeError) as e:
    # Unreadable file, missing columns, parse errors (pandas parser errors
    # derive from ValueError) or a non-text 'CPU Name' column.
    print(f"   ⚠️ File skor_cpu.csv tidak bisa dipakai ({e}). Menggunakan mode tebakan.")

# =============================================================================
# TAHAP 3: FUNGSI PENILAIAN (SCORING)
# =============================================================================
def clean_ram(text):
    """Extract the RAM size as an integer from a free-form spec string.

    A number directly followed by "GB" is preferred, so a leading memory
    generation such as the "4" in "DDR4 8GB" is not mistaken for the size.
    If no such number exists, the first run of digits is used.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        The size as ``int``, or 0 when the text contains no digits.
    """
    raw = str(text)
    # The optional fractional part keeps "1.5GB" resolving to its integer part.
    sized = re.search(r'(\d+)(?:\.\d+)?\s*GB', raw, re.IGNORECASE)
    if sized:
        return int(sized.group(1))
    first_number = re.search(r'\d+', raw)
    return int(first_number.group()) if first_number else 0

def clean_price(price_raw, *, usd_to_idr=15500, idr_threshold=500000):
    """Convert a scraped price into an integer Rupiah amount.

    "$", "," and "USD" are removed before parsing. A value above
    ``idr_threshold`` is returned as-is (truncated to int). Any other value
    is divided by 100 and multiplied by ``usd_to_idr``.
    NOTE(review): the division by 100 suggests the source stores US cents —
    confirm against the scraper.

    Args:
        price_raw: Any value; it is converted with ``str()`` first.
        usd_to_idr: Exchange rate applied to converted values.
        idr_threshold: Values above this are treated as already in Rupiah.

    Returns:
        The price as ``int``, or 0 when the value cannot be parsed.
    """
    clean_str = str(price_raw).replace('$', '').replace(',', '').replace('USD', '').strip()
    try:
        val = float(clean_str)
        if val > idr_threshold:
            return int(val)
        return int((val / 100) * usd_to_idr)
    except (ValueError, OverflowError):
        # ValueError: unparseable text, or int() of NaN.
        # OverflowError: int() of infinity.
        return 0

def clean_storage_type(text):
    """Label a storage description as 'SSD' or 'HDD'.

    The value is converted to text and upper-cased; 'SSD' is returned when
    that text contains "SSD", and 'HDD' for everything else.
    """
    description = str(text).upper()
    if description.find('SSD') != -1:
        return 'SSD'
    return 'HDD'

def classify_gpu_type(gpu_text):
    """Classify a GPU description as 'Dedicated' or 'Integrated'.

    The value is converted to text and upper-cased. It is 'Dedicated' when
    it contains any of the marker substrings below, otherwise 'Integrated'.
    """
    dedicated_markers = (
        'RTX', 'GTX', 'RX', 'ARC', 'MX', 'ADRENO', 'QUADRO', 'FIREPRO',
        'T500', 'T600', 'T1000', 'A1000', 'A2000',
    )
    upper_name = str(gpu_text).upper()
    for marker in dedicated_markers:
        if marker in upper_name:
            return 'Dedicated'
    return 'Integrated'

# --- CPU SCORING (UPDATED UNTUK MENANGKAP SEMUA LIST ERROR KAMU) ---
# --- B. FUNGSI SKOR CPU (VERSI FINAL - SAPU BERSIH) ---
# Lookarounds that stop an Apple "m<n>" token from matching inside a longer
# model string such as "i5-m520".
_APPLE_START = r'(?<![a-z0-9])'
_APPLE_END = r'(?![a-z0-9])'

# Ordered fallback rules: the first pattern found in the lower-cased CPU text
# decides the score, so specific patterns must stay above generic ones.
_CPU_FALLBACK_RULES = tuple(
    (re.compile(pattern), score)
    for pattern, score in (
        # Special cases
        (r'ryzen z1|z2', 20000),  # handheld gaming chips
        (r'processor n', 5000),   # Intel N100/N200
        (r'pentium silver|athlon silver', 3500),
        (r'pentium gold|athlon gold', 4000),
        (r'3020e|3015', 2600),
        (r'core m', 2800),
        # Apple Silicon
        (_APPLE_START + r'm5' + _APPLE_END, 40000),
        (_APPLE_START + r'm4 max', 38000),
        (_APPLE_START + r'm4 pro', 28000),
        (_APPLE_START + r'm4' + _APPLE_END, 22000),
        (_APPLE_START + r'm3 max', 32000),
        (_APPLE_START + r'm3 pro', 24000),
        (_APPLE_START + r'm3' + _APPLE_END, 19000),
        (_APPLE_START + r'm2' + _APPLE_END, 15000),
        (_APPLE_START + r'm1' + _APPLE_END, 14000),
        # Modern high end
        (r'snapdragon', 18000),
        (r'ryzen ai', 25000),
        (r'threadripper', 40000),
        (r'xeon', 20000),
        # Core Ultra / Core <n>
        (r'ultra 9|core 9', 28000),
        (r'ultra 7|core 7', 24000),
        (r'ultra 5|core 5', 18000),
        (r'ultra 3|core 3', 10000),
        # Core i-series
        (r'i9', 22000),
        (r'i7', 15000),
        (r'i5', 10000),
        (r'i3', 6000),
        # Ryzen
        (r'ryzen 9', 25000),
        (r'ryzen 7', 16000),
        (r'ryzen 5', 11000),
        (r'ryzen 3', 7000),
        # Low end / legacy
        (r'pentium', 3000),
        (r'celeron', 2000),
        (r'athlon', 3000),
        (r'mediatek', 2500),
        (r'atom', 1500),
        (r'fx-|a12-|a10-', 2500),
        (r'a9-|a6-|a4-', 1800),
    )
)


def get_cpu_score(nama_laptop_cpu):
    """Return a performance score for a CPU description.

    Lookup order:
      1. ``cpu_dictionary``: the longest key contained in the lower-cased
         text wins, provided its score is greater than 0.
      2. ``_CPU_FALLBACK_RULES``: the first matching pattern wins.
      3. A default of 3000.

    Args:
        nama_laptop_cpu: CPU text; it is converted with ``str()`` first.

    Returns:
        0 for a missing value (``pd.isna``); otherwise the dictionary value
        or an ``int`` from the fallback rules.
    """
    if pd.isna(nama_laptop_cpu):
        return 0
    target = str(nama_laptop_cpu).lower()

    # 1. Reference dictionary (highest priority): longest matching key.
    best = 0
    longest = 0
    for key, score in cpu_dictionary.items():
        # Non-string keys (e.g. NaN from an empty name cell) cannot be used
        # as the left operand of a substring test.
        if isinstance(key, str) and len(key) > longest and key in target:
            longest = len(key)
            best = score
    if best > 0:
        return best

    # 2. Heuristic fallback rules.
    for pattern, score in _CPU_FALLBACK_RULES:
        if pattern.search(target):
            return score

    # 3. Final default.
    return 3000

# --- C. FUNGSI SKOR GPU (VERSI FINAL - BRUTAL FIX) ---
def get_gpu_score_2025(gpu_raw):
    """Return a heuristic performance score for a GPU description.

    The lower-cased text is tested against an ordered chain of substring
    rules; the first rule that matches decides the score, so specific model
    rules are placed above the generic family rules that would shadow them.

    Args:
        gpu_raw: GPU text; it is converted with ``str()`` first.

    Returns:
        0 for a missing value (``pd.isna``), otherwise an ``int`` score;
        1500 when no rule matches.
    """
    if pd.isna(gpu_raw): return 0
    name = str(gpu_raw).lower()

    # --- NVIDIA workstation (RTX A-series, T-series) ---
    # The A500 must be tested before 'rtx a5', which is meant for A5000/A5500.
    if re.search(r'rtx a500(?!\d)', name): return 7000
    if 'rtx a5' in name: return 22000  # A5000/A5500
    if 'rtx a4' in name: return 18000  # A4000/A4500
    if 'rtx a3' in name: return 14000  # A3000
    if 'rtx a2' in name: return 10000  # A2000
    if 'rtx a1' in name: return 8000   # A1000
    if 't1000' in name or 't1200' in name: return 7000
    if 't500' in name or 't600' in name: return 5000

    # --- Legacy AMD ---
    # 'radeon rx ...' is excluded so RX cards reach the RX rules further down.
    if 'radeon r' in name and 'radeon rx' not in name:
        if 'r9' in name: return 4000
        if 'r7' in name: return 1500
        if 'r5' in name: return 1000
        if 'r4' in name or 'r3' in name or 'r2' in name: return 800
        return 1000  # default for the R-series

    if 'radeon hd' in name: return 600  # very old
    if 'powervr' in name: return 500    # tablet grade
    if 'radeon pro' in name or 'wx' in name: return 4000  # old workstation
    if 'radeon 610' in name or 'radeon 530' in name or 'radeon 540' in name: return 1500

    # --- Radeon gaming and modern integrated ---
    if 'rx 6850' in name: return 22000
    if 'rx 6650' in name: return 14000
    # NOTE(review): the 'm' test matches an 'm' anywhere in the name
    # (including the one in "amd"), not only a trailing "M" on the model.
    if 'radeon 8' in name and 'm' in name: return 6000  # 840M/860M/880M/890M
    if 'radeon 7' in name and 'm' in name: return 5000  # 740M/760M/780M
    if 'radeon 680m' in name: return 5000
    if 'radeon 660m' in name: return 3500
    if 'radeon 6' in name and 'm' in name: return 2000  # 610M (low end)

    # --- Intel ---
    if 'intel graphics' in name: return 3000  # newer Core Ultra iGPU
    if 'arc' in name: return 5000

    # --- Workstation (RTX xxxx / Quadro / FirePro) ---
    if 'rtx 5000' in name: return 25000
    if 'rtx 4000' in name: return 22000
    if 'rtx 3500' in name: return 20000
    if 'rtx 3000' in name: return 18000
    if 'rtx 2000' in name: return 12000
    if 'rtx 1000' in name or 'a1000' in name: return 9000
    if 'rtx 500' in name or 'a500' in name: return 7000
    if 'rtx a' in name: return 10000  # catch-all for remaining RTX A-series
    if 't550' in name: return 5000
    if 'quadro' in name: return 4000
    if 'firepro' in name: return 3000

    # --- Apple ---
    if 'apple' in name or 'm4' in name or 'm3' in name:
        if 'max' in name: return 28000
        if 'pro' in name: return 20000
        return 10000

    # --- Gaming (RTX / GTX / RX) ---
    if 'rtx 509' in name: return 35000
    if 'rtx 508' in name: return 30000
    if 'rtx 409' in name: return 30000
    if 'rtx 408' in name: return 26000
    if 'rtx 507' in name: return 24000
    if 'rtx 407' in name: return 22000
    if 'rtx 308' in name: return 20000
    if 'rtx 506' in name: return 19000
    if 'rtx 406' in name: return 18000
    if 'rtx 307' in name: return 16000
    if 'rtx 405' in name or 'rtx 505' in name: return 14000
    if 'rtx 306' in name: return 13000
    if 'rtx 305' in name: return 9500
    if 'rtx 208' in name: return 14000
    if 'rtx 207' in name: return 12000
    if 'rtx 206' in name: return 10000
    if 'rtx 205' in name: return 7500

    if 'gtx 166' in name: return 8000
    if 'gtx 165' in name: return 7000
    if 'gtx 108' in name: return 9000
    if 'gtx 107' in name: return 8000
    if 'gtx 106' in name: return 7000
    if 'gtx 105' in name: return 5000
    # Generic GTX 10-series fallback; kept below the specific tiers above.
    if 'gtx 10' in name: return 6000
    if 'gtx 960' in name or 'gtx 860' in name: return 3000

    if 'rx 7900' in name: return 25000
    if 'rx 7800' in name or 'rx 6800' in name: return 20000
    if 'rx 7700' in name or 'rx 6700' in name: return 15000
    if 'rx 7600' in name or 'rx 6600' in name: return 12000
    if 'rx 6550' in name or 'rx 6500' in name: return 7000
    if 'rx 6400' in name or 'rx 5600' in name: return 6000
    if 'rx 5500' in name: return 5000

    # --- Low end / integrated ---
    if 'mx5' in name: return 4500
    if 'mx4' in name: return 4000
    if 'mx3' in name: return 3500
    if 'mx2' in name or 'mx1' in name: return 2500
    if '940mx' in name or '940m' in name: return 2000
    # Generic MX fallback; kept below the specific MX tiers above.
    if 'mx' in name: return 3500

    if 'vega' in name: return 2500    # Vega 6/7/8/10
    if 'radeon' in name: return 1500  # any other Radeon

    if 'iris' in name: return 2500
    if 'uhd' in name: return 1000
    if 'hd graphics' in name: return 800  # old Intel HD
    if 'adreno' in name or 'mali' in name: return 1000  # ARM GPUs
    return 1500


def get_screen_quality(display_text):
    """Score a display description by summing keyword bonuses.

    The value is converted to text and lower-cased. Each keyword group below
    contributes its points once if any of its keywords is present:
    panel type (+50), Full HD (+30), 4K (+50), 1366-wide panels (-20).
    """
    lowered = str(display_text).lower()
    adjustments = (
        (('ips', 'oled'), 50),
        (('1920', 'fhd'), 30),
        (('3840', '4k'), 50),
        (('1366',), -20),
    )
    return sum(
        points
        for keywords, points in adjustments
        if any(keyword in lowered for keyword in keywords)
    )

# =============================================================================
# TAHAP 4: EKSEKUSI
# =============================================================================
print("2. Menghitung ulang semua skor...")

# Derive the cleaned and scored columns from the raw scraped columns.
df['Price_IDR'] = df['Harga_USD'].apply(clean_price)
df['RAM_Clean'] = df['RAM'].apply(clean_ram)
df['Storage_Type'] = df['Storage'].apply(clean_storage_type)
df['GPU_Class'] = df['GPU'].apply(classify_gpu_type)
df['CPU_Score'] = df['Processor'].apply(get_cpu_score)
df['GPU_Score'] = df['GPU'].apply(get_gpu_score_2025)
df['Screen_Score'] = df['Display'].apply(get_screen_quality)

# Save the result.
df.to_csv(output_filename, index=False)

print("\n=== HASIL DATA (PREVIEW) ===")
# Quick visual check that the scores are populated.
print(df[['Nama_Laptop', 'CPU_Score', 'GPU_Score', 'Price_IDR']].head())

# Rows whose score equals the scoring functions' final default value. This
# also counts rows that matched an explicit rule returning the same number.
cpu_fail = df[df['CPU_Score'] == 3000]
gpu_fail = df[df['GPU_Score'] == 1500]
# Report against the real row count instead of a hard-coded total.
total_rows = len(df)
print(f"\n⚠️ Sisa CPU Default (3000): {len(cpu_fail)} (Dari {total_rows})")
print(f"⚠️ Sisa GPU Default (1500): {len(gpu_fail)} (Dari {total_rows})")
print(f"\n File '{output_filename}' SIAP DIPAKAI!")

1. Memperbaiki format CSV...
    Berhasil memuat 44205 baris data bersih (Baris error sudah dibuang otomatis).
    Referensi CPU dimuat: 91 item.
2. Menghitung ulang semua skor...

=== HASIL DATA (PREVIEW) ===
          Nama_Laptop  CPU_Score  GPU_Score  Price_IDR
0        HP Victus 15      18500       1000   12384500
1               HP 15      10000        800    7269500
2  HP OmniBook X Flip      18000       5000   12399845
3  HP ProBook 450 G10      12500       2500   15608345
4      HP Pavilion 15      15500       3500   17654500

⚠️ Sisa CPU Default (3000): 16 (Dari 6000+)
⚠️ Sisa GPU Default (1500): 372 (Dari 16000+)

 File 'dataset_final_super_lengkap.csv' SIAP DIPAKAI!
