In [1]:
import pandas as pd
import re

In [2]:
file_path = "croma_laptops.csv" 
df = pd.read_csv(file_path)

In [3]:
def extract_specs(name):

    brand = name.split()[0] if name else None

    # Screen Size
    screen_size = re.search(r'(\d{2}\.?\d?\s*inch)', name, re.IGNORECASE)
    screen_size = screen_size.group(1) if screen_size else None

    # Color
    color = re.search(r'(Silver|Gray|Grey|Black|Blue|Gold|White|Green)', name, re.IGNORECASE)
    color = color.group(1) if color else None

    # RAM
    ram = re.search(r'(\d+GB)\s*RAM', name, re.IGNORECASE)
    if not ram:
        ram = re.search(r'(\d+GB)', name, re.IGNORECASE)
    ram = ram.group(1) if ram else None

    # Storage (must come AFTER RAM if RAM exists)
    storage = None
    if ram:
        match = re.search(rf'{ram}.*?(\d+TB|\d+GB)', name, re.IGNORECASE)
        if match:
            storage = match.group(1)
    if not storage:  # fallback if RAM not found or storage alone present
        match = re.search(r'(\d+TB|\d+GB)', name, re.IGNORECASE)
        if match:
            storage = match.group(1)

    # GPU
    gpu = re.search(r'(NVIDIA\s+[^\s,]+|Radeon\s+[^\s,]+|Apple\s+GPU|Intel\s+Iris\s+Xe)', name, re.IGNORECASE)
    gpu = gpu.group(1) if gpu else None

    # CPU
    cpu = re.search(r'(Intel\s+[^\s,]+|AMD\s+[^\s,]+|M\d\s*Pro?|M\d)', name, re.IGNORECASE)
    cpu = cpu.group(1) if cpu else None

    return pd.Series([brand, screen_size, color, ram, storage, gpu, cpu])

df[['Brand', 'Screen Size', 'Color', 'RAM', 'Storage', 'GPU', 'CPU']] = df['Name'].apply(extract_specs)

df_cleaned = df[['Brand', 'Screen Size', 'Color', 'RAM', 'Storage', 'GPU', 'CPU', 'Price']]

print(df_cleaned.head())

   Brand Screen Size   Color   RAM Storage   GPU     CPU     Price
0  Apple   13.3 inch    Gray   8GB   256GB  None      M1   54490.0
1  Apple   13.3 inch    Gold   8GB   256GB  None      M1   54490.0
2  Apple   13.3 inch  Silver   8GB   256GB  None      M1   54490.0
3  Apple   14.2 inch   Black  24GB   512GB  None  M4 Pro  187990.0
4  Apple   13.6 inch    None  16GB   256GB  None      M4   93990.0


In [4]:
df_cleaned['Screen Size'] = df_cleaned['Screen Size'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Screen Size'] = df_cleaned['Screen Size'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)


In [5]:
df_cleaned['RAM'] = df_cleaned['RAM'].astype(str).str.replace("GB", "", regex=False).str.strip().astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['RAM'] = df_cleaned['RAM'].astype(str).str.replace("GB", "", regex=False).str.strip().astype(int)


In [6]:
df_cleaned['Storage'] = (
    df_cleaned['Storage'].astype(str)
    .str.replace("GB", "", regex=False)
    .str.replace("TB", "000", regex=False) 
    .str.extract(r'(\d+)')
    .astype(float)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Storage'] = (


In [7]:
df_cleaned['GPU'].value_counts()

GPU
NVIDIA GeForce     100
Intel Iris Xe       14
NVIDIA Geforce       3
Radeon Graphics      2
Radeon RX            1
Radeon 610M          1
NVIDIA GTX           1
Name: count, dtype: int64

In [8]:
df_cleaned.isnull().sum()

Brand            0
Screen Size      0
Color           62
RAM              0
Storage          0
GPU            351
CPU             28
Price            0
dtype: int64

In [9]:
df_cleaned['Brand'] = df_cleaned['Brand'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Brand'] = df_cleaned['Brand'].astype(str)


In [10]:
df_cleaned.loc[(df_cleaned['Brand'].str.lower() == 'apple') & (df_cleaned['GPU'].isna()),'GPU'] = "Integrated"

In [11]:
df_cleaned.isnull().sum()

Brand            0
Screen Size      0
Color           62
RAM              0
Storage          0
GPU            252
CPU             28
Price            0
dtype: int64

In [12]:
df_cleaned['GPU'] = df_cleaned['GPU'].fillna("Integrated")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['GPU'] = df_cleaned['GPU'].fillna("Integrated")


In [13]:
color_mode = df_cleaned['Color'].mode()[0]
df_cleaned['Color'] = df_cleaned['Color'].fillna(color_mode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Color'] = df_cleaned['Color'].fillna(color_mode)


In [14]:
df_cleaned.isnull().sum()

Brand           0
Screen Size     0
Color           0
RAM             0
Storage         0
GPU             0
CPU            28
Price           0
dtype: int64

In [15]:
cpu_mode = df_cleaned['CPU'].mode()[0]
df_cleaned['CPU'] = df_cleaned['CPU'].fillna(cpu_mode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['CPU'] = df_cleaned['CPU'].fillna(cpu_mode)


In [16]:
df_cleaned.to_csv("cleaned_croma_laptops.csv", index=False)

In [17]:
df_cleaned

Unnamed: 0,Brand,Screen Size,Color,RAM,Storage,GPU,CPU,Price
0,Apple,13.3,Gray,8,256.0,Integrated,M1,54490.0
1,Apple,13.3,Gold,8,256.0,Integrated,M1,54490.0
2,Apple,13.3,Silver,8,256.0,Integrated,M1,54490.0
3,Apple,14.2,Black,24,512.0,Integrated,M4 Pro,187990.0
4,Apple,13.6,Silver,16,256.0,Integrated,M4,93990.0
...,...,...,...,...,...,...,...,...
468,DELL,16.3,Silver,32,1000.0,Integrated,Intel Core,327199.0
469,SAMSUNG,14.0,Gray,16,1000.0,Integrated,Intel Core,145990.0
470,SAMSUNG,14.0,Gray,16,512.0,Integrated,Intel Core,139990.0
471,SAMSUNG,14.0,Gray,32,1000.0,Integrated,Intel Core,171990.0
