In [1]:
import numpy as np
import pandas as pd
# Para que funcione necesitas bajarte los archivos de datos de Kaggle 
df = pd.read_csv("./df_Cpu.csv")
df

Unnamed: 0,Cpu,Price_in_euros
0,Intel Core i3 6006U 2GHz,539.00
1,Intel Core i7 6700HQ 2.6GHz,879.01
2,Intel Core i7 7500U 2.7GHz,900.00
3,Intel Core i5 1.8GHz,898.94
4,Intel Core i3 6006U 2.0GHz,428.00
...,...,...
907,Intel Core i5 8250U 1.6GHz,800.00
908,Intel Core i5 6300U 2.4GHz,1629.00
909,Intel Core i5 7200U 2.5GHz,519.00
910,AMD E-Series E2-9000e 1.5GHz,258.00


In [2]:
import pandas as pd
import numpy as np

# Cargar el dataset
df = pd.read_csv("df_Cpu.csv")

# Función para parsear la columna "Cpu"
def parse_cpu(cpu_name):
    # Verificar si cpu_name es una cadena
    if not isinstance(cpu_name, str):
        return {
            "brand": np.nan,
            "family": np.nan,
            "model": np.nan,
            "generation": np.nan,
            "sku": np.nan,
            "suffix": np.nan,
            "base_frequency_ghz": np.nan
        }
    
    # Diccionario para almacenar los componentes
    components = {
        "brand": np.nan,
        "family": np.nan,
        "model": np.nan,
        "generation": np.nan,
        "sku": np.nan,
        "suffix": np.nan,
        "base_frequency_ghz": np.nan
    }
    
    # Extraer marca
    if "Intel" in cpu_name:
        components["brand"] = "Intel"
        parts = cpu_name.replace("Intel ", "").split()
    elif "AMD" in cpu_name:
        components["brand"] = "AMD"
        parts = cpu_name.replace("AMD ", "").split()
    else:
        return components
    
    # Extraer frecuencia base (último elemento que contiene GHz)
    for part in reversed(parts):
        if "GHz" in part:
            try:
                components["base_frequency_ghz"] = float(part.replace("GHz", ""))
                parts.remove(part)
            except ValueError:
                pass
            break
    
    # Casos especiales (ej: "Intel Core i5 1.6GHz" sin modelo)
    if len(parts) < 2:
        return components
    
    # Familia (ej: "Core i5", "Ryzen 1700")
    components["family"] = " ".join(parts[:2]) if components["brand"] == "Intel" else " ".join(parts[:1])
    
    # Modelo para Intel
    if components["brand"] == "Intel":
        model = parts[2] if len(parts) > 2 else np.nan
        if isinstance(model, str) and model[0].isdigit():
            components["model"] = model
            # Extraer generación y SKU (ej: "8250U" -> gen=8, sku=250)
            if len(model) >= 4:
                components["generation"] = int(model[0]) if model[0].isdigit() else np.nan
                components["sku"] = model[1:4] if model[1:4].isdigit() else np.nan
                components["suffix"] = model[4:] if len(model) > 4 else np.nan
    
    return components

# Aplicar la función a cada fila
parsed_data = df["Cpu"].apply(lambda x: pd.Series(parse_cpu(x)))

# Unir con el DataFrame original
df = pd.concat([df, parsed_data], axis=1)

# Mostrar el resultado
print(df.head())

                           Cpu  Price_in_euros  brand   family   model  \
0     Intel Core i3 6006U 2GHz          539.00  Intel  Core i3   6006U   
1  Intel Core i7 6700HQ 2.6GHz          879.01  Intel  Core i7  6700HQ   
2   Intel Core i7 7500U 2.7GHz          900.00  Intel  Core i7   7500U   
3         Intel Core i5 1.8GHz          898.94  Intel  Core i5     NaN   
4   Intel Core i3 6006U 2.0GHz          428.00  Intel  Core i3   6006U   

   generation  sku suffix  base_frequency_ghz  
0         6.0  006      U                 2.0  
1         6.0  700     HQ                 2.6  
2         7.0  500      U                 2.7  
3         NaN  NaN    NaN                 1.8  
4         6.0  006      U                 2.0  


In [3]:
df.head()

Unnamed: 0,Cpu,Price_in_euros,brand,family,model,generation,sku,suffix,base_frequency_ghz
0,Intel Core i3 6006U 2GHz,539.0,Intel,Core i3,6006U,6.0,6.0,U,2.0
1,Intel Core i7 6700HQ 2.6GHz,879.01,Intel,Core i7,6700HQ,6.0,700.0,HQ,2.6
2,Intel Core i7 7500U 2.7GHz,900.0,Intel,Core i7,7500U,7.0,500.0,U,2.7
3,Intel Core i5 1.8GHz,898.94,Intel,Core i5,,,,,1.8
4,Intel Core i3 6006U 2.0GHz,428.0,Intel,Core i3,6006U,6.0,6.0,U,2.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cpu                 912 non-null    object 
 1   Price_in_euros      912 non-null    float64
 2   brand               912 non-null    object 
 3   family              912 non-null    object 
 4   model               743 non-null    object 
 5   generation          743 non-null    float64
 6   sku                 729 non-null    object 
 7   suffix              729 non-null    object 
 8   base_frequency_ghz  912 non-null    float64
dtypes: float64(3), object(6)
memory usage: 64.3+ KB


In [16]:
# Aplicar la función a cada fila
parsed_data = df["Cpu"].apply(lambda x: pd.Series(parse_cpu(x)))

# Unir con el DataFrame original
df = pd.concat([df, parsed_data], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 44 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Cpu                 912 non-null    object 
 1   Price_in_euros      912 non-null    float64
 2   brand               912 non-null    object 
 3   family              912 non-null    object 
 4   model               743 non-null    object 
 5   generation          743 non-null    float64
 6   sku                 729 non-null    object 
 7   suffix              729 non-null    object 
 8   base_frequency_ghz  912 non-null    float64
 9   brand               912 non-null    object 
 10  family              912 non-null    object 
 11  model               743 non-null    object 
 12  generation          743 non-null    float64
 13  sku                 729 non-null    object 
 14  suffix              729 non-null    object 
 15  base_frequency_ghz  912 non-null    float64
 16  brand   