In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re

from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from pandas / scikit-learn.
# Consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings('ignore')
# Let pandas render up to 500 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 500)

In [9]:
# Load the raw dataset from data.csv (path is relative to the working directory).
df = pd.read_csv('data.csv')

In [10]:
# Preview the first rows to check the column layout of the raw data.
df.head()

Unnamed: 0.1,Unnamed: 0,name,brand,2G,3G,4G,5G,Announced,Status,Weight,Length,Width,Diameter,SIM,Display Type,Display Size,ppi,body ratio,OS,battery_capacity,Price,CPU,ratio,pixel,WLAN,Colors,Sensors,Bluetooth,GPU,Loudspeaker,3.5mm jack,Chipset,Network,Internal,Card slot,RAM,Storage
0,0,alcatel 1b (2022),alcatel,True,True,True,False,2022.0,Available,172.0,146.7,71.9,10.0,Nano-SIM,TFT LCD,5.5,293.0,74.0,Android 11,3000.0,100.0,4.0,18:9,1036800.0,"Wi-Fi 802.11 b/g/n, Wi-Fi Direct","Prime Black, Atlantic Blue",Accelerometer,"5.0, A2DP",PowerVR GE8300,True,True,Mediatek MT6761 Helio A22 (12 nm),"GSM / HSPA / LTE HSPA 42.2/11.1 Mbps, LTE Cat4...",32GB 2GB RAM,microSDXC (dedicated slot),2.0,32.0
1,1,alcatel 1l pro (2021),alcatel,True,True,True,False,2021.0,Available,190.0,156.4,74.8,9.7,Nano-SIM,IPS LCD,6.1,282.0,78.1,Android 11,3000.0,110.0,8.0,5:9,1123200.0,"Wi-Fi 802.11 b/g/n, Wi-Fi Direct","Power Grey, Twilight Blue","Fingerprint (rear-mounted), accelerometer, pro...","4.2, A2DP",IMG8322,True,True,Unisoc SC9863A (28nm),"GSM / HSPA / LTE HSPA 21.1/5.76 Mbps, LTE Cat4...",32GB 2GB RAM,microSDXC (uses shared SIM slot),2.0,32.0
2,2,alcatel 1 (2021),alcatel,True,True,True,False,2021.0,Available,134.0,137.6,65.7,9.8,Nano-SIM,TFT LCD,5.0,215.0,71.4,Android 11,2000.0,60.0,4.0,18:9,460800.0,"Wi-Fi 802.11 b/g/n, Wi-Fi Direct","Volcano Black, AI Aqua","Accelerometer, proximity","4.2, A2DP, LE",PowerVR GE8100,True,True,Mediatek MT6739 (28 nm),"GSM / HSPA / LTE HSPA 42.2/11.5 Mbps, LTE Cat4...","8GB 1GB RAM, 16GB 1GB RAM",microSDHC (dedicated slot),1.0,8.0
3,3,alcatel 3l (2021),alcatel,True,True,True,False,2021.0,Available,194.0,165.6,75.6,8.7,Nano-SIM,IPS LCD,6.52,269.0,82.0,Android 11,4000.0,330.0,8.0,20:9,1152000.0,"Wi-Fi 802.11 b/g/n, Wi-Fi Direct","Jewelry Blue, Jewelry Black","Fingerprint (rear-mounted), accelerometer, pro...","5.0, A2DP, LE",PowerVR GE8320,True,True,Mediatek MT6762D Helio P22 (12 nm),"GSM / HSPA / LTE HSPA 42.2/5.76 Mbps, LTE Cat4...",64GB 4GB RAM,microSDXC (only single sim model),4.0,64.0
4,4,alcatel 1s (2021),alcatel,True,True,True,False,2021.0,Available,190.0,165.6,75.6,8.8,Nano-SIM,IPS LCD,6.52,269.0,82.0,Android 11,4000.0,130.0,8.0,20:9,1152000.0,"Wi-Fi 802.11 b/g/n, Wi-Fi Direct","Elegant Black, Twilight Blue","Fingerprint (rear-mounted), accelerometer, pro...","5.0, A2DP, LE",PowerVR GE8320,True,True,Mediatek MT6762D Helio P22 (12 nm),"GSM / HSPA / LTE HSPA 42.2/5.76 Mbps, LTE Cat4...",32GB 3GB RAM,microSDXC (uses shared SIM slot),3.0,32.0


In [50]:
def categorize_os(os_name):
    """Map a raw operating-system label to a coarse OS family.

    Parameters
    ----------
    os_name : str or missing value
        Free-text OS description such as ``'Android 11'`` or ``'iOS 15'``.
        Matching is case-insensitive and substring based.

    Returns
    -------
    str
        ``'android'``, ``'ios'`` or ``'other'``. Anything that is not a
        string (``None``, NaN, numbers) is reported as ``'other'`` instead
        of raising.
    """
    # Missing values arrive as None / float NaN and have no .lower().
    if not isinstance(os_name, str):
        return 'other'

    label = os_name.lower()  # case-insensitive matching

    if 'android' in label:
        return 'android'
    # 'kaios' contains the substring 'ios', so it must be ruled out before
    # the iOS test below.
    if 'kaios' in label:
        return 'other'
    # 'ipados' does not contain 'ios', so it needs its own test. A substring
    # match also covers longer labels such as
    # 'iPadOS 14.1, upgradable to iPadOS 15'.
    if 'ios' in label or 'ipados' in label:
        return 'ios'
    return 'other'
    
def extract_number(s):
    """Return the first number found in ``s`` as a float.

    Parameters
    ----------
    s : str, int, float or None
        Text to scan (e.g. ``'146.7 mm'``). Values that are already numeric
        are returned as floats; ``None`` and NaN are treated as missing.

    Returns
    -------
    float or None
        The first numeric token, or ``None`` when the input is missing or
        contains no digits.
    """
    if s is None:
        return None
    if isinstance(s, (int, float)):
        # NaN is the only float that is not equal to itself.
        return None if s != s else float(s)

    # Pattern kept as-is: a leading sign is only captured for tokens with a
    # decimal point ('-5.5' -> -5.5, but '-5' -> 5.0).
    numbers = re.findall(r"[-+]?\d*\.\d+|\d+", s)
    return float(numbers[0]) if numbers else None

def extract_max(s):
    """Return the largest number found in ``s`` as a float.

    Parameters
    ----------
    s : str, int, float or None
        Text to scan (e.g. ``'8.7 / 9.5 mm'``). Values that are already
        numeric are returned as floats; ``None`` and NaN are treated as
        missing.

    Returns
    -------
    float or None
        The maximum numeric token, or ``None`` when the input is missing or
        contains no digits.
    """
    if s is None:
        return None
    if isinstance(s, (int, float)):
        # NaN is the only float that is not equal to itself.
        return None if s != s else float(s)

    # Pattern kept as-is: a leading sign is only captured for tokens with a
    # decimal point, so '8.7-9.5' yields 8.7 and -9.5.
    values = [float(token) for token in re.findall(r"[-+]?\d*\.\d+|\d+", s)]
    return max(values) if values else None

def ratio_to_decimal(ratio_str):
    """Convert an ``'a:b'`` ratio string to the float ``a / b``.

    Parameters
    ----------
    ratio_str : str or missing value
        Ratio written with a single colon, e.g. ``'18:9'``.

    Returns
    -------
    float or None
        The quotient, or ``None`` when the value cannot be parsed as a
        ratio: not a string, not exactly two parts, non-numeric parts, or a
        zero denominator.
    """
    try:
        numerator, denominator = ratio_str.split(':')
        return float(numerator) / float(denominator)
    # Only the failures that parsing can produce are treated as "no ratio".
    # A bare `except:` would also swallow KeyboardInterrupt / SystemExit.
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        return None

In [75]:
# Reload the raw CSV so the preprocessing starts from untouched data: the
# following cell overwrites and adds columns on `df` in place.
df = pd.read_csv('data.csv')

In [76]:
# --- Basic column clean-up ---------------------------------------------------
# Announcement year as an integer.
# NOTE(review): astype(int) raises if 'Announced' contains missing values —
# presumably data.csv has none; confirm.
df['Announced'] = df['Announced'].astype(int)

# Build a coarse OS family: drop version digits, drop a trailing run of dots
# (with any whitespace before it), trim, then classify with categorize_os.
# The patterns are raw strings so that '\d' and '\s' are not parsed as
# (invalid) string escape sequences.
df['OS_Category'] = (
    df['OS']
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'\s*\.\.*$', '', regex=True)
    .str.strip()
    .apply(categorize_os)
)

# Physical dimensions: first number for length/width, largest number for
# 'Diameter'.
df['Length'] = df['Length'].apply(extract_number)
df['Width'] = df['Width'].apply(extract_number)
df['Diameter'] = df['Diameter'].apply(extract_max)

# Aspect ratio 'a:b' -> a / b (None when it cannot be parsed).
df['ratio'] = df['ratio'].apply(ratio_to_decimal)

def _split_if_present(text):
    """Split a ', '-separated string into its parts; null values give []."""
    if pd.notnull(text):
        return text.split(', ')
    return []


def _split_if_string(text):
    """Split a ', '-separated string into its parts; non-strings give []."""
    if isinstance(text, str):
        return text.split(', ')
    return []


def _leading_bluetooth_number(text):
    """Return the first run of digits/dots in ``text`` as a float, else None."""
    if not pd.notnull(text):
        return None
    found = re.findall(r"[\d.]+", text)
    return float(found[0]) if found else None


# --- Colours: title-case, split into a list, keep the number of options ------
# The individual colours are not multi-hot encoded; only the count is used.
df['Colors_capitalized'] = df['Colors'].str.title()
df['Color_list'] = df['Colors_capitalized'].apply(_split_if_present)
df['Color_count'] = df['Color_list'].apply(len)

# Missing CPU values are filled with 2.0.
df['CPU'] = df['CPU'].fillna(2.0)

# --- Sensors: same treatment as colours --------------------------------------
# The individual sensors are not multi-hot encoded; only the count is used.
df['Sensors_capitalized'] = df['Sensors'].str.title()
df['Sensors_list'] = df['Sensors_capitalized'].apply(_split_if_string)
df['Sensors_count'] = df['Sensors_list'].apply(len)

# --- Bluetooth: leading number of the description, 1.0 when none is found ----
df['Bluetooth_version'] = (
    df['Bluetooth']
    .apply(_leading_bluetooth_number)
    .fillna(1.0)
)

# --- Boolean flags become 0/1 integers ---------------------------------------
boolean_columns = df.select_dtypes(include=['bool']).columns
df[boolean_columns] = df[boolean_columns].astype(int)

# --- One-hot encode the categorical columns ----------------------------------
categorical_cols = ['brand', 'Status', 'OS_Category', 'SIM']

# Missing categories are filled with the column's most frequent value before
# encoding; categories not seen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols)
    ],
    # Every other column is carried through unchanged, after the encoded ones.
    remainder='passthrough',
    # Always return a dense array. With the default threshold the transformer
    # may switch to a sparse matrix once the one-hot block gets wide enough,
    # which cannot hold the string passthrough columns and would break the
    # DataFrame construction below.
    sparse_threshold=0
)
X_transformed = preprocessor.fit_transform(df)

# Rebuild the column labels in the transformer's output order: one-hot columns
# first, then the passthrough columns in their original order.
onehot_columns = (
    preprocessor
    .named_transformers_['cat']
    .named_steps['onehot']
    .get_feature_names_out(categorical_cols)
)
non_categorical_cols = [col for col in df.columns if col not in categorical_cols]
feature_names = list(onehot_columns) + non_categorical_cols
df_transformed = pd.DataFrame(X_transformed, columns=feature_names)

# Drop the free-text / helper columns that are not model features. The stacked
# array mixes strings and numbers, so every column arrives with dtype
# `object`; infer_objects() restores numeric dtypes on the remaining columns.
preprocessed_df = df_transformed.drop(columns=['Display Type', 'OS', 'WLAN', 'GPU', 'Bluetooth', 'Sensors', 'Colors', 'Chipset', 'Network', 'Internal', 'Card slot', 'Colors_capitalized', 'Color_list', 'Sensors_capitalized', 'Sensors_list', 'Unnamed: 0', 'name']).infer_objects()

In [77]:
# Display the encoded feature table (pandas abbreviates the middle rows of a
# long frame in its output).
preprocessed_df

Unnamed: 0,brand_alcatel,brand_apple,brand_asus,brand_blu,brand_htc,brand_huawei,brand_infinix,brand_lenovo,brand_lg,brand_nokia,brand_samsung,brand_sony,brand_xiaomi,brand_zte,Status_Available,Status_Coming,Status_Discontinued,OS_Category_android,OS_Category_ios,OS_Category_other,SIM_Micro-SIM,SIM_Mini-SIM,SIM_Nano-SIM,SIM_eSIM,2G,3G,4G,5G,Announced,Weight,Length,Width,Diameter,Display Size,ppi,body ratio,battery_capacity,Price,CPU,ratio,pixel,Loudspeaker,3.5mm jack,RAM,Storage,Color_count,Sensors_count,Bluetooth_version
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1,0,2022,172.0,146.7,71.9,10.0,5.5,293.0,74.0,3000.0,100.0,4.0,2.0,1036800.0,1,1,2.0,32.0,2,1,5.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1,0,2021,190.0,156.4,74.8,9.7,6.1,282.0,78.1,3000.0,110.0,8.0,0.555556,1123200.0,1,1,2.0,32.0,2,3,4.2
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1,0,2021,134.0,137.6,65.7,9.8,5.0,215.0,71.4,2000.0,60.0,4.0,2.0,460800.0,1,1,1.0,8.0,2,2,4.2
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1,0,2021,194.0,165.6,75.6,8.7,6.52,269.0,82.0,4000.0,330.0,8.0,2.222222,1152000.0,1,1,4.0,64.0,2,4,5.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1,1,1,0,2021,190.0,165.6,75.6,8.8,6.52,269.0,82.0,4000.0,130.0,8.0,2.222222,1152000.0,1,1,3.0,32.0,2,4,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,0,0,2010,113.0,113.5,55.0,12.6,3.2,146.0,46.7,1500.0,140.0,2.0,1.666667,96000.0,1,1,0.25,0.5,2,3,3.0
2647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,0,0,2010,109.0,113.5,55.0,12.9,3.2,146.0,46.7,1500.0,140.0,2.0,1.666667,96000.0,1,1,0.25,0.5,2,3,3.0
2648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,0,0,2010,128.0,119.5,59.8,12.5,3.7,252.0,54.5,1500.0,100.0,2.0,1.666667,384000.0,1,1,0.375,1.0,2,3,2.1
2649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,0,0,2010,119.0,122.4,64.2,9.9,4.0,233.0,58.0,1500.0,120.0,2.0,1.666667,384000.0,1,1,0.5,8.0,2,3,3.0


In [78]:
# Persist the preprocessed features to preprocessed_df.csv in the working
# directory.
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column, which pandas reads back as 'Unnamed: 0'. Pass
# index=False if downstream readers do not expect that column — confirm first.
preprocessed_df.to_csv('preprocessed_df.csv')