In [1]:
import warnings
import pandas as pd
import numpy as np
import re

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('./data/gsmarena_product_data_final_text1.csv')
df.head()

Unnamed: 0,brand,phone_name,text
0,huawei,nova 11 SE,Network Technology GSM / CDMA / HSPA / CDMA200...
1,huawei,Mate 60 Pro+,Network Technology GSM / CDMA / HSPA / CDMA200...
2,huawei,Mate X5,Network Technology GSM / CDMA / HSPA / EVDO / ...
3,huawei,Mate 60 Pro,Network Technology GSM / CDMA / HSPA / CDMA200...
4,huawei,nova Y91,Network Technology GSM / HSPA / LTE 2G bands G...


In [3]:
df.shape

(3019, 3)

In [4]:
df['text'] = df['text'].str.lower()

In [5]:
def get_network(string):
    res = re.search(r"network.*2g bands", string)
    if res is not None:
        network = re.sub(r"network technology|2g bands|\/|gsm", "", res.group()).strip()
        if ("lte" in network) or "5g" in network or "4g" in network:
            network = re.sub(r"cdma|hspa|cdma|cdma2000|evdo|2000|umts", "", network).strip()
            # print(network)
            return network
        else:
            # print("0")
            return "0"
    else:
        # print("0")
        return "0"

In [6]:
df['network'] = df['text'].apply(lambda x: get_network(x))
df.head()

Unnamed: 0,brand,phone_name,text,network
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte


In [7]:
df[df['network'] == "0"].shape

(1200, 4)

In [8]:
drop_idx = df[df['network'] == "0"].index
df = df.drop(index=drop_idx)
df.shape

(1819, 4)

In [9]:
df['text'][0]

'network technology gsm / cdma / hspa / cdma2000 / lte 2g bands gsm 850 / 900 / 1800 / 1900 - sim 1 & sim 2 cdma 800 3g bands hsdpa 850 / 900 / 1700(aws) / 1900 / 2100 cdma2000 1x 4g bands 1, 3, 4, 5, 8, 18, 19, 26, 34, 38, 39, 40, 41 speed hspa, lte-a launch announced 2023, october 31 status available. released 2023, november 03 body dimensions 162.4 x 75.5 x 7.4 mm (6.39 x 2.97 x 0.29 in) weight 186 g (6.56 oz) sim dual sim (nano-sim, dual stand-by) display type oled, 1b colors, 90hz size 6.67 inches, 107.4 cm2 (~87.6% screen-to-body ratio) resolution 1080 x 2400 pixels, 20:9 ratio (~395 ppi density) platform os harmonyos 4.0 chipset qualcomm snapdragon 680 4g (6 nm) cpu octa-core (4x2.4 ghz cortex-a73 & 4x1.9 ghz cortex-a53) gpu adreno 610 memory card slot no internal 256gb, 512gb main camera triple 108 mp, f/1.9, (wide), 1/1.67", 0.64µm, pdaf\r\n8 mp, f/2.2, 112˚ (ultrawide)\r\n2 mp, f/2.4, (macro) features led flash, hdr, panorama video 1080p@30fps selfie camera single 32 mp, f/2.

In [10]:
def get_4g_bands(text):
    pattern = re.compile(r"4g bands.*speed|4g bands.*5g")
    bands_4g = re.search(pattern, text)
    if bands_4g is not None:
        bands = re.sub(r"4g bands|speed|5g bands.*|", "", bands_4g.group()).strip()
        if "lte" not in bands:
            bands = re.findall(r"\d,|\d\d,", bands)
            if len(bands) not in [0, 1]:
                bands = " ".join(bands)
                bands = re.sub(",", "", bands)
                bands = " ".join([str(x) for x in sorted(([int(x) for x in set(bands.split())]))])
                # print(bands)
                return bands
            else:
                # print("1")
                return "1"
        else:
            # print("1")
            return "1"

In [11]:
df['4g_bands'] = df['text'].apply(lambda x: get_4g_bands(x))
df.head(1)

Unnamed: 0,brand,phone_name,text,network,4g_bands
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40


In [12]:
def get_bands_count(text):
    if text not in ["1", "0"]:
        band_count = len(text.split())
        # print(band_count)
        return band_count
    elif text == "1":
        # print(1)
        return 1
    else:
        return 0

In [13]:
df['4g_band_count'] = df['4g_bands'].apply(lambda x: get_bands_count(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14


In [14]:
df['text'][0]

'network technology gsm / cdma / hspa / cdma2000 / lte 2g bands gsm 850 / 900 / 1800 / 1900 - sim 1 & sim 2 cdma 800 3g bands hsdpa 850 / 900 / 1700(aws) / 1900 / 2100 cdma2000 1x 4g bands 1, 3, 4, 5, 8, 18, 19, 26, 34, 38, 39, 40, 41 speed hspa, lte-a launch announced 2023, october 31 status available. released 2023, november 03 body dimensions 162.4 x 75.5 x 7.4 mm (6.39 x 2.97 x 0.29 in) weight 186 g (6.56 oz) sim dual sim (nano-sim, dual stand-by) display type oled, 1b colors, 90hz size 6.67 inches, 107.4 cm2 (~87.6% screen-to-body ratio) resolution 1080 x 2400 pixels, 20:9 ratio (~395 ppi density) platform os harmonyos 4.0 chipset qualcomm snapdragon 680 4g (6 nm) cpu octa-core (4x2.4 ghz cortex-a73 & 4x1.9 ghz cortex-a53) gpu adreno 610 memory card slot no internal 256gb, 512gb main camera triple 108 mp, f/1.9, (wide), 1/1.67", 0.64µm, pdaf\r\n8 mp, f/2.2, 112˚ (ultrawide)\r\n2 mp, f/2.4, (macro) features led flash, hdr, panorama video 1080p@30fps selfie camera single 32 mp, f/2.

In [15]:
def get_5g_bands(text):
    res = re.search(r"5g bands.*speed|5g bands.*announced", text)
    if res is not None:
        bands = re.sub("5g bands|speed", "", res.group()).strip()
        bands = re.sub(r"sa/nsa|sa/nsa/sub6", "1,", bands).strip()
        bands = re.findall(r"\d,|\d\d,|\d\d\d,", bands)
        if len(bands) not in [0, 1]:
            bands = " ".join(bands)
            bands = re.sub(",", "", bands)
            bands = " ".join([str(x) for x in sorted(([int(x) for x in set(bands.split())]))])
            # print(bands)
            return bands
        else:
            # print("1")
            return "1"
    else:
        # print("0")
        return "0"

In [16]:
df['5g_bands'] = df['text'].apply(lambda x: get_5g_bands(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0


In [17]:
df['5g_band_count'] = df['5g_bands'].apply(lambda x: get_bands_count(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0


In [18]:
def get_released_year(text):
    pattern = re.compile(r"status.*dimensions")
    res = re.search(pattern=pattern, string=text)
    if res is not None:
        res = re.sub(r",|status|dimensions|available. released|body", "", res.group()).strip()
        if res not in ["discontinued", "cancelled"]:
            # print(res.split()[0])
            return res.split()[0]
        else:
            # print(res)
            return res
    else:
        # print('0')
        return "no info"

In [19]:
df['released_year'] = df['text'].apply(lambda x: get_released_year(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count,released_year
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0,2023
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1,2023
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0,2023


In [20]:
df['text'][0]

'network technology gsm / cdma / hspa / cdma2000 / lte 2g bands gsm 850 / 900 / 1800 / 1900 - sim 1 & sim 2 cdma 800 3g bands hsdpa 850 / 900 / 1700(aws) / 1900 / 2100 cdma2000 1x 4g bands 1, 3, 4, 5, 8, 18, 19, 26, 34, 38, 39, 40, 41 speed hspa, lte-a launch announced 2023, october 31 status available. released 2023, november 03 body dimensions 162.4 x 75.5 x 7.4 mm (6.39 x 2.97 x 0.29 in) weight 186 g (6.56 oz) sim dual sim (nano-sim, dual stand-by) display type oled, 1b colors, 90hz size 6.67 inches, 107.4 cm2 (~87.6% screen-to-body ratio) resolution 1080 x 2400 pixels, 20:9 ratio (~395 ppi density) platform os harmonyos 4.0 chipset qualcomm snapdragon 680 4g (6 nm) cpu octa-core (4x2.4 ghz cortex-a73 & 4x1.9 ghz cortex-a53) gpu adreno 610 memory card slot no internal 256gb, 512gb main camera triple 108 mp, f/1.9, (wide), 1/1.67", 0.64µm, pdaf\r\n8 mp, f/2.2, 112˚ (ultrawide)\r\n2 mp, f/2.4, (macro) features led flash, hdr, panorama video 1080p@30fps selfie camera single 32 mp, f/2.

In [21]:
def get_full_dimension(string):
    pattern = re.compile(r"dimensions.*weight")
    res = re.search(pattern=pattern, string=string).group()
    if res is not None:
        res = re.sub(r"dimensions|weight|mm.*|unfolded|or.*|cc|\(.*|-.*|:|x", "", res).strip()
        if len(res.split()) == 3:
            # print(res)
            return " ".join(res.split())
        else:
            # print("0 x 0 x 0")
            return "0 0 0"
    else:
        # print("0 x 0 x 0")
        return "0 0 0"

In [22]:
df['data'] = df['text'].apply(lambda x: get_full_dimension(x))

In [23]:
df['height_width_depth'] = df['text'].apply(lambda x: get_full_dimension(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count,released_year,data,height_width_depth
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0,2023,162.4 75.5 7.4,162.4 75.5 7.4
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,163.7 79 8.1,163.7 79 8.1
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1,2023,156.9 141.5 5.3,156.9 141.5 5.3
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,163.7 79 8.1,163.7 79 8.1
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0,2023,171.6 79.9 8.9,171.6 79.9 8.9


In [24]:
def get_hwd(text):
    res = [0.0, 0.0, 0.0] if text == "0 0 0" else [round(float(x), 1) for x in text.split()]
    return pd.Series(res, index=['height', 'width', 'depth'])

In [25]:
df[['height', 'width', 'depth']] = df['height_width_depth'].apply(lambda x: get_hwd(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count,released_year,data,height_width_depth,height,width,depth
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0,2023,162.4 75.5 7.4,162.4 75.5 7.4,162.4,75.5,7.4
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,163.7 79 8.1,163.7 79 8.1,163.7,79.0,8.1
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1,2023,156.9 141.5 5.3,156.9 141.5 5.3,156.9,141.5,5.3
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,163.7 79 8.1,163.7 79 8.1,163.7,79.0,8.1
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0,2023,171.6 79.9 8.9,171.6 79.9 8.9,171.6,79.9,8.9


In [26]:
def get_weight(string):
    res = re.search(r"weight.*sim|weight.*build", string)
    if res is not None:
        weight = re.search(r"\d\d\d g|\d\d g", res.group())
        if weight is not None:
            # print(weight[0])
            return weight[0]
        else:
            # print("0 g")
            return np.nan
    else:
        # print("0 g")
        return np.nan

In [27]:
df['data'] = df['text'].apply(lambda x: get_weight(x))

In [28]:
df['weight'] = df['text'].apply(lambda x: get_weight(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count,released_year,data,height_width_depth,height,width,depth,weight
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0,2023,186 g,162.4 75.5 7.4,162.4,75.5,7.4,186 g
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,163.7 79 8.1,163.7,79.0,8.1,225 g
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1,2023,243 g,156.9 141.5 5.3,156.9,141.5,5.3,243 g
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,163.7 79 8.1,163.7,79.0,8.1,225 g
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0,2023,214 g,171.6 79.9 8.9,171.6,79.9,8.9,214 g


In [29]:
def get_resolution(string):
    res = re.search(r"resolution.*protection|resolution.*os", string)
    if res is not None:
        resolution = re.search(r"resolution.*pixels", res.group())
        ppi = re.search(r"\(~.* ppi", res.group())
        if resolution and ppi is not None:
            resolution = " ".join(re.sub(r",.*|\(.*|pixels.*|resolution|:|x", "", resolution.group()).strip().split())
            ppi = re.sub(r"\(~|", "", ppi.group()).strip()
            # print([resolution, ppi])
            return pd.Series([resolution, ppi], index=['resolution', 'ppi'])
        else:
            # print(["0 0", "0 ppi"])
            return pd.Series(["0 0", "0 ppi"], index=['resolution', 'ppi'])
    else:
        # print(["0 0", "0 ppi"])
        return pd.Series(["0 0", "0 ppi"], index=['resolution', 'ppi'])

In [30]:
df[['resolution', 'ppi']] = df['text'].apply(lambda x: get_resolution(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count,released_year,data,height_width_depth,height,width,depth,weight,resolution,ppi
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0,2023,186 g,162.4 75.5 7.4,162.4,75.5,7.4,186 g,1080 2400,395 ppi
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,163.7 79 8.1,163.7,79.0,8.1,225 g,1260 2720,440 ppi
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1,2023,243 g,156.9 141.5 5.3,156.9,141.5,5.3,243 g,2224 2496,426 ppi
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,163.7 79 8.1,163.7,79.0,8.1,225 g,1260 2720,440 ppi
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0,2023,214 g,171.6 79.9 8.9,171.6,79.9,8.9,214 g,1080 2376,376 ppi


In [31]:
def get_resolution_wh(resolution):
    res = resolution.split()
    return pd.Series([float(res[0]), float(res[1])], index=['display_width', 'display_height'])

In [32]:
df[['display_width', 'display_height']] = df['resolution'].apply(lambda x: get_resolution_wh(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count,released_year,data,height_width_depth,height,width,depth,weight,resolution,ppi,display_width,display_height
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0,2023,186 g,162.4 75.5 7.4,162.4,75.5,7.4,186 g,1080 2400,395 ppi,1080.0,2400.0
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,163.7 79 8.1,163.7,79.0,8.1,225 g,1260 2720,440 ppi,1260.0,2720.0
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1,2023,243 g,156.9 141.5 5.3,156.9,141.5,5.3,243 g,2224 2496,426 ppi,2224.0,2496.0
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,163.7 79 8.1,163.7,79.0,8.1,225 g,1260 2720,440 ppi,1260.0,2720.0
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0,2023,214 g,171.6 79.9 8.9,171.6,79.9,8.9,214 g,1080 2376,376 ppi,1080.0,2376.0


In [33]:
def get_ppi(ppi):
    res = ppi.split()
    return float(res[0])

In [34]:
df['display_ppi'] = df['ppi'].apply(lambda x: get_ppi(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count,released_year,data,height_width_depth,height,width,depth,weight,resolution,ppi,display_width,display_height,display_ppi
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0,2023,186 g,162.4 75.5 7.4,162.4,75.5,7.4,186 g,1080 2400,395 ppi,1080.0,2400.0,395.0
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,163.7 79 8.1,163.7,79.0,8.1,225 g,1260 2720,440 ppi,1260.0,2720.0,440.0
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1,2023,243 g,156.9 141.5 5.3,156.9,141.5,5.3,243 g,2224 2496,426 ppi,2224.0,2496.0,426.0
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,163.7 79 8.1,163.7,79.0,8.1,225 g,1260 2720,440 ppi,1260.0,2720.0,440.0
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0,2023,214 g,171.6 79.9 8.9,171.6,79.9,8.9,214 g,1080 2376,376 ppi,1080.0,2376.0,376.0


In [35]:
def display_size_str(string):
    res = re.search(r"size.*inches,", string)
    if res is not None:
        res = re.sub(r"size|inches,.*", "", res.group()).strip()
        res = str(round(float(res), 1))
        # print(res)
        return res
    else:
        # print("0")
        return "0"

In [36]:
df['display_size_str'] = df['text'].apply(lambda x: display_size_str(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count,released_year,data,...,height,width,depth,weight,resolution,ppi,display_width,display_height,display_ppi,display_size_str
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0,2023,186 g,...,162.4,75.5,7.4,186 g,1080 2400,395 ppi,1080.0,2400.0,395.0,6.7
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,...,163.7,79.0,8.1,225 g,1260 2720,440 ppi,1260.0,2720.0,440.0,6.8
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1,2023,243 g,...,156.9,141.5,5.3,243 g,2224 2496,426 ppi,2224.0,2496.0,426.0,7.8
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,...,163.7,79.0,8.1,225 g,1260 2720,440 ppi,1260.0,2720.0,440.0,6.8
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0,2023,214 g,...,171.6,79.9,8.9,214 g,1080 2376,376 ppi,1080.0,2376.0,376.0,7.0


In [37]:
def display_size(string):
    res = re.search(r"size.*inches,", string)
    if res is not None:
        res = re.sub(r"size|inches,.*", "", res.group()).strip()
        res = round(float(res), 1)
        # print(res)
        return res
    else:
        # print("0")
        return 0

In [38]:
df['display_size'] = df['text'].apply(lambda x: display_size(x))
df.head()

Unnamed: 0,brand,phone_name,text,network,4g_bands,4g_band_count,5g_bands,5g_band_count,released_year,data,...,width,depth,weight,resolution,ppi,display_width,display_height,display_ppi,display_size_str,display_size
0,huawei,nova 11 SE,network technology gsm / cdma / hspa / cdma200...,lte,1 3 4 5 8 18 19 26 34 38 39 40,12,0,0,2023,186 g,...,75.5,7.4,186 g,1080 2400,395 ppi,1080.0,2400.0,395.0,6.7,6.7
1,huawei,Mate 60 Pro+,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,...,79.0,8.1,225 g,1260 2720,440 ppi,1260.0,2720.0,440.0,6.8,6.8
2,huawei,Mate X5,network technology gsm / cdma / hspa / evdo / ...,lte 5g,1,1,1,1,2023,243 g,...,141.5,5.3,243 g,2224 2496,426 ppi,2224.0,2496.0,426.0,7.8,7.8
3,huawei,Mate 60 Pro,network technology gsm / cdma / hspa / cdma200...,lte 5g,1,1,1,1,2023,225 g,...,79.0,8.1,225 g,1260 2720,440 ppi,1260.0,2720.0,440.0,6.8,6.8
4,huawei,nova Y91,network technology gsm / hspa / lte 2g bands g...,lte,1 2 3 4 5 7 8 13 20 26 28 38 40 41,14,0,0,2023,214 g,...,79.9,8.9,214 g,1080 2376,376 ppi,1080.0,2376.0,376.0,7.0,7.0


In [None]:
res = re.search(r"oled", "type: ltpo oled, 1b colors, 120hz size:")
res[0]

In [None]:
def display_oled(string):
    res = re.search(r"type.*size|type.*resolution", string)
    if res is not None:
        oled = re.search(r"oled", res.group())
        if oled is not None:
            return "yes"
        else:
            return "0"
    else:
        return "0"


def display_ips(string):
    res = re.search(r"type.*size|type.*resolution", string)
    if res is not None:
        ips = re.search(r"ips", res.group())
        if ips is not None:
            return "yes"
        else:
            return "0"
    else:
        return "0"


def display_tft(string):
    res = re.search(r"type.*size|type.*resolution", string)
    if res is not None:
        tft = re.search(r"tft", res.group())
        if tft is not None:
            return "yes"
        else:
            return "0"
    else:
        return "0"

In [None]:
df['oled'] = df['text'].apply(lambda x: display_oled(x))
df['ips'] = df['text'].apply(lambda x: display_ips(x))
df['tft'] = df['text'].apply(lambda x: display_tft(x))
df.head()

In [None]:
def display_90hz(string):
    pattern = re.compile(r"type.*size")
    res = re.search(pattern, string)
    if res is not None:
        hz90 = re.search(r"90hz", res.group())
        if hz90 is not None:
            return "yes"
        else:
            return "0"
    else:
        return "0"


def display_120hz(string):
    pattern = re.compile(r"type.*size")
    res = re.search(pattern, string)
    if res is not None:
        hz120 = re.search(r"120hz", res.group())
        if hz120 is not None:
            return "yes"
        else:
            return "0"
    else:
        return "0"

In [None]:
df['90hz'] = df['text'].apply(lambda x: display_90hz(x))
df['120hz'] = df['text'].apply(lambda x: display_120hz(x))
df.head()

In [None]:
df['text'][0]

In [None]:
def get_os(string):
    res = re.search(r"os.*chipset|os.*cpu", string)
    if res is not None:
        res = re.sub(r"cpu|os|chipset|,.*|\(.*|\/.*|oreo", "", res.group()).strip()
        return res
    else:
        return "0"

In [None]:
df['os'] = df['text'].apply(lambda x: get_os(x))
df.head()

In [None]:
df['text'][0]

In [None]:
def get_chipset_size(string):
    res = re.search(r"chipset.*cpu", string)
    if res is not None:
        chipset = re.search(
            r"snapdragon [a-z0-9+ ]{2,8}|helio [a-z0-9+ ]{2,8}|dimensity [a-z0-9+ ]{2,10}|kirin [a-z0-9+ ]{2,8}|exynos [a-z0-9+ ]{2,5}",
            res.group(),
        )
        size = re.search(r"\(.*\)", res.group())
        if chipset and size is not None:
            size = re.sub(r"\(|\).*", "", size.group()).strip()
            chipset = chipset[0].strip()
            # print(chipset)
            return pd.Series([chipset, size], index=['chipset', 'size'])
        else:
            # print("0")
            return pd.Series(["0", "0"], index=['chipset', 'size'])
    else:
        # print("0")
        return pd.Series(["0", "0"], index=['chipset', 'size'])

In [None]:
df[['chipset', 'size']] = df['text'].apply(lambda x: get_chipset_size(x))
df.head()

In [None]:
df['text'][0]

In [None]:
def get_one_gpu(string):
    if string == "adreno 615adreno 616":
        return "adreno 615"

    if string == "adreno 530mali-t880 mp12":
        return "adreno 530"

    if string == "adreno 619mali-g57 mc2":
        return "adreno 619"

    if string == "adreno 730mali-g710 mc10":
        return "adreno 730"

    if string == "broadcom videocore ivmali-400mp2":
        return "broadcom videocore iv"

    if string == "adreno 505adreno 308":
        return "adreno 505"

    if string == "adreno 405mali-t720mp2":
        return "adreno 405"

    if string == "mali-t720mp2adreno 405":
        return "mali-t720mp2"

    if string == "mali-400mali-t720":
        return "mali-400"

    if string == "adreno 306mali-400":
        return "adreno 306"

    if string == "mali-400mp2vivante gc7000 ul":
        return "mali-400mp2"

    if string == "mali-t720mp2adreno 405":
        return "mali-t720mp2"

    if string == "mali-t720mp2adreno 306":
        return "mali-t720mp2"

    if string == "adreno 405mali-t830 mp1":
        return "adreno 405"

    if string == "mali-g77 mp11adreno 650":
        return "mali-g77 mp11"

    if string == "adreno 615adreno 616":
        return "adreno 615"

    if string == "mali-t720mp2mali-400mp2":
        return "mali-t720mp2"

    if string == "mali-g78 mp24adreno 660":
        return "mali-g78 mp2"


two_gpu = (
    "adreno 615adreno 616",
    "adreno 530mali-t880 mp12",
    "adreno 619mali-g57 mc2",
    "adreno 730mali-g710 mc10",
    "broadcom videocore ivmali-400mp2",
    "adreno 505adreno 308",
    "adreno 405mali-t720mp2",
    "mali-t720mp2adreno 405",
    "mali-400mali-t720",
    "adreno 306mali-400",
    "mali-400mp2vivante gc7000 ul",
    "mali-t720mp2adreno 405",
    "mali-t720mp2adreno 306",
    "adreno 405mali-t830 mp1",
    "mali-g77 mp11adreno 650",
    "adreno 615adreno 616",
    "mali-t720mp2mali-400mp2",
    "mali-g78 mp24adreno 660",
)


def get_gpu(string):
    res = re.search(r"gpu.*card|gpu.*internal", string)
    if res is not None:
        res = re.sub(r"gpu|card| -.*|\(.*|gpu|arm", "", res.group()).strip()
        if res not in two_gpu:
            return res
        else:
            res = get_one_gpu(res)
            return res
    else:
        return "0"

In [None]:
df['gpu'] = df['text'].apply(lambda x: get_gpu(x))
df.head()

In [None]:
df['text'][0]

In [None]:
def get_storage(string):
    res = re.search(r"internal.*triple|internal.*dual|internal.*single", string)
    if res is not None:
        storage = re.search(r"\d\dgb|\d\d\dgb", res.group())
        if storage is not None:
            return storage.group().strip()
        else:
            return "0"
    else:
        return "0"


def get_ram(string):
    res = re.search(r"internal.*triple|internal.*dual|internal.*single", string)
    if res is not None:
        storage = re.search(r"\dgb ram|\d\dgb ram", res.group())
        if storage is not None:
            return storage.group().strip()
        else:
            return "0"
    else:
        return "0"


def get_storage_type(string):
    res = re.search(r"internal.*triple|internal.*dual|internal.*single", string)
    if res is not None:
        storage_type = re.search(r"emmc|ufs", res.group())
        if storage_type is not None:
            return storage_type.group().strip()
        else:
            return "0"
    else:
        return "0"

In [None]:
df['storage'] = df['text'].apply(lambda x: get_storage(x))
df.head()

In [None]:
df['ram'] = df['text'].apply(lambda x: get_ram(x))
df.head()

In [None]:
df['storage_type'] = df['text'].apply(lambda x: get_storage_type(x))
df.head()

In [None]:
df['text'][2]

In [None]:
def get_battery(string):
    res = re.search(r"li-ion.*mah|li-po.*mah|silicon-carbon.*mah", string)
    if res is not None:
        battery = re.sub(r",.*|or.*|\(.*|battery .*", "", res.group())
        return battery
    else:
        return "0"

In [None]:
df['battery'] = df['text'].apply(lambda x: get_battery(x))
df.head()

In [None]:
df['text'][0]

In [None]:
def get_camera(string):
    res = re.search(r" [0-9.]{1,3} mp,", string)
    if res is not None:
        back_camera = re.sub(r":|,", "", res.group()).strip()
        return back_camera
    else:
        return "0"

In [None]:
df['back_camera'] = df['text'].apply(lambda x: get_camera(x))
df.head()

In [None]:
df['text'][8]

In [None]:
def get_camera(string):
    res = string.split("single")
    if res is not None:
        front_camera = len(res)
        if front_camera == 2:
            res = re.search(r"\d+ mp,", res[1])
            if res is not None:
                return res.group().strip()
            else:
                return "0"
        elif front_camera == 3:
            res = re.search(r"\d+ mp,", res[2])
            if res is not None:
                return res.group().strip()
            else:
                return "0"
        else:
            return "0"
    else:
        return "0"

In [None]:
df['text'][0]

In [None]:
def get_camera(string):
    res = re.search(r"selfie camera.*mp,", string)
    if res is not None:
        res = re.search(r"[0-9.]{1,3} mp", res.group())
        if res is not None:
            # print(res.group())
            return res.group()
        else:
            # print("0")
            return "0"
    else:
        # print("0")
        return "0"

In [None]:
df['data'] = df['text'].apply(lambda x: get_camera(x))

In [None]:
df['front_camera'] = df['text'].apply(lambda x: get_camera(x))
df.head()

In [None]:
df['text'][0]