In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
headers = {
    "URL": "www.amazon.com",
    "accept-language": "en-US,en;q=0.9,bn;q=0.8",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
}

In [3]:
url = "https://www.gsmarena.com/makers.php3"

In [4]:
response = requests.get(url, headers=headers)

In [5]:
soup = BeautifulSoup(response.content, "html.parser")

In [6]:
tags = soup.select("tr")

In [7]:
data = []

In [8]:
for tag in tags:
    link = tag.select_one("td a").get("href")
    name = tag.select_one("td a").get_text()
    data.append((name, link))

In [9]:
df = pd.DataFrame(np.array(data), columns=["brand", "link"])
df.head()

Unnamed: 0,brand,link
0,Acer100 devices,acer-phones-59.php
1,Allview157 devices,allview-phones-88.php
2,Amoi47 devices,amoi-phones-28.php
3,Archos43 devices,archos-phones-90.php
4,AT&T4 devices,at&t-phones-57.php


In [10]:
import re

In [11]:
def get_brand(text):
    text = re.sub(r"\d.*", "", text)
    return text

In [12]:
df["brand_name"] = df["brand"].apply(lambda x: get_brand(x))
df.head()

Unnamed: 0,brand,link,brand_name
0,Acer100 devices,acer-phones-59.php,Acer
1,Allview157 devices,allview-phones-88.php,Allview
2,Amoi47 devices,amoi-phones-28.php,Amoi
3,Archos43 devices,archos-phones-90.php,Archos
4,AT&T4 devices,at&t-phones-57.php,AT&T


In [13]:
def get_devices(text):
    devices = re.findall(r"\d+", text)
    return devices[0]

In [14]:
df["devices"] = df["brand"].apply(lambda x: get_devices(x))
df.head()

Unnamed: 0,brand,link,brand_name,devices
0,Acer100 devices,acer-phones-59.php,Acer,100
1,Allview157 devices,allview-phones-88.php,Allview,157
2,Amoi47 devices,amoi-phones-28.php,Amoi,47
3,Archos43 devices,archos-phones-90.php,Archos,43
4,AT&T4 devices,at&t-phones-57.php,AT&T,4


In [15]:
df = df[["brand_name", "devices", "link"]]
df.rename(columns={"brand_name": "brand"}, inplace=True)
df.head()

Unnamed: 0,brand,devices,link
0,Acer,100,acer-phones-59.php
1,Allview,157,allview-phones-88.php
2,Amoi,47,amoi-phones-28.php
3,Archos,43,archos-phones-90.php
4,AT&T,4,at&t-phones-57.php


In [16]:
df["brand"] = df["brand"].astype("str").str.lower()

In [17]:
df.to_csv("./data/gsmarena_brand_link.csv", index=False)

In [18]:
df = pd.read_csv("./data/gsmarena_brand_link.csv")
df

Unnamed: 0,brand,devices,link
0,acer,100,acer-phones-59.php
1,allview,157,allview-phones-88.php
2,amoi,47,amoi-phones-28.php
3,archos,43,archos-phones-90.php
4,at&t,4,at&t-phones-57.php
...,...,...,...
56,vodafone,87,vodafone-phones-53.php
57,wnd,5,wnd-phones-51.php
58,xiaomi,364,xiaomi-phones-80.php
59,yezz,113,yezz-phones-78.php


In [19]:
response = requests.get(
    "https://www.gsmarena.com/apple-phones-f-48-0-p1.php", headers=headers
)

soup = BeautifulSoup(response.content, "html.parser")

In [20]:
names = soup.select("div.makers ul li")
len(names)

40

In [21]:
for name in names:
    l = name.find("a").get("href")
    n = name.find("span").get_text()
    print(n, l)

iPhone 15 Pro Max apple_iphone_15_pro_max-12548.php
iPhone 15 Pro apple_iphone_15_pro-12557.php
iPhone 15 Plus apple_iphone_15_plus-12558.php
iPhone 15 apple_iphone_15-12559.php
Watch Ultra 2 apple_watch_ultra_2-12560.php
Watch Series 9 apple_watch_series_9-12561.php
Watch Series 9 Aluminum apple_watch_series_9_aluminum-12562.php
iPad Pro 12.9 (2022) apple_ipad_pro_12_9_(2022)-11939.php
iPad Pro 11 (2022) apple_ipad_pro_11_(2022)-11940.php
iPad (2022) apple_ipad_(2022)-11941.php
iPhone 14 Pro Max apple_iphone_14_pro_max-11773.php
iPhone 14 Pro apple_iphone_14_pro-11860.php
iPhone 14 Plus apple_iphone_14_plus-11862.php
iPhone 14 apple_iphone_14-11861.php
Watch Ultra apple_watch_ultra-11827.php
Watch Series 8 apple_watch_series_8-11866.php
Watch Series 8 Aluminum apple_watch_series_8_aluminum-11864.php
Watch SE (2022) apple_watch_se_(2022)-11865.php
iPhone SE (2022) apple_iphone_se_(2022)-11410.php
iPad Air (2022) apple_ipad_air_(2022)-11411.php
iPhone 13 Pro Max apple_iphone_13_pro_max-

In [22]:
df["link"][0]

'acer-phones-59.php'

In [23]:
def get_code(text):
    code = re.findall(r"\d+", text)
    return code[0]

In [24]:
df["code"] = df["link"].apply(lambda x: get_code(x))
df.head()

Unnamed: 0,brand,devices,link,code
0,acer,100,acer-phones-59.php,59
1,allview,157,allview-phones-88.php,88
2,amoi,47,amoi-phones-28.php,28
3,archos,43,archos-phones-90.php,90
4,at&t,4,at&t-phones-57.php,57


In [25]:
df.brand.unique()

array(['acer', 'allview', 'amoi', 'archos', 'at&t', 'benq', 'bird',
       'blackview', 'bosch', 'casio', 'celkon', 'coolpad', 'dell',
       'emporia', 'ericsson', 'fairphone', 'garmin-asus', 'gionee',
       'haier', 'hp', 'huawei', 'i-mobile', 'infinix', 'inq', 'itel',
       'karbonn', 'lava', 'lenovo', 'maxon', 'meizu', 'microsoft',
       'mitsubishi', 'motorola', 'nec', 'niu', 'nothing', 'o', 'oppo',
       'oukitel', 'panasonic', 'parla', 'plum', 'prestigio', 'qtek',
       'realme', 'samsung', 'sewon', 'siemens', 'sony', 'spice', 'tcl',
       'tel.me.', 'thuraya', 'ulefone', 'vertu', 'vivo', 'vodafone',
       'wnd', 'xiaomi', 'yezz', 'yu'], dtype=object)

In [26]:
df = df[
    df["brand"].isin(
        [
            "huawei",
            "infinix",
            "lava",
            "motorola",
            "nothing",
            "oppo",
            "realme",
            "samsung",
            "vivo",
            "xiaomi",
        ]
    )
]

df["brand"].unique()

array(['huawei', 'infinix', 'lava', 'motorola', 'nothing', 'oppo',
       'realme', 'samsung', 'vivo', 'xiaomi'], dtype=object)

In [27]:
df.reset_index(inplace=True)

In [29]:
print(df["link"][0])
print(df["code"][0])
print(df["brand"][0])

huawei-phones-58.php
58
huawei


In [30]:
link = df["link"][0]
link

'huawei-phones-58.php'

In [31]:
l = list(link.split("-"))

In [32]:
l.pop()

'58.php'

In [33]:
l.insert(1, "-")
l.append("-f-")
l.append("{}")
l.append("-0-p{}.php")

In [34]:
"".join(l)

'huawei-phones-f-{}-0-p{}.php'

In [35]:
df["devices"] = df["devices"].astype("int32")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['devices'] = df['devices'].astype('int32')


In [36]:
df[(df["devices"] > 50)]

Unnamed: 0,index,brand,devices,link,code
0,20,huawei,436,huawei-phones-58.php,58
1,22,infinix,118,infinix-phones-119.php,119
2,26,lava,141,lava-phones-94.php,94
3,32,motorola,603,motorola-phones-4.php,4
5,37,oppo,288,oppo-phones-82.php,82
6,44,realme,179,realme-phones-118.php,118
7,45,samsung,1377,samsung-phones-9.php,9
8,55,vivo,401,vivo-phones-98.php,98
9,58,xiaomi,364,xiaomi-phones-80.php,80


In [37]:
df.drop("index", axis=1, inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop('index', axis=1, inplace=True)


Unnamed: 0,brand,devices,link,code
0,huawei,436,huawei-phones-58.php,58
1,infinix,118,infinix-phones-119.php,119
2,lava,141,lava-phones-94.php,94
3,motorola,603,motorola-phones-4.php,4
4,nothing,2,nothing-phones-128.php,128


In [38]:
df["pages"] = np.int32(round(df["devices"] / 40, 0))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['pages'] = np.int32(round(df['devices']/40, 0))


Unnamed: 0,brand,devices,link,code,pages
0,huawei,436,huawei-phones-58.php,58,11
1,infinix,118,infinix-phones-119.php,119,3
2,lava,141,lava-phones-94.php,94,4
3,motorola,603,motorola-phones-4.php,4,15
4,nothing,2,nothing-phones-128.php,128,0


In [39]:
df.loc[df["brand"] == "nothing", "pages"] = 1

In [40]:
df

Unnamed: 0,brand,devices,link,code,pages
0,huawei,436,huawei-phones-58.php,58,11
1,infinix,118,infinix-phones-119.php,119,3
2,lava,141,lava-phones-94.php,94,4
3,motorola,603,motorola-phones-4.php,4,15
4,nothing,2,nothing-phones-128.php,128,1
5,oppo,288,oppo-phones-82.php,82,7
6,realme,179,realme-phones-118.php,118,4
7,samsung,1377,samsung-phones-9.php,9,34
8,vivo,401,vivo-phones-98.php,98,10
9,xiaomi,364,xiaomi-phones-80.php,80,9


In [41]:
data2 = []

In [42]:
def gen_link(brand, devices, link, code, page):
    prefix = "https://www.gsmarena.com/"
    if page == 1:
        l = prefix + link
        data2.append((brand, devices, link, code, l))
        print(brand, devices, link, code, l)
    else:
        l = list(link.split("-"))
        l.pop()


        l.insert(1, "-")
        l.append("-f-")

        l.append("{}".format(code))

        l.append("-0-p{}.php".format(page))
        l = "".join(l)
        l = prefix + l
        data2.append((brand, devices, link, code, l))
        print(brand, devices, link, code, l)

In [43]:
for brand, devices, link, code, pages in df.values:
    for page in range(1, pages):
        gen_link(brand, devices, link, code, page)

huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-58.php
huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-f-58-0-p2.php
huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-f-58-0-p3.php
huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-f-58-0-p4.php
huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-f-58-0-p5.php
huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-f-58-0-p6.php
huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-f-58-0-p7.php
huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-f-58-0-p8.php
huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-f-58-0-p9.php
huawei 436 huawei-phones-58.php 58 https://www.gsmarena.com/huawei-phones-f-58-0-p10.php
infinix 118 infinix-phones-119.php 119 https://www.gsmarena.com/infinix-phones-119.php
infinix 118 infinix-phones-119.php 119 

In [44]:
int(round(2 / 40, 0))

0

In [45]:
df1 = pd.DataFrame(data2, columns=["brand", "devices", "link", "code", "gen_link"])
df1

Unnamed: 0,brand,devices,link,code,gen_link
0,huawei,436,huawei-phones-58.php,58,https://www.gsmarena.com/huawei-phones-58.php
1,huawei,436,huawei-phones-58.php,58,https://www.gsmarena.com/huawei-phones-f-58-0-...
2,huawei,436,huawei-phones-58.php,58,https://www.gsmarena.com/huawei-phones-f-58-0-...
3,huawei,436,huawei-phones-58.php,58,https://www.gsmarena.com/huawei-phones-f-58-0-...
4,huawei,436,huawei-phones-58.php,58,https://www.gsmarena.com/huawei-phones-f-58-0-...
...,...,...,...,...,...
83,xiaomi,364,xiaomi-phones-80.php,80,https://www.gsmarena.com/xiaomi-phones-f-80-0-...
84,xiaomi,364,xiaomi-phones-80.php,80,https://www.gsmarena.com/xiaomi-phones-f-80-0-...
85,xiaomi,364,xiaomi-phones-80.php,80,https://www.gsmarena.com/xiaomi-phones-f-80-0-...
86,xiaomi,364,xiaomi-phones-80.php,80,https://www.gsmarena.com/xiaomi-phones-f-80-0-...


In [46]:
df1.to_csv("./data/gsmarena_brand_link1.csv", index=False)

In [47]:
len(df1)

88