# Scraping Samsung from GSMArena

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
base_url = "https://www.gsmarena.com/results.php3?sMakers=9"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.5993.118 Safari/537.36"
    )
}

In [3]:
r = requests.get(base_url,HEADERS)
r.raise_for_status()  
soup = BeautifulSoup(r.text, "html.parser")

In [4]:
row_divs = soup.find_all("div", class_="makers")
if row_divs:
    target_row = row_divs[0] 
else:
    print("Error: The 'makers' div was not found on the page.")
    target_row = None

if target_row:
    all_target_li = target_row.select("ul > li")
    first_30_li = all_target_li[:30]

    print(f"Total <li> elements found in the target <ul>: {len(all_target_li)}")
    print(f"Successfully selected the first {len(first_30_li)} list items.")

Total <li> elements found in the target <ul>: 70
Successfully selected the first 30 list items.


In [5]:
print(first_30_li)  

[<li><a href="samsung_galaxy_a56-13603.php"><img src="https://fdn2.gsmarena.com/vv/bigpic/samsung-galaxy-a56-.jpg" title="Samsung Galaxy A56 Android smartphone. Announced Mar 2025. Features 6.7″  display, Exynos 1580 chipset, 5000 mAh battery, 256 GB storage, 12 GB RAM, Corning Gorilla Glass Victus+."/><strong><span>Samsung<br/>Galaxy A56</span></strong></a></li>, <li><a href="samsung_galaxy_s25_ultra-13322.php"><img src="https://fdn2.gsmarena.com/vv/bigpic/samsung-galaxy-s25-ultra-sm-s938.jpg" title="Samsung Galaxy S25 Ultra Android smartphone. Announced Jan 2025. Features 6.9″  display, Snapdragon 8 Elite chipset, 5000 mAh battery, 1024 GB storage, 16 GB RAM, Corning Gorilla Armor 2."/><strong><span>Samsung<br/>Galaxy S25 Ultra</span></strong></a></li>, <li><a href="samsung_galaxy_a17_5g-14041.php"><img src="https://fdn2.gsmarena.com/vv/bigpic/samsung-galaxy-a17-5g.jpg" title="Samsung Galaxy A17 5G Android smartphone. Announced Aug 2025. Features 6.7″  display, Exynos 1330 chipset, 5

In [6]:
new_base_url = "https://www.gsmarena.com/" 
scraped_data = []
for li_tag in first_30_li:
    link_tag = li_tag.find('a')
    name = link_tag.find('strong').get_text(strip=True).replace('\n', ' ')
    
    full_url = new_base_url + link_tag.get('href')
    
    scraped_data.append({
        'name': name, 
        'link': full_url 
    })

In [7]:
scraped_data

[{'name': 'SamsungGalaxy A56',
  'link': 'https://www.gsmarena.com/samsung_galaxy_a56-13603.php'},
 {'name': 'SamsungGalaxy S25 Ultra',
  'link': 'https://www.gsmarena.com/samsung_galaxy_s25_ultra-13322.php'},
 {'name': 'SamsungGalaxy A17',
  'link': 'https://www.gsmarena.com/samsung_galaxy_a17_5g-14041.php'},
 {'name': 'SamsungGalaxy S25',
  'link': 'https://www.gsmarena.com/samsung_galaxy_s25-13610.php'},
 {'name': 'SamsungGalaxy A36',
  'link': 'https://www.gsmarena.com/samsung_galaxy_a36-13497.php'},
 {'name': 'SamsungGalaxy S25 FE',
  'link': 'https://www.gsmarena.com/samsung_galaxy_s25_fe_5g-14042.php'},
 {'name': 'SamsungGalaxy A07 4G',
  'link': 'https://www.gsmarena.com/samsung_galaxy_a07-14066.php'},
 {'name': 'SamsungGalaxy S24 Ultra',
  'link': 'https://www.gsmarena.com/samsung_galaxy_s24_ultra-12771.php'},
 {'name': 'SamsungGalaxy S24',
  'link': 'https://www.gsmarena.com/samsung_galaxy_s24-12773.php'},
 {'name': 'SamsungGalaxy S24 FE',
  'link': 'https://www.gsmarena.com/

In [8]:
import time 
final_data = []

for item in scraped_data:
    url = item['link']
    
   
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        r.raise_for_status()  
        soup = BeautifulSoup(r.text, "html.parser")
        specs_div = soup.find(id="specs-list") 
        specs_html = str(specs_div) if specs_div else "Specs list not found"
        
        final_data.append({
            'name': item['name'],
            'link': url,
            'specs_list_html': specs_html
        })
        
        
        time.sleep(2) 
        
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        final_data.append({
            'name': item['name'],
            'link': url,
            'specs_list_html': f"ERROR: Failed to fetch page - {e}"
        })


print(f"Finished scraping. Total items processed: {len(final_data)}")

Finished scraping. Total items processed: 30


In [9]:
final_data

[{'name': 'SamsungGalaxy A56',
  'link': 'https://www.gsmarena.com/samsung_galaxy_a56-13603.php',
  'specs_list_html': '<div id="specs-list">\n<style type="text/css"> .tr-toggle {  display:none; }  </style>\n<table cellspacing="0">\n<tr class="tr-hover">\n<th rowspan="15" scope="row">Network</th>\n<td class="ttl"><a href="network-bands.php3">Technology</a></td>\n<td class="nfo"><a class="link-network-detail collapse" data-spec="nettech" href="#">GSM / HSPA / LTE / 5G</a></td>\n</tr>\n<tr class="tr-toggle">\n<td class="ttl"><a href="network-bands.php3">2G bands</a></td>\n<td class="nfo" data-spec="net2g">GSM 850 / 900 / 1800 / 1900 </td>\n</tr><tr class="tr-toggle">\n<td class="ttl"><a href="network-bands.php3">3G bands</a></td>\n<td class="nfo" data-spec="net3g">HSDPA 850 / 900 / 1700(AWS) / 1900 / 2100 </td>\n</tr>\n<tr class="tr-toggle">\n<td class="ttl"><a href="network-bands.php3">4G bands</a></td>\n<td class="nfo" data-spec="net4g">1, 2, 3, 4, 5, 7, 8, 12, 17, 20, 25, 26, 28, 32, 

In [10]:
import re

STATUS_PATTERN = re.compile(r'data-spec="status">([^<]+)')
DISPLAY_PATTERN = re.compile(r'data-spec="displaysize">([^<]+)')
BATTERY_PATTERN = re.compile(r'data-spec="batdescription1">([^<]+)')
CAMERA_PATTERN = re.compile(r'data-spec="cam1modules">([^<]+)')
MEMORY_PATTERN = re.compile(r'data-spec="internalmemory">([^<]+)')
PRICE_PATTERN = re.compile(r'data-spec="price"><a[^>]*>([^<]+)')

for item in final_data:
    html_str = item['specs_list_html']
    
    match = STATUS_PATTERN.search(html_str)
    item['released'] = match.group(1).strip() if match else 'N/A'
    
    match = DISPLAY_PATTERN.search(html_str)
    item['display'] = match.group(1).strip() if match else 'N/A'
    
    match = BATTERY_PATTERN.search(html_str)
    item['battery'] = match.group(1).strip() if match else 'N/A'
    
    match = CAMERA_PATTERN.search(html_str)
    if match:
        raw_camera = match.group(1).replace('\\r\\n', ' ').replace('<br>', ' ').strip()
        item['camera'] = re.sub('<[^>]+>', '', raw_camera)
    else:
        item['camera'] = 'N/A'
    
    match = MEMORY_PATTERN.search(html_str)
    item['memory'] = match.group(1).strip() if match else 'N/A'
    
    match = PRICE_PATTERN.search(html_str)
    if match:
        item['price'] = match.group(1).strip().replace('\u2009', ' ')
    else:
        item['price'] = 'N/A'
        
    del item['specs_list_html']

In [11]:
final_data

[{'name': 'SamsungGalaxy A56',
  'link': 'https://www.gsmarena.com/samsung_galaxy_a56-13603.php',
  'released': 'Available. Released 2025, March 10',
  'display': '6.7 inches, 110.2 cm',
  'battery': '5000 mAh',
  'camera': '50 MP, f/1.8, (wide), 1/1.56", 1.0µm, PDAF, OIS',
  'memory': '128GB 6GB RAM, 128GB 8GB RAM, 256GB 6GB RAM, 256GB 8GB RAM, 256GB 12GB RAM',
  'price': '$ 305.00 / € 290.00 / £ 252.00 / ₹ 32,724'},
 {'name': 'SamsungGalaxy S25 Ultra',
  'link': 'https://www.gsmarena.com/samsung_galaxy_s25_ultra-13322.php',
  'released': 'Available. Released 2025, February 03',
  'display': '6.9 inches, 116.9 cm',
  'battery': 'Li-Ion 5000 mAh',
  'camera': '200 MP, f/1.7, 24mm (wide), 1/1.3", 0.6µm, multi-directional PDAF, OIS',
  'memory': '256GB 12GB RAM, 512GB 12GB RAM, 1TB 12GB RAM, 1TB 16GB RAM',
  'price': '$ 689.94 / € 920.00 / £ 770.00 / ₹ 97,500'},
 {'name': 'SamsungGalaxy A17',
  'link': 'https://www.gsmarena.com/samsung_galaxy_a17_5g-14041.php',
  'released': 'Available. 

In [12]:
unique_ram = set()
unique_storage = set()

MEMORY_PATTERN = re.compile(r'(\d+GB)(?:\s+RAM)?')

for item in final_data:
    memory_string = item['memory']
    
    configurations = [s.strip() for s in memory_string.split(',')]
    
    for config in configurations:
        matches = MEMORY_PATTERN.findall(config)
        
        if len(matches) == 2:
            unique_storage.add(matches[0].strip())
            unique_ram.add(matches[1].strip())
        elif len(matches) == 1 and 'RAM' in config:
             unique_ram.add(matches[0].strip())
        elif len(matches) == 1:
            unique_storage.add(matches[0].strip())

    item['ram'] = ', '.join(sorted(list(unique_ram), key=lambda x: int(re.findall(r'\d+', x)[0])))
    item['storage'] = ', '.join(sorted(list(unique_storage), key=lambda x: int(re.findall(r'\d+', x)[0])))
    
    del item['memory']
    
    unique_ram.clear()
    unique_storage.clear()

In [13]:
final_data

[{'name': 'SamsungGalaxy A56',
  'link': 'https://www.gsmarena.com/samsung_galaxy_a56-13603.php',
  'released': 'Available. Released 2025, March 10',
  'display': '6.7 inches, 110.2 cm',
  'battery': '5000 mAh',
  'camera': '50 MP, f/1.8, (wide), 1/1.56", 1.0µm, PDAF, OIS',
  'price': '$ 305.00 / € 290.00 / £ 252.00 / ₹ 32,724',
  'ram': '6GB, 8GB, 12GB',
  'storage': '128GB, 256GB'},
 {'name': 'SamsungGalaxy S25 Ultra',
  'link': 'https://www.gsmarena.com/samsung_galaxy_s25_ultra-13322.php',
  'released': 'Available. Released 2025, February 03',
  'display': '6.9 inches, 116.9 cm',
  'battery': 'Li-Ion 5000 mAh',
  'camera': '200 MP, f/1.7, 24mm (wide), 1/1.3", 0.6µm, multi-directional PDAF, OIS',
  'price': '$ 689.94 / € 920.00 / £ 770.00 / ₹ 97,500',
  'ram': '12GB, 16GB',
  'storage': '256GB, 512GB'},
 {'name': 'SamsungGalaxy A17',
  'link': 'https://www.gsmarena.com/samsung_galaxy_a17_5g-14041.php',
  'released': 'Available. Released 2025, August 14',
  'display': '6.7 inches, 110

In [14]:
df = pd.DataFrame(final_data)

In [15]:
df

Unnamed: 0,name,link,released,display,battery,camera,price,ram,storage
0,SamsungGalaxy A56,https://www.gsmarena.com/samsung_galaxy_a56-13...,"Available. Released 2025, March 10","6.7 inches, 110.2 cm",5000 mAh,"50 MP, f/1.8, (wide), 1/1.56"", 1.0µm, PDAF, OIS","$ 305.00 / € 290.00 / £ 252.00 / ₹ 32,724","6GB, 8GB, 12GB","128GB, 256GB"
1,SamsungGalaxy S25 Ultra,https://www.gsmarena.com/samsung_galaxy_s25_ul...,"Available. Released 2025, February 03","6.9 inches, 116.9 cm",Li-Ion 5000 mAh,"200 MP, f/1.7, 24mm (wide), 1/1.3"", 0.6µm, mul...","$ 689.94 / € 920.00 / £ 770.00 / ₹ 97,500","12GB, 16GB","256GB, 512GB"
2,SamsungGalaxy A17,https://www.gsmarena.com/samsung_galaxy_a17_5g...,"Available. Released 2025, August 14","6.7 inches, 110.2 cm",5000 mAh,"50 MP, f/1.8, (wide), 1/2.76"", 0.64µm, AF, OIS","$ 205.00 / € 168.00 / £ 182.00 / ₹ 18,999","4GB, 6GB, 8GB","128GB, 256GB"
3,SamsungGalaxy S25,https://www.gsmarena.com/samsung_galaxy_s25-13...,"Available. Released 2025, February 03","6.2 inches, 94.4 cm",Li-Ion 4000 mAh,"50 MP, f/1.8, 24mm (wide), 1/1.56"", 1.0µm, dua...","$ 457.00 / € 519.00 / £ 479.99 / ₹ 61,350",12GB,"128GB, 256GB, 512GB"
4,SamsungGalaxy A36,https://www.gsmarena.com/samsung_galaxy_a36-13...,"Available. Released 2025, March 10","6.7 inches, 110.2 cm",5000 mAh,"50 MP, f/1.8, (wide), 1/1.96"", PDAF, OIS","$ 248.00 / € 242.99 / £ 215.00 / ₹ 28,499","6GB, 8GB, 12GB","128GB, 256GB"
5,SamsungGalaxy S25 FE,https://www.gsmarena.com/samsung_galaxy_s25_fe...,"Available. Released 2025, September 04","6.7 inches, 110.2 cm",4900 mAh,"50 MP, f/1.8, 24mm (wide), 1/1.57"", 1.0µm, dua...","$ 623.99 / € 578.00 / £ 599.00 / ₹ 65,999",8GB,"128GB, 256GB, 512GB"
6,SamsungGalaxy A07 4G,https://www.gsmarena.com/samsung_galaxy_a07-14...,"Available. Released 2025, August 25","6.7 inches, 108.4 cm",5000 mAh,"50 MP, f/1.8, (wide), 1/2.76"", 0.64µm, PDAF",$ 143.64,"4GB, 6GB, 8GB","64GB, 128GB, 256GB"
7,SamsungGalaxy S24 Ultra,https://www.gsmarena.com/samsung_galaxy_s24_ul...,"Available. Released 2024, January 24","6.8 inches, 113.5 cm",Li-Ion 5000 mAh,"200 MP, f/1.7, 24mm (wide), 1/1.3"", 0.6µm, mul...","$ 574.90 / € 705.34 / £ 579.95 / ₹ 79,999",12GB,"256GB, 512GB"
8,SamsungGalaxy S24,https://www.gsmarena.com/samsung_galaxy_s24-12...,"Available. Released 2024, January 24","6.2 inches, 94.4 cm",Li-Ion 4000 mAh,"50 MP, f/1.8, 24mm (wide), 1/1.56"", 1.0µm, dua...","$ 289.95 / € 430.00 / £ 368.00 / ₹ 41,560","8GB, 12GB","128GB, 256GB, 512GB"
9,SamsungGalaxy S24 FE,https://www.gsmarena.com/samsung_galaxy_s24_fe...,"Available. Released 2024, October 03","6.7 inches, 110.2 cm",4700 mAh,"50 MP, f/1.8, 24mm (wide), 1/1.57"", 1.0µm, dua...","$ 250.00 / € 391.90 / £ 364.00 / ₹ 30,999",8GB,"128GB, 256GB, 512GB"


# DB- PostgreSQL 

In [16]:
from sqlalchemy import create_engine
import psycopg2

DB_USER = 'postgres'
DB_PASSWORD = 'password'
DB_HOST = 'localhost'
DB_PORT = '5432'
NEW_DB_NAME = 'samsung-scraped'
TABLE_NAME = 'mobile_specs'

try:
    conn = psycopg2.connect(
        database='postgres',
        user=DB_USER,
        password=DB_PASSWORD,
        host=DB_HOST,
        port=DB_PORT
    )
    conn.autocommit = True
    cursor = conn.cursor()

    cursor.execute(f"SELECT 1 FROM pg_database WHERE datname = '{NEW_DB_NAME}'")
    exists = cursor.fetchone()
    if not exists:
        cursor.execute(f"CREATE DATABASE \"{NEW_DB_NAME}\"")
        print(f"Database '{NEW_DB_NAME}' created successfully.")
    else:
        print(f"Database '{NEW_DB_NAME}' already exists.")

    cursor.close()
    conn.close()

except psycopg2.OperationalError as e:
    print(f"Error connecting to PostgreSQL: {e}")
    print("Please check your PostgreSQL server status and credentials (user/password).")

Database 'samsung-scraped' already exists.


In [17]:
DB_CONNECTION_STRING = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{NEW_DB_NAME}"

try:
    engine = create_engine(DB_CONNECTION_STRING)
    print("SQLAlchemy Engine created successfully.")

    df.to_sql(
        TABLE_NAME, 
        engine, 
        if_exists='replace', 
        index=False         
    )
    print(f"DataFrame successfully inserted into table '{TABLE_NAME}'.")

except Exception as e:
    print(f"Error during data insertion: {e}")

SQLAlchemy Engine created successfully.
DataFrame successfully inserted into table 'mobile_specs'.


In [18]:
sql_query = f"SELECT * FROM {TABLE_NAME} LIMIT 5"

try:
    df_check = pd.read_sql(sql_query, engine)
    
    print(f"\nSuccessfully retrieved data from table '{TABLE_NAME}'.")
    print("\n--- First 5 rows of the new PostgreSQL table ---")
    print(df_check)
    print("\n---------------------------------------------------")
    print(df_check.info())

except Exception as e:
    print(f"Error during verification query: {e}")


Successfully retrieved data from table 'mobile_specs'.

--- First 5 rows of the new PostgreSQL table ---
                      name                                               link  \
0        SamsungGalaxy A56  https://www.gsmarena.com/samsung_galaxy_a56-13...   
1  SamsungGalaxy S25 Ultra  https://www.gsmarena.com/samsung_galaxy_s25_ul...   
2        SamsungGalaxy A17  https://www.gsmarena.com/samsung_galaxy_a17_5g...   
3        SamsungGalaxy S25  https://www.gsmarena.com/samsung_galaxy_s25-13...   
4        SamsungGalaxy A36  https://www.gsmarena.com/samsung_galaxy_a36-13...   

                                released               display  \
0     Available. Released 2025, March 10  6.7 inches, 110.2 cm   
1  Available. Released 2025, February 03  6.9 inches, 116.9 cm   
2    Available. Released 2025, August 14  6.7 inches, 110.2 cm   
3  Available. Released 2025, February 03   6.2 inches, 94.4 cm   
4     Available. Released 2025, March 10  6.7 inches, 110.2 cm   

          

# Dumping PostGreSQL Table into a .sql File

In [None]:
import subprocess
import os
DUMP_FILE= "db.sql"
dump_command = [
    'pg_dump',
    '-U', DB_USER,     # Username
    '-h', DB_HOST,     # Host
    '-p', DB_PORT,     # Port
    '-d', NEW_DB_NAME,     # Database name
    '-f', DUMP_FILE,   # Output file path
]

try:
    print(f"Running command: pg_dump -U {DB_USER} -d {NEW_DB_NAME} -f {DUMP_FILE}")
    subprocess.run(dump_command, check=True, capture_output=True, text=True)
    
    print(f"Database dump successful! File created at: {DUMP_FILE}")

except subprocess.CalledProcessError as e:
    print(f"pg_dump failed with error code {e.returncode}")
    print(f"STDERR: {e.stderr}")
    
except FileNotFoundError:
    print("Error: 'pg_dump' command not found. Ensure the PostgreSQL bin directory is in your system's PATH.")


Running command: pg_dump -U postgres -d samsung-scraped -f ../data/raw/db.sql
Database dump successful! File created at: ../data/raw/db.sql
