# library

In [None]:
from typing import List, Dict, Tuple, Iterator, Any
import requests
from bs4 import BeautifulSoup, Tag
import time
import threading
import concurrent.futures
import random
import time
from datetime import datetime
import re

In [None]:
import pandas as pd
import numpy as np

# scraping

In [None]:
# Constants
# Root URL of the site being scraped; hrefs taken from pages are appended to it.
BASE_URL: str = 'https://www.gsmarena.com'

# HTTP headers sent with every request. The User-Agent string identifies the
# client as desktop Chrome 120 on 64-bit Windows 10.
HEADERS: Dict[str, str] = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 '
                  'Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9'
}

In [None]:
# def to make a request
def make_request(url: str, headers: Dict[str, str]) -> "requests.Response | None":
    """Send a GET request to *url* after a fixed 3-second delay.

    Args:
        url: Absolute URL to fetch.
        headers: HTTP headers sent with the request.

    Returns:
        The response when the request succeeds with a non-error status,
        otherwise ``None``. Callers must check for ``None`` before touching
        the response: any ``requests.RequestException`` (including the
        ``HTTPError`` raised by ``raise_for_status`` for 4XX/5XX) is printed
        and swallowed here.
    """
    # Throttle every call so consecutive requests are at least 3 s apart.
    time.sleep(3)
    try:
        response = requests.get(url, timeout=50, headers=headers)
        response.raise_for_status()  # Raises a HTTPError if the response status code is 4XX/5XX
    except requests.RequestException as e:
        print(f"Error while making a request: {e}")
        return None
    return response

In [None]:
# function to get all brands phone we need

def get_all_brands(content: BeautifulSoup) -> Dict[str, str]:
    """Collect the listing-page href of every brand we want to scrape.

    Args:
        content: Parsed page containing the brand menu.

    Returns:
        Mapping of upper-cased brand name to the ``href`` of the first ``<a>``
        inside its menu entry. Only brands in the wanted list are included.
    """
    wanted = ['ALCATEL', 'APPLE', 'ASUS', 'BLU', 'HTC', 'HUAWEI', 'INFINIX', 'LENOVO', 'LG', 'NOKIA', 'SONY', 'XIAOMI', 'ZTE', 'SAMSUNG']
    found: Dict[str, str] = {}
    for entry in content.select('.brandmenu-v2.light.l-box.clearfix ul li'):
        upper_name = entry.text.upper()
        if upper_name not in wanted:
            continue
        found[upper_name] = entry.select_one('a')['href']
    return found

In [None]:
# all detail from each phone

def fetch_phone_data(phone_url: str) -> Dict[str, Any]:
    """Download one phone page and extract its specification fields.

    Args:
        phone_url: Page path relative to ``BASE_URL``.

    Returns:
        On success, a dict with one entry per extracted field (``url``,
        ``name``, ``network``, ... ``price``, ``lable``). Fields missing from
        the page are ``'Unknown'`` (``'unknown'`` for ``build``); list-valued
        fields are the result of splitting the cell text.
        On failure (no response, or a status other than 200), a dict with the
        keys ``error`` and ``url``.
    """
    full_url = f"{BASE_URL}/{phone_url}"
    response = make_request(full_url, HEADERS)
    # make_request returns None when the request fails, so guard before
    # reading status_code.
    if response is None or response.status_code != 200:
        return {'error': 'Failed to fetch phone data because of response is not 200', 'url': full_url}

    soup = BeautifulSoup(response.content, 'html.parser')

    def spec(key):
        """Return the first element whose data-spec attribute equals *key*, or None."""
        return soup.select_one(f'[data-spec="{key}"]')

    def spec_text(key, default='Unknown', **text_kwargs):
        """Return the element's text (get_text(**text_kwargs)) or *default* if absent."""
        element = spec(key)
        return element.get_text(**text_kwargs) if element else default

    def spec_before(key, delimiter):
        """Return the element's text up to the first *delimiter*, or 'Unknown' if absent."""
        element = spec(key)
        return element.get_text().split(delimiter)[0] if element else 'Unknown'

    def spec_split(key, delimiter, default='Unknown', **text_kwargs):
        """Return the element's text split on *delimiter*, or *default* if absent."""
        element = spec(key)
        return element.get_text(**text_kwargs).split(delimiter) if element else default

    def digits_only(text):
        """Keep only digits and dots from *text*."""
        return ''.join(ch for ch in text if ch.isdigit() or ch == '.')

    def find_feature_info(feature_label):
        """Return the text of the cell following the first <td> that mentions *feature_label*.

        The match is case-insensitive; cells without a following sibling <td>
        are skipped. Returns 'Unknown' when nothing qualifies.
        """
        for td in soup.find_all('td'):
            if feature_label.lower() in td.get_text().lower():
                next_td = td.find_next_sibling('td')
                if next_td:
                    return next_td.get_text(strip=True)
        return "Unknown"

    # phone name / network
    name = spec_text('modelname')
    network = spec_text('nettech')

    # launch
    announced_text = spec_text('year', default=None)
    # Default first so the value is always bound, even when the page has no
    # "year" spec at all.
    announcedDate = 'Unknown'
    if announced_text:
        match = re.search(r'(\d{4}), (\w+)', announced_text)
        if match:
            year, month = match.groups()
            try:
                announcedDate = datetime.strptime(f'{year} {month}', '%Y %B').strftime('%Y-%m')
            except ValueError:
                # Second token is not a full month name.
                announcedDate = 'Unknown'

    status_cell = soup.find('td', {'data-spec': 'status'})
    status_text = status_cell.text if status_cell else None

    # Prefer the release date in the status cell, fall back to the announce cell.
    releaseDate = 'Unknown'
    try:
        if status_text and "Released" in status_text:
            release_date_text = status_text.split("Released")[1].strip()
            releaseDate = datetime.strptime(release_date_text, '%Y, %B %d').strftime('%Y-%m')
        elif announced_text and "Released" in announced_text:
            release_date_text = announced_text.split("Released")[1].strip()
            releaseDate = datetime.strptime(release_date_text, '%Y, %B %d').strftime('%Y-%m')
    except ValueError:
        releaseDate = 'Unknown'
    availability = spec_before('status', ".")

    # body
    dimensions = spec_before('dimensions', "mm")
    weight = spec_before('weight', 'g')
    build = spec_split('build', ",", default="unknown")
    sim = spec_split('sim', ",")

    # display
    display_type = spec_text('displaytype')
    display_size_text = spec_text('displaysize')
    displaysizeI = 'Unknown'
    displaysizeCM = 'Unknown'
    displayratio = 'Unknown'
    if display_size_text != 'Unknown':
        parts = display_size_text.split(', ')
        if len(parts) >= 1:
            displaysizeI = parts[0].split(' inches')[0]
        if len(parts) >= 2:
            displaysizeCM = parts[1].split(' cm')[0]
            # The ratio is the content of the last parenthesised group.
            ratio_part = parts[1].split('(')[-1] if '(' in parts[1] else 'Unknown'
            displayratio = ratio_part.replace(')', '').strip() if ')' in ratio_part else 'Unknown'

    resolution = spec_text('displayresolution')
    protection = spec_text('displayprotection')
    other_features = spec_text('displayother')

    # os
    os_text = spec_text('os')
    ostype = 'Unknown'
    osversion = 'Unknown'
    if os_text != 'Unknown':
        # Only the part before the first comma is considered: "<type> <version...>".
        os_parts = os_text.split(",")[0].split()
        if len(os_parts) > 1:
            ostype = os_parts[0]
            osversion = ' '.join(os_parts[1:])
    chipset = spec_text('chipset')
    cpu_text = spec_text('cpu')
    cpu = cpu_text.split(" (")[0] if "(" in cpu_text else cpu_text
    gpu = spec_text('gpu')

    # memory
    card_slot = spec_text('memoryslot', strip=True)
    # Splitting by comma to handle multiple variants
    internal_memory = spec_split('internalmemory', ", ", strip=True)
    memory_other = spec_text('memoryother', strip=True)

    # main camera ("|" separates the individual camera modules)
    mainCam = spec_text('cam1modules', separator="|")
    mainCamNum = len(mainCam.split("|")) if mainCam != 'Unknown' else 0
    mainCamFeatures = spec_split('cam1features', ", ")
    mainCamVideo = spec_split('cam1video', ", ")

    # selfie cam
    selfieCam = spec_text('cam2modules', separator="|")
    selfieCamNum = selfieCam.count("MP")
    selfieCamFeatures = spec_split('cam2features', ", ", separator=", ")
    selfieCamVideo = spec_split('cam2video', ", ", separator=", ")

    # sound
    loudspeaker = find_feature_info("Loudspeaker")
    jack = find_feature_info("3.5mm jack")

    # comms
    wlan = spec_text('wlan', strip=True)
    bluetooth = spec_text('bluetooth', strip=True)
    gps = spec_text('gps', strip=True)
    nfc = spec_text('nfc', strip=True)
    radio = spec_text('radio', strip=True)
    usb = spec_text('usb', strip=True)

    # features
    sensors = spec_split('sensors', ",", strip=True)
    features_other = spec_split('featuresother', "|", separator="|")

    # battery
    batteryType = spec_text('batdescription1', strip=True)
    charging_info_element = soup.find('td', text=lambda x: x and "Charging" in x)
    charging_info = charging_info_element.find_next_sibling('td') if charging_info_element else None
    charging = ' '.join(charging_info.stripped_strings) if charging_info else 'Unknown'

    # misc
    colors = spec_split('colors', ",")
    models = spec_split('models', ",")

    price_element = soup.find('td', {'data-spec': 'price'})
    price_text = price_element.get_text(strip=True) if price_element else 'Unknown'
    prices = {}
    if 'About' in price_text:
        prices['About'] = digits_only(price_text)
    else:
        # "/"-separated list; each part is keyed by the first currency symbol it contains.
        for part in price_text.split('/'):
            for symbol, code in (('$', 'USD'), ('€', 'EUR'), ('£', 'GBP'), ('₹', 'INR')):
                if symbol in part:
                    prices[code] = digits_only(part)
                    break

    # label: 1 when the URL looks like a watch page, else 0
    label = 0
    watch_keywords = ['watch', 'smartwatch']
    if any(keyword in full_url.lower() for keyword in watch_keywords):
        label = 1

    return {
        'url': full_url,
        "name": name,
        "network": network,
        "announcedDate": announcedDate,
        "releaseDate": releaseDate,
        "availability": availability,
        "dimensions": dimensions,
        "weight": weight,
        "build": build,
        "sim": sim,
        "displaytype": display_type,
        "displaysizeI": displaysizeI,
        "displaysizeCM": displaysizeCM,
        "displayratio": displayratio,
        "resolution": resolution,
        "protection": protection,
        "other_features": other_features,
        "ostype": ostype,
        "osversion": osversion,
        "chipset": chipset,
        "cpu": cpu,
        "gpu": gpu,
        "cardslot": card_slot,
        "internal_memory": internal_memory,
        "memory_other": memory_other,
        "mainCam": mainCam,
        "mainCamNum": mainCamNum,
        "mainCamFeatures": mainCamFeatures,
        "mainCamVideo": mainCamVideo,
        "selfieCam": selfieCam,
        "selfieCamNum": selfieCamNum,
        "selfieCamFeatures": selfieCamFeatures,
        "selfieCamVideo": selfieCamVideo,
        "loudspeaker": loudspeaker,
        "jack_info_3.5mm": jack,
        "wlan": wlan,
        "bluetooth": bluetooth,
        "gps": gps,
        "nfc": nfc,
        "radio": radio,
        "usb": usb,
        "sensors": sensors,
        "features_other": features_other,
        "batteryType": batteryType,
        "charging": charging,
        "colors": colors,
        "models": models,
        "price": prices,
        'lable': label,
    }


In [None]:
# get url for each phone from the <img> tag

def phone_href(brand_url: str):
    """Collect the page hrefs of a brand's phones announced in 2010 or later.

    Walks the brand's listing pages through the "Next page" link. The
    announcement year is read from the ``title`` of each phone's ``<img>``.
    Paging stops at the first phone announced before 2010, when there is no
    next page, or when a page cannot be fetched.

    Args:
        brand_url: Brand listing path relative to ``BASE_URL``.

    Returns:
        List of phone hrefs as found in the listing.
    """
    phones_href = []
    next_page_url = f"{BASE_URL}/{brand_url}"
    while next_page_url:
        response = make_request(next_page_url, HEADERS)
        # make_request returns None on failure, so check before using it.
        if response is None or response.status_code != 200:
            print(f"Failed to fetch page: {next_page_url}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        year_before_2010_found = False
        # No delay is needed in this loop: it only parses the page already downloaded.
        for phone in soup.select('.makers ul li a'):
            img_tag = phone.find('img')
            if not (img_tag and 'title' in img_tag.attrs):
                continue
            title_text = img_tag['title']

            # Year = last 4-digit year in the sentence that follows "Announced".
            year_announced = None
            if 'Announced' in title_text:
                announced_part = title_text.partition('Announced')[2].split('.')[0]
                years = re.findall(r'\b(?:19|20)\d{2}\b', announced_part)
                if years:
                    year_announced = int(years[-1])

            if year_announced is None:
                # Without a year the cut-off cannot be applied; keep the phone
                # rather than reuse the year of a previous entry.
                print(f"Announcement year not found in title: {title_text}")
                phones_href.append(phone['href'])
            elif year_announced >= 2010:
                phones_href.append(phone['href'])
            else:
                year_before_2010_found = True
                break

        if year_before_2010_found:
            print("Found a phone announced before 2010, stopping.")
            break

        next_page_link = soup.select_one('.nav-pages a.prevnextbutton[title="Next page"]')
        if next_page_link and 'href' in next_page_link.attrs:
            next_page_url = f"{BASE_URL}/{next_page_link['href']}"
        else:
            print('not more pages')
            next_page_url = None

    return phones_href


In [None]:
# get brands urls

if __name__ == '__main__':
    # Fetch the home page and build the brand -> listing href mapping.
    home_page_response = make_request(BASE_URL, HEADERS)
    # make_request returns None when the request fails, so it must be checked
    # before status_code is read.
    if home_page_response is None:
        print('no response received from the home page')
    elif home_page_response.status_code == 200:
        soup = BeautifulSoup(home_page_response.content, 'html.parser')
        brands = get_all_brands(soup)
        print(brands)
    else:
        print(f'the response is : {home_page_response.status_code}')

## 3 ways to collect data

### first way : collect data for each brands is good idea

In [None]:
# get phones url for each brand
brand = 'NOKIA'
path = brands[brand]
print(f"Fetching phones for {brand}...")
phones_href = phone_href(path)
print(len(phones_href))
for phone in phones_href:
    print(phone)

In [None]:
all_phone_data_NOKIA = []

# Scrape every collected page; keep successful results, report the failures.
for phone_url in phones_href:
    print(phone_url)
    phone_data = fetch_phone_data(phone_url)
    if 'error' in phone_data:
        print(f"Error fetching data for URL: {phone_url}")
        continue
    all_phone_data_NOKIA.append(phone_data)


In [None]:
phones_df_NOKIA = pd.DataFrame(all_phone_data_NOKIA)

# Products are classified by keywords found in their (lower-cased) URL.
watch_keywords = ['watch', 'smartwatch', 'fit']
tablet_keywords = ['tab', 'tablet']
phone_keywords = watch_keywords + tablet_keywords

_watch_mask = phones_df_NOKIA['url'].apply(lambda url: any(word in url.lower() for word in watch_keywords))
_tablet_mask = phones_df_NOKIA['url'].apply(lambda url: any(word in url.lower() for word in tablet_keywords))
_other_mask = phones_df_NOKIA['url'].apply(lambda url: any(word in url.lower() for word in phone_keywords))

df_watch = phones_df_NOKIA[_watch_mask]
df_tablet = phones_df_NOKIA[_tablet_mask]
# A phone is anything matching neither the watch nor the tablet keywords.
df_phone = phones_df_NOKIA[~_other_mask]
df_totall = phones_df_NOKIA

# One workbook per product type, plus one with everything.
df_watch.to_excel('NOKIA_watch.xlsx')
df_tablet.to_excel('NOKIA_tablet.xlsx')
df_phone.to_excel('NOKIA_phone.xlsx')
df_totall.to_excel('NOKIA_product.xlsx')


### second way : collect whole data

In [None]:
all_phone_data = []  # Data for all phones across all brands

for brand, path in brands.items():
    print(f"Fetching phones for {brand}...")
    phones_href = phone_href(path)

    for phone_url in phones_href:
        phone_data = fetch_phone_data(phone_url)
        if 'error' in phone_data:
            print(f"Error fetching data for URL: {phone_url}")
            continue
        # Tag each record with the brand it was listed under.
        all_phone_data.append(dict(phone_data, brand=brand))

# One row per successfully scraped phone
phones_df = pd.DataFrame(all_phone_data)

### threads

In [None]:
import threading
from queue import Queue

phones_data_queue = Queue()

def thread_fetch_phone_data(phone_urls):
    """Scrape each URL in *phone_urls* and put successful results on ``phones_data_queue``.

    Failed fetches (result dicts containing an ``error`` key) are reported
    with a printed message and not queued.
    """
    for phone_url in phone_urls:
        phone_data = fetch_phone_data(phone_url)
        if 'error' in phone_data:
            print(f"Error fetching data for URL: {phone_url}")
            continue
        phones_data_queue.put(phone_data)

def fetch_data_with_two_threads(phones_urls):
    """Scrape *phones_urls* with two worker threads and return the collected records.

    The list is split at its midpoint; each half is handled by a thread
    running ``thread_fetch_phone_data``. After both threads have finished,
    ``phones_data_queue`` is drained into a list, which is returned.
    """
    midpoint = len(phones_urls) // 2
    halves = (phones_urls[:midpoint], phones_urls[midpoint:])

    workers = [
        threading.Thread(target=thread_fetch_phone_data, args=(half,))
        for half in halves
    ]
    for worker in workers:
        worker.start()
    # Wait for both workers before reading the queue.
    for worker in workers:
        worker.join()

    collected = []
    while not phones_data_queue.empty():
        collected.append(phones_data_queue.get())
    return collected

# Use this function instead of the loop to fetch phone data
all_phone_data_NOKIA = fetch_data_with_two_threads(phones_href)


phones_df_NOKIA = pd.DataFrame(all_phone_data_NOKIA)

# Products are classified by keywords found in their (lower-cased) URL.
watch_keywords = ['watch', 'smartwatch', 'fit']
tablet_keywords = ['tab', 'tablet']
phone_keywords = watch_keywords + tablet_keywords

_watch_mask = phones_df_NOKIA['url'].apply(lambda url: any(word in url.lower() for word in watch_keywords))
_tablet_mask = phones_df_NOKIA['url'].apply(lambda url: any(word in url.lower() for word in tablet_keywords))
_other_mask = phones_df_NOKIA['url'].apply(lambda url: any(word in url.lower() for word in phone_keywords))

df_watch = phones_df_NOKIA[_watch_mask]
df_tablet = phones_df_NOKIA[_tablet_mask]
# A phone is anything matching neither the watch nor the tablet keywords.
df_phone = phones_df_NOKIA[~_other_mask]
df_totall = phones_df_NOKIA

# One workbook per product type, plus one with everything.
df_watch.to_excel('NOKIA_watch.xlsx')
df_tablet.to_excel('NOKIA_tablet.xlsx')
df_phone.to_excel('NOKIA_phone.xlsx')
df_totall.to_excel('NOKIA_product.xlsx')

## extract with proxy

In [None]:
url_proxy = 'https://www.us-proxy.org/'
ip_addresses = ['155.94.241.133' ,
'5.161.103.41:88' ,
'104.236.195.90:10009' ,
'172.173.132.85:80' ,
'93.188.161.84:80',
'68.183.48.146:10006',
'143.198.226.25:80',
'198.176.56.42:80',
'31.220.56.210:80',
'155.94.241.134:3128',
'138.68.235.51:80',
'202.5.16.44:80',
'155.94.241.132:3128',
'155.94.241.130:3128',
'72.169.67.61:87',
'34.122.187.196:80',
'198.176.56.39:80',
'104.45.128.122:80',
'216.137.184.253:80',
'198.176.56.43:80']

In [None]:
# 2 way to connect it


def proxy_request(url, headers, ip_addresses, max_attempts=None):
    """GET *url* through randomly chosen proxies until a 200 response arrives.

    Args:
        url: URL to fetch.
        headers: HTTP headers sent with the request.
        ip_addresses: Non-empty sequence of proxy addresses (``host`` or
            ``host:port``, optionally with a scheme).
        max_attempts: Upper bound on the number of tries. ``None`` (the
            default) retries without limit.

    Returns:
        The first response with status 200, or ``None`` once *max_attempts*
        tries have been used.

    Raises:
        ValueError: If *ip_addresses* is empty.
    """
    # An empty list can never yield a proxy, so retrying would loop forever.
    if not ip_addresses:
        raise ValueError("ip_addresses must contain at least one proxy address")

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        proxy = random.randint(0, len(ip_addresses) - 1)
        print(f'which proxy?:{proxy}')
        address = ip_addresses[proxy]
        proxy_url = address if '://' in address else f'http://{address}'
        # requests selects the proxy by the scheme of the target URL, so the
        # proxy has to be registered for https as well as http.
        proxies = {"http": proxy_url, "https": proxy_url}
        try:
            response = requests.get(url, proxies=proxies, timeout=25, headers=headers)
        except requests.RequestException as e:
            # Dead or misbehaving proxy: report it and try another one.
            print(f"Error while making a request: {e}")
            continue
        if response.status_code == 200:
            return response
        print(f"Failed to fetch page: {url}")
    return None

'''
def proxy_request(url, headers, ip_addresses):
    backoff_time = 2
    max_try = 3

    for _ in range(max_try):
        try:
            proxy = random.choice(ip_addresses)
            proxies = {"http": f"http://{proxy}"}
            response = requests.get(url, proxies=proxies, timeout=25, headers=HEADERS)
            response.raise_for_status()
            return response
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 429:
                print(f"Rate limited. Waiting for {backoff_time} seconds.")
                time.sleep(backoff_time)
                backoff_time *= 2
            else:
                raise
        except requests.RequestException as e:
            print(f"Error while making a request: {str(e)}")
            break

    return None
'''

#clean data

In [None]:
# read data and concate this

df_sumsung = pd.read_excel('/content/Samsung_product.xlsx')
df_apple = pd.read_excel('/content/APPLE_product.xlsx')
df_huawei = pd.read_excel('/content/Huawei_product.xlsx')
df_sony = pd.read_excel('/content/SONY_product.xlsx')
df_lg = pd.read_excel('/content/LG_product.xlsx')
df_htc = pd.read_excel('/content/HTC_product.xlsx')
df_lenovo = pd.read_excel('/content/LENOVO_product.xlsx')
df_xiaomi = pd.read_excel('/content/XIAOMI_product.xlsx')
df_asus = pd.read_excel('/content/ASUS_product.xlsx')
df_alcatel = pd.read_excel('/content/ALCATEL_product.xlsx')
df_zte = pd.read_excel('/content/ZTE_product.xlsx')
df_infinix =  pd.read_excel('/content/INFINIX_product.xlsx')
df_nokia = pd.read_excel('/content/NOKIA_product.xlsx')

In [None]:
# Row count of every per-brand frame, printed as "<variable name>:<rows>".
for _label, _frame in (
    ('df_sumsung', df_sumsung),
    ('df_apple', df_apple),
    ('df_huawei', df_huawei),
    ('df_sony', df_sony),
    ('df_lg', df_lg),
    ('df_htc', df_htc),
    ('df_lenovo', df_lenovo),
    ('df_xiaomi', df_xiaomi),
    ('df_asus', df_asus),
    ('df_alcatel', df_alcatel),
    ('df_zte', df_zte),
    ('df_infinix', df_infinix),
    ('df_nokia', df_nokia),
):
    print(f'{_label}:{len(_frame)}')

In [None]:
df = pd.concat([df_sumsung, df_apple, df_huawei, df_sony, df_lg, df_htc, df_lenovo,
                            df_xiaomi, df_asus, df_alcatel, df_zte, df_infinix, df_nokia])

df.reset_index(drop=True, inplace=True)

In [None]:
# df.to_excel('products.xlsx')

## check announce date (retry to fill Unknown values)

In [None]:
df2 = df.copy()

In [None]:
BASE_URL: str = 'https://www.gsmarena.com'

HEADERS: Dict[str, str] = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 '
                  'Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9'
}

def make_request(url: str, headers: Dict[str, str]) -> "requests.Response | None":
    """Send a GET request to *url* after a fixed 3-second delay.

    Args:
        url: Absolute URL to fetch.
        headers: HTTP headers sent with the request.

    Returns:
        The response when the request succeeds with a non-error status,
        otherwise ``None``. Any ``requests.RequestException`` (including the
        ``HTTPError`` raised by ``raise_for_status`` for 4XX/5XX) is printed
        and swallowed here, so callers must check for ``None``.
    """
    # Throttle every call so consecutive requests are at least 3 s apart.
    time.sleep(3)
    try:
        response = requests.get(url, timeout=50, headers=headers)
        response.raise_for_status()  # Raises a HTTPError if the response status code is 4XX/5XX
    except requests.RequestException as e:
        print(f"Error while making a request: {e}")
        return None
    return response

def Fill_announce_data(url: str) -> str:
    """Re-fetch a product page and derive a year-only announce date.

    Args:
        url: Full URL of the product page.

    Returns:
        ``'<year>-01'`` for the first 20xx year found in the page's ``year``
        spec, ``'Unknown'`` when the spec is missing, empty or has no such
        year, and ``'respone is not 200'`` when the request failed.
    """
    response = make_request(url, HEADERS)
    if not response:
        return 'respone is not 200'

    soup = BeautifulSoup(response.content, 'html.parser')
    year_cell = soup.select_one('[data-spec="year"]')
    announced_text = year_cell.text if year_cell else None
    if not announced_text:
        return 'Unknown'

    found = re.search(r'(20\d{2})', announced_text)
    # Only the year is known here, so the month is fixed to January.
    return f'{found.group(1)}-01' if found else 'Unknown'

# Retry the announce date only for rows where the first scrape found none.
for i, row in df2.iterrows():
    if row['announcedDate'] != 'Unknown':
        continue
    Date = Fill_announce_data(row['url'])
    df2.loc[i, 'announcedDate'] = Date

In [None]:
# check to decrease or not ? >> yes decrease
df2.announcedDate.isin(['Unknown']).value_counts()


## set brand label

In [None]:
# (substring searched in the lower-cased URL, label written to brand_label).
# Every pair is tested for every row, so when several substrings occur in
# one URL the label of the last matching pair is the one that remains.
_brand_tokens = (
    ('apple', 'Apple'),
    ('alcatel', 'Alcatel'),
    ('asus', 'Asus'),
    ('htc', 'Htc'),
    ('huawei', 'Huawei'),
    ('infinix', 'Infinix'),
    ('lenovo', 'Lenovo'),
    ('lg', 'Lg'),
    ('nokia', 'Nokia'),
    ('samsung', 'Samsung'),
    ('sony', 'Sony'),
    ('xiaomi', 'Xiaomi'),
    ('zte', 'Zte'),
)

for i, row in df2.iterrows():
    url_lower = row['url'].lower()
    for token, brand_name in _brand_tokens:
        if token in url_lower:
            df2.loc[i, 'brand_label'] = brand_name

In [None]:
df2.brand_label.value_counts()

##  set  product label

In [None]:
tablet_keywords = ['pad', 'ipad' , 'tablet' , 'air' , 'tab']
watch_keywords = ['watch' , 'smartwatch' , 'smart_watch' , 'smaty-watch' , 'fix' ]
mobile_words = tablet_keywords + watch_keywords

# Tablet keywords are checked first, then watch keywords; anything else is a phone.
for index, row in df2.iterrows():
    url_lower = row['url'].lower()

    if any(word in url_lower for word in tablet_keywords):
        product_label = 'Tablet'
    elif any(word in url_lower for word in watch_keywords):
        product_label = 'Watch'
    else:
        product_label = 'Phone'

    df2.loc[index, 'product_label'] = product_label


Note: for the Nokia brand some product labels are wrong, but this is acceptable for now.

## Released data ( check from some resources )

In [None]:
df2.releaseDate.isin(['Unknown']).value_counts()

In [None]:
df3 = df2.copy()

In [None]:
HEADERS: Dict[str, str] = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 '
                  'Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9'
}

def make_request(url: str, headers: Dict[str, str]) -> "requests.Response | None":
    """Send a GET request to *url* after a fixed 1-second delay.

    Args:
        url: Absolute URL to fetch.
        headers: HTTP headers sent with the request.

    Returns:
        The response when the request succeeds with a non-error status,
        otherwise ``None``. Any ``requests.RequestException`` (including the
        ``HTTPError`` raised by ``raise_for_status`` for 4XX/5XX) is printed
        and swallowed here, so callers must check for ``None``.
    """
    # Throttle every call so consecutive requests are at least 1 s apart.
    time.sleep(1)
    try:
        response = requests.get(url, timeout=50, headers=headers)
        response.raise_for_status()  # Raises a HTTPError if the response status code is 4XX/5XX
    except requests.RequestException as e:
        print(f"Error while making a request: {e}")
        return None
    return response

def Fill_Release_data(url: str) -> str:
    """Re-fetch a product page and read its highlighted release text.

    Args:
        url: Full URL of the product page.

    Returns:
        The text of the element with ``data-spec="released-hl"``,
        ``'Unknown'`` when the page has no such element, and
        ``'response is not 200'`` when the request failed.
    """
    response = make_request(url, HEADERS)
    if not response:
        return 'response is not 200'

    soup = BeautifulSoup(response.content, 'html.parser')
    release_pic = soup.select_one('[data-spec="released-hl"]')
    return release_pic.text if release_pic else 'Unknown'


# Retry the release date for rows that are still unknown or failed earlier.
_retry_values = ['Unknown', 'response is not 200']
for i, row in df3.iterrows():
    print(i)
    if row['releaseDate'] not in _retry_values:
        continue
    re_Date = Fill_Release_data(row['url'])
    df3.loc[i, 'releaseDate'] = re_Date

In [None]:
# Count rows still lacking a release date. The failure sentinel must be the
# exact string returned by Fill_Release_data, since isin matches whole values.
df3.releaseDate.isin(['Unknown', 'response is not 200']).value_counts() # decrease to 1

In [None]:
# now time to convert in a unique format
months = ['January','February','March','April','May','June','July','August','September','October','November','December']

def releaase_data_check(value):
    """Normalise a release-date string to ``YYYY-MM``.

    Args:
        value: Raw text such as ``'Released 2010, March'``, ``'2010, Q3'``
            or an already normalised ``'2010-05'``.

    Returns:
        * *value* (without any ``Released`` prefix) if it already contains ``YYYY-MM``;
        * ``'YYYY-MM'`` for ``'YYYY, <full month name>'``;
        * ``'YYYY-01'`` when only a year is usable (e.g. ``'YYYY, Q3'`` or a bare year);
        * ``'not match'`` when no 4-digit year is present.
    """
    if 'Released' in value:
        value = value.split('Released')[-1].strip()

    if re.search(r'(\d{4})-(\d{2})', value):
        return value

    dated = re.search(r'(\d{4}), (\w+)', value)
    if dated:
        year, month = dated.groups()
        if month in months:
            # Position in the list gives the month number.
            return f'{year}-{months.index(month) + 1:02d}'
        # Word after the year is not a month (e.g. a quarter): year only.
        return f'{year}-01'

    bare_year = re.search(r'(\d{4})', value)
    if bare_year:
        return f'{bare_year.group(1)}-01'

    return 'not match'
# Full English month name -> month number.
_MONTH_NUMBERS = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12,
}


def release_data_check2(value):
    """Normalise a release-date value to ``YYYY-MM``.

    Args:
        value: Raw cell value, e.g. ``'Released 2022, May 31'``,
            ``'Released 2010, Q3'`` or ``'2010-05'``. Non-string values
            (such as NaN from an empty spreadsheet cell) are accepted.

    Returns:
        * the ``YYYY-MM`` part if the text already contains one;
        * ``'YYYY-MM'`` for ``'YYYY, <full month name>[ day]'``;
        * ``'YYYY-01'`` when only a year is usable (quarters, bare years);
        * ``'Unknown'`` for non-strings and text without a 4-digit year.
    """
    if not isinstance(value, str):
        return 'Unknown'

    if 'Released' in value:
        value = value.split('Released')[-1].strip()

    match_correct = re.search(r'(\d{4}-\d{2})', value)
    if match_correct:
        return match_correct.group(1)

    # "YYYY, Month" or "YYYY, Month DD"
    match_direct = re.search(r'(\d{4}), (\w+ \d+|\w+)', value)
    if match_direct:
        year, month_day = match_direct.groups()
        month = month_day.split(' ')[0]  # 'May 31' >> 'May'
        month_number = _MONTH_NUMBERS.get(month)
        if month_number is not None:
            return f'{year}-{month_number:02d}'

    # Quarters carry no month, so they collapse to January of that year.
    match_year_quarter = re.search(r'(\d{4}), Q\d', value)
    if match_year_quarter:
        return f'{match_year_quarter.group(1)}-01'

    match_year = re.search(r'(\d{4})', value)
    if match_year:
        return f'{match_year.group(1)}-01'

    return 'Unknown'
#print(release_data_check2('Released 2010, Q3'))
#print(release_data_check2('Released 2022, May 31'))
#print(release_data_check2('Released 2022, May Q3'))
#print(release_data_check2('Released 2010, March'))
#print(release_data_check2('2010-01'))
#print(release_data_check2('2010-05'))



In [None]:
# Work on a copy of df3 and overwrite its 'releaseDate' column with the result
# of release_data_check applied to every value.
# NOTE(review): the cells shown here define releaase_data_check and
# release_data_check2 but no release_data_check -- confirm it is defined in an
# earlier cell, or point this call at the intended helper.
df5 = df3.copy()
df5['releaseDate'] = df3.releaseDate.apply(release_data_check)

In [None]:
'''
df3 = df3.drop('brand_lable', axis=1)
columns_list  = df3.columns.tolist()
for i in columns_list:
    unknown_count = df3[i].isin(['Unknown']).value_counts()
    print(f'Number of unknown for {i} is: {unknown_count}')
    print('//////////')
'''


In [None]:
# Save df5 as an Excel workbook in the current working directory.
# NOTE(review): to_excel is called with its defaults, so the index is
# presumably written as an extra column -- confirm that is wanted.
df5.to_excel('FINAL_PRODUCTS.xlsx')

## core


In [None]:
# Load the products workbook from the absolute path /content/FINAL_PRODUCTS.xlsx.
# NOTE(review): presumably the file written by the to_excel('FINAL_PRODUCTS.xlsx')
# cell -- that only holds if the working directory is /content; confirm.
FINAL_PRODUCTS = pd.read_excel('/content/FINAL_PRODUCTS.xlsx')

In [None]:
# Copy to modify, so FINAL_PRODUCTS keeps the original 'cpu' values for the
# before/after comparison printed in the next cell.
FINAL_PRODUCTS2 = FINAL_PRODUCTS.copy()

In [None]:
# Reduce the free-text 'cpu' description to a core-count label:
# Hexa-core, Dual-core, Quad-core, Triple-core, Deca-core, Octa-core,
# or 'Single_core' when none of them is mentioned.
keywords = ['Hexa-core', 'Dual-core', 'Quad-core', 'Triple-core' , 'Deca-core' , 'Octa-core']


def _core_label(cpu):
    """Return the core-count label for a single 'cpu' cell.

    * 'Unknown' and non-string values (e.g. NaN from an empty Excel cell) are
      returned unchanged instead of raising on the substring test.
    * If the text mentions any entry of ``keywords``, the label is the first
      whitespace-separated token of the text.
      NOTE(review): this assumes the description starts with the core-count
      word -- confirm against the data.
    * Otherwise the label is 'Single_core'.
    """
    if not isinstance(cpu, str) or cpu == 'Unknown':
        return cpu
    if any(keyword in cpu for keyword in keywords):
        return cpu.split()[0].strip()
    return 'Single_core'


# One pass over the column; each cell is written exactly once.
FINAL_PRODUCTS2['cpu'] = FINAL_PRODUCTS2['cpu'].map(_core_label)


# Compare the value distribution before and after the relabelling.
print(FINAL_PRODUCTS['cpu'].value_counts())
print('///////////////////////')
print(FINAL_PRODUCTS2['cpu'].value_counts())

In [None]:
# Save the frame with the relabelled 'cpu' column as FINAL_PRODUCTS2.xlsx
# in the current working directory.
FINAL_PRODUCTS2.to_excel('FINAL_PRODUCTS2.xlsx')

## price

In [None]:
# Load the products workbook for the price step. The frame is named total1
# because that is the name the following cells read (total1.copy(),
# total1.price); it was previously bound to the misspelled name `totall`.
total1 = pd.read_excel('/content/FINAL_PRODUCTS2.xlsx')

In [None]:
# Copy of total1; the price-filling loop further down writes into this copy.
total2 =  total1.copy()

In [None]:
# Count how many rows have a 'price' equal to one of the listed placeholder
# values (True) versus any other value (False).
# NOTE(review): the list contains an empty dict, which is unhashable --
# confirm that isin accepts it with the pandas version in use.
total1.price.isin(['Unknown',{},'']).value_counts()

In [None]:
# Browser-like HTTP headers passed to make_request for every page fetch.
# This repeats the HEADERS definition from the constants cell at the top of
# the notebook with the same values.
HEADERS: Dict[str, str] = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 '
                  'Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,'
              'application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9'
}

def make_request(url:str, headers:Dict[str, str]) -> requests.Response:
    """GET ``url`` with the given headers after a fixed two-second pause.

    Args:
        url: Address to fetch.
        headers: HTTP headers sent with the request.

    Returns:
        The response when the request succeeds with a non-error status.
        When ``requests`` raises a ``RequestException`` (including the
        ``HTTPError`` from ``raise_for_status``), the error is printed and
        ``None`` is returned instead.
    """
    # Pause before every request so consecutive calls are spaced out.
    time.sleep(2)
    try:
        reply = requests.get(url, timeout=50, headers=headers)
        reply.raise_for_status()
    except requests.RequestException as err:
        print(f"Error while making a request: {str(err)}")
        return None
    return reply
'''
def fill_price_data(url: str) -> str:
    response = make_request(url, HEADERS)
    if response:
        soup = BeautifulSoup(response.content, 'html.parser')
        price_element = soup.find('td', {'data-spec': 'price'})
        price_text = price_element.get_text(strip=True) if price_element else 'Unknown'

        prices = {}
        if 'About' in price_text:
            prices['About'] = price_text
        if '/' in price_text:
            for part in price_text.split('/'):
                part = part.strip()
                if '$' in part:
                    prices['USD'] = ''.join(filter(lambda x: x.isdigit() or x == '.', part.replace("$", "").strip()))
                elif '€' in part:
                    prices['EUR'] = ''.join(filter(lambda x: x.isdigit() or x == '.', part.replace("€", "").strip()))
                elif '£' in part:
                    prices['GBP'] = ''.join(filter(lambda x: x.isdigit() or x == '.', part.replace("£", "").strip()))
                elif '₹' in part:
                    prices['INR'] = ''.join(filter(lambda x: x.isdigit() or x == '.', part.replace("₹", "").strip()))
        else :
            prices['Other'] = price_text

        return prices
    else:
        return 'response is not 200'
'''
def fill_price_data(url: str) -> Dict[str, str]:
    """Fetch a page and return its listed price, keyed by currency code.

    The price text is read from the ``<td data-spec="price">`` cell
    ('Unknown' when the cell is absent).

    * Text containing 'About', or containing no '/', is treated as a single
      price: the first of ``$ € £ ₹`` found in it decides the key
      (USD/EUR/GBP/INR) and the value is the text's digits and dots. If none
      of the symbols occurs, the whole text is stored under 'About'.
    * Otherwise the text is split on '/' and each part containing a known
      symbol contributes one entry in the same way.

    Returns:
        The price dictionary, or ``{'Error': ...}`` when the request failed.
    """
    page = make_request(url, HEADERS)
    if not page:
        return {'Error': 'Response is not 200 or request failed'}

    currency_symbols = {'$': 'USD', '€': 'EUR', '£': 'GBP', '₹': 'INR'}

    def only_number(text):
        # Keep just the digits and dots; symbols and spaces drop out here.
        return ''.join(ch for ch in text if ch.isdigit() or ch == '.')

    def first_currency(text):
        # Code of the first listed symbol occurring in text, else None.
        for symbol, code in currency_symbols.items():
            if symbol in text:
                return code
        return None

    soup = BeautifulSoup(page.content, 'html.parser')
    price_cell = soup.find('td', {'data-spec': 'price'})
    if price_cell:
        price_text = price_cell.get_text(strip=True)
    else:
        price_text = 'Unknown'

    prices = {}

    # Single price (or free text such as "About ...").
    if 'About' in price_text or '/' not in price_text:
        code = first_currency(price_text)
        if code is None:
            prices['About'] = price_text
        else:
            prices[code] = only_number(price_text)
        return prices

    # Several prices separated by '/'.
    for part in price_text.split('/'):
        code = first_currency(part)
        if code is not None:
            prices[code] = only_number(part)
    return prices


# Re-fetch the price for every row from the row's 'url' and store the
# resulting dictionary in the 'price' column as its string form.
# The row index is printed as a progress indicator.
for i, row in total2.iterrows():
    print(i)
    re_price = fill_price_data(row['url'])
    total2.loc[i, 'price'] = str(re_price)

In [None]:
# Save the frame with the freshly fetched prices as total2.xlsx.
total2.to_excel('total2.xlsx')

In [None]:
# Separate copy (named `toral2`) that receives the 'price_in_eur' column below.
toral2 = total2.copy()

In [None]:
# fix the price column values

import ast  # To safely evaluate string literal to dictionary

def Convert_to_eur(price_str):
    """Convert a stringified price dictionary to a price in EUR.

    ``price_str`` is expected to be the ``str()`` of a dictionary such as
    ``{'USD': '909.97'}`` or ``{'About': 'About 160 EUR'}``.

    Resolution order:

    1. An ``'EUR'`` entry is returned exactly as stored.
    2. An ``'About'`` entry is read as "<word> <amount> <currency code>": the
       second whitespace-separated token is the amount, converted with the
       rate of the currency code found in the text (EUR, INR, USD, GBP).
    3. Otherwise the first entry whose key has a conversion rate and whose
       amount parses as a number is converted.

    Converted amounts are formatted with two decimals.

    Args:
        price_str: String form of the price dictionary.

    Returns:
        The EUR price as a string, or ``'Unknown'`` when the input cannot be
        parsed or no usable amount is found.
    """
    # Fixed conversion rates to EUR.
    conversions = {'USD': 0.93, 'GBP': 1.17, 'INR': 0.011}

    try:
        # literal_eval only accepts Python literals; it raises SyntaxError for
        # text that is not valid Python (e.g. '' or 'About 160 EUR') and
        # ValueError/TypeError for other unsupported input.
        price_dict = ast.literal_eval(price_str)
    except (ValueError, SyntaxError, TypeError):
        return 'Unknown'

    if not isinstance(price_dict, dict):
        return 'Unknown'

    if 'EUR' in price_dict:
        return price_dict['EUR']

    if 'About' in price_dict:
        about_text = price_dict['About']
        if not isinstance(about_text, str):
            return 'Unknown'

        # Same precedence as the currency checks have always had.
        about_rates = (
            ('EUR', 1.0),
            ('INR', conversions['INR']),
            ('USD', conversions['USD']),
            ('GBP', conversions['GBP']),
        )
        for code, rate in about_rates:
            if code in about_text:
                break
        else:
            return 'Unknown'

        try:
            amount = float(about_text.split()[1])
        except (IndexError, ValueError):
            # Fewer than two tokens, or the amount is not a plain number.
            return 'Unknown'
        return "{:.2f}".format(amount * rate)

    for currency, amount in price_dict.items():
        if currency in conversions:
            try:
                return "{:.2f}".format(float(amount) * conversions[currency])
            except (TypeError, ValueError):
                # Unusable amount (e.g. empty string): try the next currency.
                continue

    return 'Unknown'

# Example inputs in the string form stored in the 'price' column.
# The list is not used by the code in this cell.
price_samples = [
    "{'EUR': '1299.00', 'USD': '1089.00', 'GBP': '1149.00', 'INR': '148900'}",
    "{'About': 'About 160 EUR'}",
    "{'USD': '909.97'}",
    "{'About': 'Unknown'}",
    "{'About': 'About 22000 INR'}"]


# Add the converted price as a new 'price_in_eur' column.
toral2['price_in_eur'] = toral2['price'].apply(Convert_to_eur)

In [None]:
# Replace total2's 'price' column with the EUR values computed on toral2.
total2.price = toral2['price_in_eur']

In [None]:
# Drop the 'lable' column and save the result.
# NOTE: this writes to FINAL_PRODUCTS2.xlsx, the same file name the
# core-label step saved to, so that earlier workbook is overwritten when both
# run in the same working directory.
final_products = total2.drop('lable' , axis=1)
final_products.to_excel('FINAL_PRODUCTS2.xlsx')