In [3]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.




*web scraping of mobile details from flipkart*

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_flipkart_mobile_info(page_number=1):
    base_url = "https://www.flipkart.com"
    url = f"{base_url}/mobiles/pr?sid=tyy%2C4io&marketplace=FLIPKART&page={page_number}"

    mobile_info = []

    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        for product_card in soup.find_all('div', class_='_1AtVbE'):
            name_tag = product_card.find('div', class_='_4rR01T')
            price_tag = product_card.find('div', class_='_30jeq3')
            warranty_tag = product_card.find('div', {'class': '_24_Dny'})
            ram_storage_tag = product_card.find_all('li', class_='rgWa7D')

            brand_tag = product_card.find('div', class_='_2WkVRV')
            rating_tag = product_card.find('div', class_='_3LWZlK')
            display_size_tag = product_card.find('li', {'title': 'Display Size'})
            camera_tag = product_card.find('li', {'title': 'Primary Camera'})

            if name_tag and price_tag:
                mobile_name = name_tag.text.strip()
                mobile_price = price_tag.text.strip()
                mobile_warranty = warranty_tag.text.strip() if warranty_tag else "N/A"

                ram_storage_info = {"RAM": "N/A", "Storage": "N/A"}
                for info in ram_storage_tag:
                    text = info.text.strip()
                    if "RAM" in text:
                        ram_storage_info["RAM"] = text
                    elif "ROM" in text or "Storage" in text:
                        ram_storage_info["Storage"] = text

                brand = brand_tag.text.strip() if brand_tag else "N/A"
                rating = rating_tag.text.strip() if rating_tag else "N/A"
                display_size = display_size_tag.text.strip() if display_size_tag else "N/A"
                primary_camera = camera_tag.text.strip() if camera_tag else "N/A"

                mobile_info.append({
                    "Name": mobile_name,
                    "Price": mobile_price,
                    "Warranty": mobile_warranty,
                    "RAM": ram_storage_info["RAM"],
                    "Storage": ram_storage_info["Storage"],
                    "Brand": brand,
                    "Rating": rating,
                    "Display Size": display_size,
                    "Primary Camera": primary_camera
                })

        return mobile_info
    elif response.status_code == 429:
        print(f"Rate limit reached. Retrying after 3 seconds...")
        time.sleep(3)
        return get_flipkart_mobile_info(page_number=page_number)  # Retry the request after a delay
    else:
        print(f"Failed to retrieve the page {page_number}. Status code: {response.status_code}")
        print(f"HTML Content for page {page_number}: {response.content}")
        return None

# Create an empty list to store all DataFrames
all_dfs = []

# Iterate through pages from 1 to 400
for page_num in range(1, 42):
    mobile_info_page = get_flipkart_mobile_info(page_number=page_num)

    if mobile_info_page:
        try:
            df_page = pd.DataFrame(mobile_info_page)
        except Exception as e:
            print(f"Error creating DataFrame for page {page_num}: {e}")
            print(f"HTML Content for page {page_num}: {response.content}")
            continue  # Skip to the next iteration if DataFrame creation fails

        all_dfs.append(df_page)
        print(f"Mobile Phone Information (Page {page_num}) appended to the overall list.")
    else:
        print(f"No data found for page {page_num}. Skipping to the next page.")

# Concatenate all DataFrames in the list into a single DataFrame
all_mobile_info_df = pd.concat(all_dfs, ignore_index=True)

# Display the overall DataFrame
print(all_mobile_info_df)


Mobile Phone Information (Page 1) appended to the overall list.
Mobile Phone Information (Page 2) appended to the overall list.
Rate limit reached. Retrying after 3 seconds...
Mobile Phone Information (Page 3) appended to the overall list.
Mobile Phone Information (Page 4) appended to the overall list.
Mobile Phone Information (Page 5) appended to the overall list.
Mobile Phone Information (Page 6) appended to the overall list.
Mobile Phone Information (Page 7) appended to the overall list.
Mobile Phone Information (Page 8) appended to the overall list.
Mobile Phone Information (Page 9) appended to the overall list.
Mobile Phone Information (Page 10) appended to the overall list.
Mobile Phone Information (Page 11) appended to the overall list.
Mobile Phone Information (Page 12) appended to the overall list.
Mobile Phone Information (Page 13) appended to the overall list.
Mobile Phone Information (Page 14) appended to the overall list.
Mobile Phone Information (Page 15) appended to the 

dataset saved in a excel format

In [8]:
all_mobile_info_df.to_excel("mobile_info.xlsx", index=False)

cleaning and organising the dataset

In [2]:
import pandas as pd

# Load the Excel file into a DataFrame
file_path = 'mobile_info.xlsx'  # Replace with your file path
df = pd.read_excel(file_path)


In [3]:
df['RAMM'] = df['RAM'].str.split('|').str[0].str.strip()
df['ROM'] = df['RAM'].str.split('|').str[1:].apply(lambda x: '|'.join(x).strip() if isinstance(x, list) else None)
df[['Column1', 'Column2']] = df['Name'].str.extract(r'\(([^,]*),([^)]*)\)')
df['ROM'] = df['ROM'].fillna(df['Column2'])
df['Brand'] = df['Name'].str.split().str[0]


In [4]:
df

Unnamed: 0,Name,Price,Warranty,RAM,Storage,Brand,Rating,Display Size,Primary Camera,RAMM,ROM,Column1,Column2
0,"APPLE iPhone 15 (Black, 128 GB)","₹73,999",,,128 GB ROM,APPLE,4.6,,,,128 GB,Black,128 GB
1,"POCO C51 (Royal Blue, 64 GB)","₹5,799",,4 GB RAM | 64 GB ROM | Expandable Upto 1 TB,,POCO,4.0,,,4 GB RAM,64 GB ROM | Expandable Upto 1 TB,Royal Blue,64 GB
2,"vivo T2x 5G (Aurora Gold, 128 GB)","₹14,999",,8 GB RAM | 128 GB ROM,,vivo,4.3,,,8 GB RAM,128 GB ROM,Aurora Gold,128 GB
3,"APPLE iPhone 15 (Green, 128 GB)","₹73,999",,,128 GB ROM,APPLE,4.6,,,,128 GB,Green,128 GB
4,"vivo T2x 5G (Glimmer Black, 128 GB)","₹12,999",,6 GB RAM | 128 GB ROM,,vivo,4.4,,,6 GB RAM,128 GB ROM,Glimmer Black,128 GB
...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,"vivo V25 5G (Elegant Black, 256 GB)","₹26,790",,12 GB RAM | 256 GB ROM,,vivo,4.2,,,12 GB RAM,256 GB ROM,Elegant Black,256 GB
980,SAMSUNG Metro 313 Dual Sim,"₹2,399",,10 MB RAM | 2 MB ROM | Expandable Upto 16 GB,,SAMSUNG,4.3,,,10 MB RAM,2 MB ROM | Expandable Upto 16 GB,,
981,"vivo V25 5G (Surfing Blue, 256 GB)","₹27,999",,12 GB RAM | 256 GB ROM,,vivo,4.2,,,12 GB RAM,256 GB ROM,Surfing Blue,256 GB
982,"itel S23 (Starry Black, 128 GB)","₹8,399",,8 GB RAM | 128 GB ROM,,itel,4.1,,,8 GB RAM,128 GB ROM,Starry Black,128 GB


In [5]:
df.drop(columns=['Column2','RAM','Storage'],inplace=True)
df.rename(columns={'RAMM': 'RAM','Column1':'Colour','ROM':'Storage'},inplace=True)

In [6]:
df

Unnamed: 0,Name,Price,Warranty,Brand,Rating,Display Size,Primary Camera,RAM,Storage,Colour
0,"APPLE iPhone 15 (Black, 128 GB)","₹73,999",,APPLE,4.6,,,,128 GB,Black
1,"POCO C51 (Royal Blue, 64 GB)","₹5,799",,POCO,4.0,,,4 GB RAM,64 GB ROM | Expandable Upto 1 TB,Royal Blue
2,"vivo T2x 5G (Aurora Gold, 128 GB)","₹14,999",,vivo,4.3,,,8 GB RAM,128 GB ROM,Aurora Gold
3,"APPLE iPhone 15 (Green, 128 GB)","₹73,999",,APPLE,4.6,,,,128 GB,Green
4,"vivo T2x 5G (Glimmer Black, 128 GB)","₹12,999",,vivo,4.4,,,6 GB RAM,128 GB ROM,Glimmer Black
...,...,...,...,...,...,...,...,...,...,...
979,"vivo V25 5G (Elegant Black, 256 GB)","₹26,790",,vivo,4.2,,,12 GB RAM,256 GB ROM,Elegant Black
980,SAMSUNG Metro 313 Dual Sim,"₹2,399",,SAMSUNG,4.3,,,10 MB RAM,2 MB ROM | Expandable Upto 16 GB,
981,"vivo V25 5G (Surfing Blue, 256 GB)","₹27,999",,vivo,4.2,,,12 GB RAM,256 GB ROM,Surfing Blue
982,"itel S23 (Starry Black, 128 GB)","₹8,399",,itel,4.1,,,8 GB RAM,128 GB ROM,Starry Black


In [7]:
import re
df['Name'] = df['Name'].apply(lambda x: re.sub(r'\([^)]*\)', '', x))
df['Name'] = df['Name'].apply(lambda x: ' '.join(x.split()[1:]))
df['Storage'] = df['Storage'].str.extract(r'(\d+ GB)')


df

Unnamed: 0,Name,Price,Warranty,Brand,Rating,Display Size,Primary Camera,RAM,Storage,Colour
0,iPhone 15,"₹73,999",,APPLE,4.6,,,,128 GB,Black
1,C51,"₹5,799",,POCO,4.0,,,4 GB RAM,64 GB,Royal Blue
2,T2x 5G,"₹14,999",,vivo,4.3,,,8 GB RAM,128 GB,Aurora Gold
3,iPhone 15,"₹73,999",,APPLE,4.6,,,,128 GB,Green
4,T2x 5G,"₹12,999",,vivo,4.4,,,6 GB RAM,128 GB,Glimmer Black
...,...,...,...,...,...,...,...,...,...,...
979,V25 5G,"₹26,790",,vivo,4.2,,,12 GB RAM,256 GB,Elegant Black
980,Metro 313 Dual Sim,"₹2,399",,SAMSUNG,4.3,,,10 MB RAM,16 GB,
981,V25 5G,"₹27,999",,vivo,4.2,,,12 GB RAM,256 GB,Surfing Blue
982,S23,"₹8,399",,itel,4.1,,,8 GB RAM,128 GB,Starry Black


In [8]:
df.rename(columns={'Name': 'Model'},inplace=True)
df['Brand'] = df['Brand'].str.capitalize()
df['Model'] = df['Model'].str.capitalize()

In [10]:
df[df.duplicated()]
df.drop_duplicates(inplace=True)
df.to_excel("mob_info.xlsx", index=False)