In [1]:
import pandas as pd
import numpy as np

In [2]:
from time import sleep
from random import random
import requests
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.parse as urlparse
from urllib.parse import parse_qs

In [3]:
# Site root; the search URL and every product link are resolved against it.
BASE_URL = "https://www.flipkart.com/"
# Search term passed to the product-search helper.
SEARCH_QUERY = "mobiles"
# Upper bound on how many search hits are kept.
TOP_N_PRODUCTS = 5
# Number of review pages requested for each kept product.
REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT = 50

In [4]:
def _to_product_reviews_url(product_url):
    """Rewrite a product page URL so that it points at the reviews listing.

    The third "/"-separated piece of the URL path is replaced with
    ``"product-reviews"``; scheme, host and query string are left untouched.

    Returns:
        The rewritten URL, or ``None`` when the path has fewer than three
        pieces and therefore cannot be rewritten.
    """
    parsed_url = urlparse.urlparse(product_url)
    path_parts = parsed_url.path.split("/")
    if len(path_parts) < 3:
        return None
    path_parts[2] = "product-reviews"
    return parsed_url._replace(path="/".join(path_parts)).geturl()


def get_popular_product_s_titles_and_urls(search_query, popular_products_count_limit):
    """Scrape one search-results page and collect product titles and review URLs.

    Args:
        search_query: Search term. It is URL-encoded before being placed in
            the query string, so spaces and ``&`` are safe.
        popular_products_count_limit: Maximum number of products to collect.
            A falsy value (``None`` or ``0``) collects every usable product
            found on the page.

    Returns:
        ``(product_titles, product_urls)``: two lists of equal length where
        ``product_urls[i]`` is the reviews-page URL of ``product_titles[i]``.

    Raises:
        requests.HTTPError: The search page answered with an error status.
        requests.Timeout: The search page did not answer in time.
    """
    encoded_query = urlparse.quote_plus(str(search_query))
    search_url = f"{BASE_URL}search?q={encoded_query}&otracker=categorytree&sort=price_desc"

    # A timeout keeps a stalled connection from hanging the notebook forever;
    # raise_for_status surfaces HTTP errors instead of parsing an error page.
    search_response = requests.get(search_url, timeout=30)
    search_response.raise_for_status()

    search_html_soup = BeautifulSoup(search_response.content, 'html.parser')
    search_results_products = search_html_soup.find_all('div', attrs={"class": "_2kHMtA"})

    product_titles, product_urls = [], []

    for product in tqdm(search_results_products, desc="Search Results Iteration", position=0, leave=True):
        link_tag = product.find("a", attrs={"class": "_1fQZEK"})
        title_tag = product.find("div", attrs={"class": "_4rR01T"})

        # Skip result cards that lack the expected markup rather than crashing
        # on a None dereference.
        if link_tag is None or title_tag is None or not link_tag.get("href"):
            continue

        reviews_url = _to_product_reviews_url(urljoin(BASE_URL, link_tag["href"]))
        if reviews_url is None:
            continue

        # Append title and URL together so both lists always stay aligned.
        product_titles.append(title_tag.get_text())
        product_urls.append(reviews_url)

        if popular_products_count_limit and len(product_urls) >= popular_products_count_limit:
            break

    return product_titles, product_urls

In [5]:
# Collect titles and review-page URLs for the top search hits.
product_titles, product_urls = get_popular_product_s_titles_and_urls(
    SEARCH_QUERY,
    TOP_N_PRODUCTS,
)

Search Results Iteration:   0%|          | 0/24 [00:00<?, ?it/s]

In [6]:
from prettytable import PrettyTable

# One-row overview of the planned crawl: products x reviews per page x pages.
product_total = len(product_urls)
planned_review_total = product_total * 10 * REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT

x = PrettyTable(
    ["# Products", "# Reviews Per Page", "# Pages", "# Total Reviews Count"]
)
x.add_row(
    [
        product_total,
        10,
        REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT,
        planned_review_total,
    ]
)
print(x)

+------------+--------------------+---------+-----------------------+
| # Products | # Reviews Per Page | # Pages | # Total Reviews Count |
+------------+--------------------+---------+-----------------------+
|     5      |         10         |    50   |          2500         |
+------------+--------------------+---------+-----------------------+


In [7]:
# Accumulates one dict per scraped review; turned into a DataFrame afterwards.
dataset = []

for idx, url in enumerate(tqdm(product_urls, desc='products')):
    # The product id comes from the product URL's query string, so it is the
    # same for every review page of this product: read it once per product.
    pid = parse_qs(urlparse.urlparse(url).query)['pid'][0]

    # iterating over review pages
    for i in tqdm(range(1, REVIEW_PAGES_TO_SCRAPE_FROM_PER_PRODUCT + 1), desc="review pages", position=0, leave=False):
        URL = f"{url}&page={i}"

        # Timeout so a stalled connection cannot hang the whole crawl.
        r = requests.get(URL, timeout=30)

        # Pause for 0-1 seconds so the server is not hit with back-to-back
        # requests. This must CALL sleep; assigning to the name `sleep` would
        # shadow the imported function and skip the pause entirely.
        sleep(random())

        soup = BeautifulSoup(r.content, 'html.parser')

        rows = soup.find_all('div', attrs={'class': "col _2wzgFH K0kLPL"})

        for row in rows:
            sub_row = row.find_all('div', attrs={'class': 'row'})
            rating = sub_row[0].find('div').text
            summary = sub_row[0].find('p').text.strip()
            review = sub_row[1].find_all('div')[2].text
            reviewed_by = sub_row[3].find('p', attrs={'class': '_2sc7ZR _2V5EHH'}).text

            # Reset for every review: otherwise a review without a location
            # would silently inherit the previous reviewer's location (or hit
            # a NameError if it is the very first review).
            location = None
            location_row = sub_row[3].find('p', attrs={'class': '_2mcZGG'})
            if location_row:
                location_row = location_row.find_all('span')
                if len(location_row) >= 2:
                    location = location_row[1].text
                    # Keep only the text after the first comma.
                    location = "".join(location.split(",")[1:]).strip()

            date = sub_row[3].find_all('p', attrs={'class': '_2sc7ZR'})[1].text
            sub_row_2 = row.find_all('div', attrs={'class': '_1e9_Zu'})[0].find_all('span', attrs={'class': '_3c3Px5'})

            upvotes = sub_row_2[0].text
            downvotes = sub_row_2[1].text

            dataset.append({'product_id': pid, 'product_title': product_titles[idx], 'rating': rating,
                            'summary': summary, 'review': review, 'reviewed_by': reviewed_by, 'location': location,
                            'date': date,
                            'upvotes': upvotes, 'downvotes': downvotes})
        
        
        
            

            
            

products:   0%|          | 0/5 [00:00<?, ?it/s]

review pages:   0%|          | 0/50 [00:00<?, ?it/s]

review pages:   0%|          | 0/50 [00:00<?, ?it/s]

review pages:   0%|          | 0/50 [00:00<?, ?it/s]

review pages:   0%|          | 0/50 [00:00<?, ?it/s]

review pages:   0%|          | 0/50 [00:00<?, ?it/s]

In [8]:
# One row per scraped review.
df = pd.DataFrame(dataset)

# Show full review text in the preview. None means "no column-width limit";
# the older spelling -1 has been deprecated since pandas 1.0.
with pd.option_context('display.max_colwidth', None):
    display(df.head(5))
    display(df.tail(5))

Unnamed: 0,product_id,product_title,rating,summary,review,reviewed_by,location,date,upvotes,downvotes
0,MOBGHWFHHURZWVKE,"APPLE iPhone 14 Pro Max (Silver, 1 TB)",5,Just wow!,"Finally after long wait I got my 512 GB IPhone 14 Pro Max, Deep Purple Colour. Superb Camera quality. Looks Premium. Upgraded from IPhone XR. Battery backup is nice. Thank you Flipkart for timely delivery.",Sandhya Rani Mohapatra,Bhubaneswar,3 months ago,7591,802
1,MOBGHWFHHURZWVKE,"APPLE iPhone 14 Pro Max (Silver, 1 TB)",5,Terrific purchase,Madly in love with the phone performance is blazing fast I love the pro motion display and dynamic island works as advertised I am in love with that feature 😍 I would like to thank Flipkart for organising the stock for me and delivering my phone on time before the delivery date,Jeevan Batla,Mumbai,3 months ago,4074,636
2,MOBGHWFHHURZWVKE,"APPLE iPhone 14 Pro Max (Silver, 1 TB)",5,Great product,One of the best Smartphone you can buy right now.Let's start with the Pros of the phone:-Dynamic Island is a joy-Always-on screen is gorgeous-Exquisitely premium design-Beautiful 120Hz display-Incredible 48+12+12+12 MP camera system-Blistering performance-Outstanding battery life-Robust iOS 16 softwareNow let's talk a little about the Cons:-One of the heaviest phones you can buy-Expensive,Priyam Gupta,Kanpur,3 months ago,3237,532
3,MOBGHWFHHURZWVKE,"APPLE iPhone 14 Pro Max (Silver, 1 TB)",4,Very Good,"I think there is much hype created for the camera and the dynamic island. If I talk about camera, I have gotten better images with iPhone 13 in comparison to iPhone 14 Pro Max. Design and build is damn good for sure. Display is crisp and clear. Feels premium when you hold it. But it is heavy, so could feel a lot of weight in your hands. Sound is great with the onboard speakers. It is loud and clear. However if you want to feel bass do use a headphone or AirPods.",Abhishek Pushker,Ghaziabad,2 months ago,88,31
4,MOBGHWFHHURZWVKE,"APPLE iPhone 14 Pro Max (Silver, 1 TB)",5,Super!,I purchased iphone first time. I loved the camera quality. But I would ask apple to please provide a feature guide. So we know how to utilize iphone fully and feel like it's worth paying lacs of rupees.,Sumit Uttam,Kanpur,2 months ago,19,4


Unnamed: 0,product_id,product_title,rating,summary,review,reviewed_by,location,date,upvotes,downvotes
257,MOBGHWFHEUDJH7TG,"APPLE iPhone 14 Pro (Deep Purple, 1 TB)",5,Must buy!,Nice phone,MANJEET SINGH,Bengaluru,28 days ago,1,1
258,MOBGHWFHEUDJH7TG,"APPLE iPhone 14 Pro (Deep Purple, 1 TB)",2,Could be way better,Camera is very poor,Rohit Kushwaha,Bhiwadi,3 months ago,15,96
259,MOBGHWFHEUDJH7TG,"APPLE iPhone 14 Pro (Deep Purple, 1 TB)",1,Absolute rubbish!,Very bed,Mahesh kumar Kumar,Samalkha,1 month ago,5,25
260,MOBGHWFHEUDJH7TG,"APPLE iPhone 14 Pro (Deep Purple, 1 TB)",1,Not recommended at all,Wrost product facing many issues.,Gaurav Sonal,Patna,1 month ago,3,22
261,MOBGHWFHEUDJH7TG,"APPLE iPhone 14 Pro (Deep Purple, 1 TB)",5,Wonderful,"My first pro model, upgraded from 12. Thank you flipkart for the discount price compared to shops.",Vishnu T das,Thrissur,Today,0,0
