In [3]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin

In [4]:

# Listing page that every year/page query is issued against
base_url = "https://amul.com/m/amul-hits"

# Output CSV: destination path and header row (fields are written in this order)
csv_file = "amul_hits.csv"
csv_columns = [
    'Year',
    'Description',
    'Image Alt',
    'Image Link',
]

# Function to scrape the data for a given year and l value
def scrape_year(year, l_value, timeout=30):
    """Scrape one listing page for the given year.

    Requests ``base_url`` with the query parameters ``s=<year>`` and
    ``l=<l_value>``, parses the returned HTML and builds one record per
    matched entry.

    Args:
        year: Value sent as the ``s`` query parameter and stored in each
            record under ``'Year'``.
        l_value: Value sent as the ``l`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        A list of dicts with the keys ``'Year'``, ``'Description'``,
        ``'Image Alt'`` and ``'Image Link'``. The list is empty when the page
        has no matching elements or when the request fails.
    """
    try:
        response = requests.get(
            base_url,
            params={'s': year, 'l': l_value},
            timeout=timeout,
        )
        # Turn 4xx/5xx answers into an exception so they are handled below
        response.raise_for_status()
    except requests.RequestException as exc:
        # HTTPError, connection failures and timeouts all derive from
        # RequestException. Report the failure instead of hiding it, and
        # return an empty page so one bad request does not abort a long run.
        print(f"Request failed for year {year} and l={l_value}: {exc}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    descriptions = soup.select('a h2')
    image_alts = soup.select('li a img')
    images = soup.select('.brandslist a')

    # NOTE(review): the three selections are paired purely by position and
    # zip() stops at the shortest one; this assumes every entry on the page
    # contributes exactly one element to each list -- TODO confirm.
    return [
        {
            'Year': year,
            'Description': desc.get_text(strip=True),
            'Image Alt': img_alt.get('alt'),
            # Resolve relative hrefs against the listing URL
            'Image Link': urljoin(base_url, img.get('href')),
        }
        for desc, img_alt, img in zip(descriptions, image_alts, images)
    ]

In [6]:
# In-memory copy of every scraped row; the scraping loop extends this list
# in addition to writing the rows to the CSV file.
all_results = []

In [8]:
from time import sleep

In [9]:

# Write the header once, then stream each scraped page straight into the CSV
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()

    # Years run from 2023 down to 1976; each year is paged through l = 0..10
    for year in range(2023, 1975, -1):
        for l_value in range(11):
            year_data = scrape_year(year, l_value)

            # An empty first page means there is nothing for this year,
            # so the remaining pages are not requested
            if l_value == 0 and not year_data:
                break

            # Keep the rows in memory and persist them to the CSV
            all_results.extend(year_data)
            writer.writerows(year_data)

            # Pause for a second between requests
            sleep(1)
            print(f"Scraped data for year {year} and l={l_value}")


Scraped data for year 2023 and l=0
Scraped data for year 2023 and l=1
Scraped data for year 2023 and l=2
Scraped data for year 2023 and l=3
Scraped data for year 2023 and l=4
Scraped data for year 2023 and l=5
Scraped data for year 2023 and l=6
Scraped data for year 2023 and l=7
Scraped data for year 2023 and l=8
Scraped data for year 2023 and l=9
Scraped data for year 2023 and l=10
Scraped data for year 2022 and l=0
Scraped data for year 2022 and l=1
Scraped data for year 2022 and l=2
Scraped data for year 2022 and l=3
Scraped data for year 2022 and l=4
Scraped data for year 2022 and l=5
Scraped data for year 2022 and l=6
Scraped data for year 2022 and l=7
Scraped data for year 2022 and l=8
Scraped data for year 2022 and l=9
Scraped data for year 2022 and l=10
Scraped data for year 2021 and l=0
Scraped data for year 2021 and l=1
Scraped data for year 2021 and l=2
Scraped data for year 2021 and l=3
Scraped data for year 2021 and l=4
Scraped data for year 2021 and l=5
Scraped data for y

In [11]:
# read the file and download the images into a folder
import os
from time import sleep  # imported here so this cell does not depend on another cell's import

import pandas as pd
import requests

# Read the CSV file
df = pd.read_csv(csv_file)
# Create a folder to store the images
os.makedirs('images', exist_ok=True)

# Rows without a link cannot be fetched, and a repeated link would only
# overwrite the same file again, so both are dropped before downloading.
image_links = df['Image Link'].dropna().drop_duplicates()

for image_link in image_links:
    # Get the image name from the last path segment of the link
    image_name = image_link.split('/')[-1]
    try:
        # The timeout keeps one stalled connection from hanging the whole run
        response = requests.get(image_link, timeout=30)
        # Reject 4xx/5xx answers so an error page is never saved as an image
        response.raise_for_status()
    except requests.RequestException as exc:
        # Report the failure and move on instead of aborting every remaining download
        print(f"Failed to download image {image_name}: {exc}")
        continue
    finally:
        # Pause between requests whether or not this one succeeded
        sleep(1)
    # Save the image
    with open(os.path.join('images', image_name), 'wb') as file:
        file.write(response.content)
    print(f"Downloaded image {image_name}")

Downloaded image amul-hits-3411.jpg
Downloaded image amul-hits-3410.jpg
Downloaded image amul-hits-3409.jpg
Downloaded image amul-hits-3408.jpg
Downloaded image amul-hits-3407.jpg
Downloaded image amul-hits-3405.jpg
Downloaded image amul-hits-3406.jpg
Downloaded image amul-hits-3404.jpg
Downloaded image amul-hits-3403.jpg
Downloaded image amul-hits-3402.jpg
Downloaded image amul-hits-3401.jpg
Downloaded image amul-hits-3400.jpg
Downloaded image amul-hits-3399.jpg
Downloaded image amul-hits-3398.jpg
Downloaded image amul-hits-3397.jpg
Downloaded image amul-hits-3396.jpg
Downloaded image amul-hits-3395.jpg
Downloaded image amul-hits-3394.jpg
Downloaded image amul-hits-3393.jpg
Downloaded image amul-hits-3392.jpg
Downloaded image amul-hits-3390.jpg
Downloaded image amul-hits-3391.jpg
Downloaded image amul-hits-3389.jpg
Downloaded image amul-hits-3387.jpg
Downloaded image amul-hits-3388.jpg
Downloaded image amul-hits-3386.jpg
Downloaded image amul-hits-3385.jpg
Downloaded image amul-hits-3