## Importing the modules.

In [99]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Creating the get_url function to build the Cars24 search URL for a brand and city.

In [100]:
def get_url(brand, location):
    """Build the Cars24 used-car listing URL for a brand in a supported city.

    Parameters
    ----------
    brand : str
        Car make to filter on (e.g. "Mahindra"). It is lower-cased, stripped of
        surrounding whitespace and percent-encoded before being placed in the
        query string, so multi-word or punctuated names cannot corrupt the URL.
    location : str
        City name. Must be one of the keys of the ``locations`` mapping below;
        the comparison ignores case and surrounding whitespace.

    Returns
    -------
    str
        The listing URL with the make filter and ``storeCityId`` filled in.

    Raises
    ------
    ValueError
        If ``location`` is not one of the supported cities.
    """
    # Function-scope import so this cell does not depend on another cell's imports.
    from urllib.parse import quote

    # Normalise the brand and make it safe to embed in a query string.
    brand = quote(brand.strip().lower(), safe='')
    # Normalise the location for a case-insensitive, whitespace-tolerant lookup.
    location_lower = location.strip().lower()

    # Dictionary mapping location names to their IDs
    locations = {
        'delhi ncr': 1,
        'new delhi': 2,
        'mumbai': 2378,
        'bangalore': 5709,
        'hyderabad': 3686,
        'ahmedabad': 1692,
        'gurgaon': 5,
        'chennai': 5732,
        'pune': 2423,
        'noida': 134,
        'ghaziabad': 132,
        'lucknow': 290,
        'jaipur': 2130,
        'kolkata': 777,
        'kochi': 6356,
        'indore': 2920
    }

    # Reject unknown cities early instead of building a URL with a bogus ID.
    if location_lower not in locations:
        raise ValueError(f"Location '{location}' is not supported.")
    location_id = locations[location_lower]

    # "%3A%3D%3A" is the percent-encoded form of ":=:" between "make" and the brand.
    base_url = f'https://www.cars24.com/buy-used-car?f=make%3A%3D%3A{brand}&sort=bestmatch&serveWarrantyCount=true&listingSource=TabFilter&storeCityId={location_id}'

    return base_url


## Creating the data_extraction function to extract the details of the cars.

In [101]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
import time

In [102]:
def _scroll_to_bottom(driver, pause, max_scrolls=None):
    """Scroll down until the page height stops growing or max_scrolls is reached."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # Jump to the current bottom so the page can load more content.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give newly requested content time to render.
        time.sleep(pause)

        # An unchanged height means nothing more was loaded.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        scrolls_done += 1


def _split_title(title):
    """Split a card title into (first word, remaining words); (None, None) if blank."""
    parts = title.split()
    if not parts:
        return None, None
    return parts[0], " ".join(parts[1:])


def _element_texts(driver, css_selector):
    """Return the visible text of every element matching css_selector."""
    return [elem.text for elem in driver.find_elements(By.CSS_SELECTOR, css_selector)]


def data_extraction(brand, location, scroll_pause=5, max_scrolls=None):
    """Scrape the car listings for a brand and city into a dict of columns.

    Opens the URL from ``get_url(brand, location)`` in Chrome, scrolls until the
    page stops growing, then reads the listing fields by CSS selector.

    Parameters
    ----------
    brand : str
        Car make, passed through to ``get_url``.
    location : str
        City name, passed through to ``get_url``.
    scroll_pause : float, optional
        Seconds to wait after each scroll before re-measuring the page height.
        Defaults to 5.
    max_scrolls : int or None, optional
        Upper bound on the number of scrolls. ``None`` (the default) keeps
        scrolling until the page height stops changing.

    Returns
    -------
    dict or None
        A mapping with the keys 'Car Model', 'Year of Manufacture',
        'Distance Traveled', 'Fuel Type', 'Transmission Type' and 'Prices',
        each holding a list of strings with one entry per listing. A listing
        whose title is blank gets ``None`` for model and year. Returns ``None``
        if anything fails; the error is printed rather than raised.
    """
    driver = None
    try:
        url = get_url(brand, location)

        # Set up the Chrome WebDriver
        driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

        # Open the URL and load as many listings as the page will provide.
        driver.get(url)
        _scroll_to_bottom(driver, scroll_pause, max_scrolls)

        # The first word of each title is stored as the year, the rest as the model.
        years = []
        cars = []
        for title in _element_texts(driver, "h3._11dVb"):
            year, model = _split_title(title)
            years.append(year)
            cars.append(model)

        # Items 1, 3 and 5 of each spec list are read as distance, fuel and
        # transmission; presumably the even items are separators - TODO confirm.
        dist = _element_texts(driver, "div._2YB7p ul li:nth-child(1)")
        fuel = _element_texts(driver, "div._2YB7p ul li:nth-child(3)")
        trans = _element_texts(driver, "div._2YB7p ul li:nth-child(5)")

        # Extract prices
        prices = _element_texts(driver, "strong._3RL-I")

        # Create a dictionary to store the extracted data
        scraped_data = {
            'Car Model': cars,
            'Year of Manufacture': years,
            'Distance Traveled': dist,
            'Fuel Type': fuel,
            'Transmission Type': trans,
            'Prices': prices
        }

        # Each column comes from an independent selector, so a missing field on
        # one card would shift every later row; treat unequal lengths as an error.
        lengths = {column: len(values) for column, values in scraped_data.items()}
        if len(set(lengths.values())) > 1:
            raise ValueError(f"Scraped columns have different lengths: {lengths}")

        return scraped_data

    except Exception as e:
        print(f"An error occurred for {brand} in {location}: {e}")
        return None

    finally:
        # Always release the browser, on success and on failure alike.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"Could not close the browser cleanly: {quit_error}")



In [103]:
# Example usage: scrape Mahindra listings for Mumbai and load them into a DataFrame.
brand = "Mahindra"
location = "mumbai"
data = data_extraction(brand, location)

# data_extraction returns None when scraping fails; stop here rather than
# silently building (and later saving) an empty DataFrame.
if data is None:
    raise RuntimeError(f"Scraping failed for {brand} in {location}; see the error printed above.")

df = pd.DataFrame(data)

In [104]:
# Display the scraped listings held in df.
df

Unnamed: 0,Car Model,Year of Manufacture,Distance Traveled,Fuel Type,Transmission Type,Prices
0,Mahindra XUV300 W6 1.5 DIESEL,2022,"16,454 KM",DIESEL,MANUAL,₹10.45 Lakh
1,Mahindra XUV300 W8 (O) 1.2 PETROL,2019,"69,558 KM",PETROL,MANUAL,₹7.71 Lakh
2,Mahindra XUV300 W6 1.2 PETROL AMT,2022,"13,010 KM",PETROL,AUTOMATIC,₹9.62 Lakh
3,Mahindra XUV300 W8 1.5 DIESEL,2019,"80,839 KM",DIESEL,MANUAL,₹8.28 Lakh
4,Mahindra XUV500 W10 AT,2017,"57,449 KM",DIESEL,AUTOMATIC,₹11.38 Lakh
5,Mahindra XUV300 W6 1.5 DIESEL AMT,2021,"38,524 KM",DIESEL,AUTOMATIC,₹9.28 Lakh
6,Mahindra XUV500 W10 AT,2017,"84,784 KM",DIESEL,AUTOMATIC,₹10.22 Lakh
7,Mahindra XUV300 W8 1.5 DIESEL,2019,"51,938 KM",DIESEL,MANUAL,₹8.84 Lakh
8,Mahindra XUV300 W6 1.2 PETROL,2019,"39,094 KM",PETROL,MANUAL,₹7.89 Lakh
9,Mahindra XUV300 W8 (O) 1.2 PETROL AMT,2022,"20,405 KM",PETROL,AUTOMATIC,₹11.27 Lakh


## Cleaning the dataset df

In [105]:
# Count the missing values in each column of df.
df.isna().sum()

Car Model              0
Year of Manufacture    0
Distance Traveled      0
Fuel Type              0
Transmission Type      0
Prices                 0
dtype: int64

In [106]:
# The null counts above are zero for every column, so the df dataset has no missing
# (null) values and no rows need to be dropped or imputed.

## Storing the data in a CSV file

In [107]:
# Write the listings to a CSV file, leaving out the positional row index.
df.to_csv(path_or_buf="Car24_miniproject.csv", index=False)