### Outline Project 

* Site is https://www.cars24.com
* Get the list of city names. For each city, get the city title and the city page URL
* For each city, get the URLs of the top 20 used cars from each page
* For each car, we'll take
    - car name, brand name, history, kms driven
    - registration, year of purchase, owner, fuel
    - transmission, insurance
* For each city, create a CSV file


#### import libraries for scraping 

In [191]:
!pip install requests bs4 -q

In [192]:
import requests 
from bs4 import BeautifulSoup 

#### Downloading web pages with the `requests.get()` function

In [213]:
# Hard-coded names of the cities to scrape. Casing and spacing are free-form
# here: gather_cities_url() trims, hyphenates and lower-cases every name.

cities = ["New delhi", "noida", "gurgaon", "pune",
         "bengaluru", "hyderabad", "chennai", "kolkata",
         "Ahmedabad"]

In [194]:
def gather_cities_url(cities, pages=5):
    """Build the listing-page URLs for every city.

    Each city name is trimmed, its spaces are replaced with '-' and it is
    lower-cased. The resulting slug is used both as the dictionary key and
    inside the URL. Pages are numbered from 1 up to and including `pages`.

    Returns a dictionary mapping city slug -> list of page URLs.
    """
    slugs = (name.strip().replace(" ", "-").lower() for name in cities)

    return {
        slug: [
            f"https://www.cars24.com/buy-used-cars-{slug}/?page={page_no}"
            for page_no in range(1, pages + 1)
        ]
        for slug in slugs
    }

In [195]:
def get_city_requests(city_name, cities_url_, timeout=30):
    """Download every listing page of one city.

    Args:
        city_name: City name in any casing/spacing (e.g. "New delhi"). It is
            trimmed, hyphenated and lower-cased before the lookup.
        cities_url_: Dictionary mapping normalised city name -> list of page
            URLs for that city.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        List of `requests` responses, one per page URL, in the same order as
        the URLs stored for the city.

    Raises:
        ValueError: If the normalised city name is not a key of `cities_url_`.
    """
    city_key = city_name.strip().replace(" ", "-").lower()

    # Fail loudly: handing the message back as a plain string would make
    # callers iterate over its characters as if they were responses.
    if city_key not in cities_url_:
        raise ValueError(
            f"Incorrect city name. '{city_name}' city is not in cities_url."
        )

    # Without a timeout a stalled connection blocks the whole scrape forever.
    return [
        requests.get(page_url, timeout=timeout)
        for page_url in cities_url_[city_key]
    ]

In [196]:
def city_bs4_object(city_name, citiesUrl):
    """Parse every downloaded listing page of a city with BeautifulSoup.

    The pages are obtained from `get_city_requests(city_name, citiesUrl)`;
    the text of each result is parsed with the 'lxml' parser.

    Returns a list of BeautifulSoup objects, one per downloaded page.
    """
    page_soups = []
    for response in get_city_requests(city_name, citiesUrl):
        page_soups.append(BeautifulSoup(response.text, 'lxml'))
    return page_soups

In [202]:

def car_page_link(city_BS_Object):
    """Download and parse the page behind every car link on the listing pages.

    `city_BS_Object` is a list of parsed listing pages. Every `<a>` tag with
    class '_9Ue0B' is treated as a car link; its `href` is requested and the
    response text is parsed with the 'lxml' parser.

    Returns a list of BeautifulSoup objects, one per car link.
    """
    # Step 1: gather the car anchors of all listing pages, page by page.
    anchors = []
    for listing in city_BS_Object:
        anchors.extend(listing.find_all('a', class_='_9Ue0B'))

    # Step 2: pull the target URL out of each anchor.
    links = [anchor['href'] for anchor in anchors]

    # Step 3: download all car pages first ...
    responses = []
    for link in links:
        responses.append(requests.get(link))

    # Step 4: ... then parse them.
    return [BeautifulSoup(response.text, 'lxml') for response in responses]

In [205]:
def car_specification(car_res):
    """Collect the scraped details of every car page into column lists.

    Args:
        car_res: Iterable of parsed car pages (objects offering the
            BeautifulSoup `find` / `find_all` API).

    Returns:
        Dictionary mapping column name -> list of values. Every list has one
        entry per car that has both a price and a model, so all columns stay
        the same length. A feature that a car page does not list is recorded
        as None.

    Notes:
        * Pages without a price or a model tag are skipped.
        * Feature labels outside the fixed column set are ignored instead of
          raising KeyError.
        * If a label occurs more than once on a page, the first value wins.
    """
    feature_columns = (
        "History", "Kilometers Driven", "Transmission", "Insurance",
        "Owner", "Fuel Type", "Registration", "Year of Purchase",
        "Last Service",
    )
    dataset = {
        column: [] for column in ("Car Model", "Price", *feature_columns)
    }

    for each_car in car_res:
        price_tag = each_car.find('strong', class_='_2yYvS')
        model_tag = each_car.find('h2', class_='_2geSF')
        if price_tag is None or model_tag is None:
            continue

        # Gather this car's features first and append afterwards, so that a
        # malformed page can never leave the columns with different lengths.
        features = {}
        for each_tag in each_car.find_all('li', class_='tHlIu'):
            label_tag = each_tag.find('label', class_='_1Q_nE')
            value_tag = each_tag.find('strong', class_='_1-PH2')
            if label_tag is None or value_tag is None:
                continue
            feature = label_tag.text.strip()
            if feature in feature_columns and feature not in features:
                features[feature] = value_tag.text.strip()

        # Keep only what follows the rupee sign, e.g. "₹5,00,000" -> "5,00,000".
        dataset["Price"].append(price_tag.text.split('₹')[-1])
        dataset["Car Model"].append(model_tag.text)
        for column in feature_columns:
            dataset[column].append(features.get(column))

    return dataset


In [222]:
# Build the URLs of the first 20 listing pages for every city in `cities`.
cities_urls = gather_cities_url(cities, pages=20)

In [265]:
# Download and parse all listing pages stored for Kolkata.
city_soup = city_bs4_object('kolkata', cities_urls)

In [266]:
# Fetch and parse the page of every car linked from the Kolkata listing pages.
# NOTE(review): the `abd_` prefix does not match the city scraped here
# ('kolkata') - presumably left over from an earlier run; confirm.
abd_car_pages = car_page_link(city_soup)

In [256]:
# NOTE(review): `car_pages` is not assigned in any cell shown here (the parsed
# car pages are stored in `abd_car_pages`) - presumably a variable from an
# earlier notebook run; confirm before re-executing this cell.
len(car_pages)

6

In [267]:
# Turn the parsed car pages into a dictionary of column name -> list of values.
dataset = car_specification(abd_car_pages)

#### Converting the dictionary into a DataFrame

In [258]:
import pandas as pd 

In [268]:
# One DataFrame column per dictionary key, one row per scraped car.
df_sample = pd.DataFrame(dataset)

In [269]:
# Show the (rows, columns) size of the scraped data.
df_sample.shape

(285, 11)

In [270]:
# Save the scraped data to "Kolkata.csv" in the current working directory.
# With pandas' default settings the row index is written as the first,
# unnamed column; pass index=False if that column is not wanted.
df_sample.to_csv("Kolkata.csv")