In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup 

import datetime
import re

import pandas as pd



# Extraction 

## Extract information for all cars listed in the GTA from the Kijiji website

### The key car information fields are listed below:

- 'brand'
- 'model'
- 'model_year'
- 'list_price'
- 'color'
- 'configration'
- 'condition'
- 'body_type'
- 'wheel_config'
- 'transmission'
- 'fuel_type'
- 'mileage'
- 'carfax_link'
- 'dealer_address'

#### Example:
Car information link: https://www.kijiji.ca/v-cars-trucks/city-of-toronto/2009-ford-f-150-xlt-super-crew-4x4/1385290163

- brand :'Ford'
- model:'F-150'
- model_year:'2009'
- list_price: '$11,999.00' 
- color: 'Blue'
- configration: 'XLT'
- condition: 'Used'
- body_type: 'Pickup Truck'
- wheel_config: '4 x 4'
- transmission: 'Automatic'
- fuel_type:  'Gasoline'
- mileage: '204,000'
- carfax_link: 'https://www.carproof.com/order?ref=kijiji&vin=1FTRW14819FB42024'
- dealer_address: '2 Castleton Ave unit 3, York, ON, M6N 3Z5'



In [2]:
def get_page_num(url, items_per_page=20):
    """Return the number of result pages behind a search-results URL.

    The page at ``url`` is fetched and the text of its
    ``<div class="showing">`` element is read. The second-to-last
    whitespace-separated token of that text is taken as the total number of
    ads (thousands separators are stripped).

    Parameters
    ----------
    url : str
        Address of the search-results page to inspect.
    items_per_page : int, optional
        Number of ads assumed to be shown on one results page (default 20).

    Returns
    -------
    int
        Total ad count divided by ``items_per_page``, rounded *up* so that a
        partially filled last page is counted, and never less than 1.
        Returns 1 when the counter element is missing or cannot be parsed.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the page cannot be fetched.
    """
    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url) as html:
        bs = BeautifulSoup(html, 'html.parser')

    try:
        page_info = bs.find('div', class_='showing').text
        total_ads = int(page_info.split()[-2].replace(',', ''))
    except (AttributeError, IndexError, ValueError):
        # AttributeError: no "showing" div on the page.
        # IndexError / ValueError: its text is not in the expected shape.
        return 1

    # Ceiling division: truncating here would drop the final, partial page.
    return max(1, (total_ads + items_per_page - 1) // items_per_page)

        
    
    
    
    

In [3]:
# Placeholder stored for any field that is absent from a listing page.
_MISSING = 'na'

# (itemprop value, tag name or None for "any tag") for the text fields that
# precede the CARFAX link in the output row, in output order.
_LEADING_ITEMPROPS = (
    ('brand', None),
    ('model', None),
    ('vehicleModelDate', None),
    ('price', 'span'),
    ('color', None),
    ('vehicleConfiguration', None),
    ('itemCondition', None),
    ('bodyType', None),
    ('driveWheelConfiguration', None),
    ('vehicleTransmission', None),
    ('fuelType', None),
    ('mileageFromOdometer', None),
)

# href patterns tried, in order, when looking for the vehicle-history link.
_CARFAX_HREF_PATTERNS = (
    re.compile('^(https://reports.carproof.com)((?!:).)*$'),
    re.compile('^(https://www.carproof.com)((?!:).)*$'),
)


def _itemprop_text(soup, itemprop, tag_name=None):
    """Return the text of the first element with ``itemprop``, or 'na'."""
    node = soup.find(tag_name, itemprop=itemprop)
    return _MISSING if node is None else node.text


def _carfax_link(soup):
    """Return the href of the first anchor matching a known pattern, or 'na'."""
    for pattern in _CARFAX_HREF_PATTERNS:
        anchor = soup.find('a', href=pattern)
        if anchor is not None:
            return anchor.attrs['href']
    return _MISSING


def get_item_info(url):
    """Scrape one listing page and return its key fields as a flat list.

    Parameters
    ----------
    url : str
        Address of a single listing page.

    Returns
    -------
    list of str
        Fourteen values in this fixed order: brand, model, model year, list
        price, color, configuration, condition, body type, wheel
        configuration, transmission, fuel type, mileage, CARFAX link, dealer
        address. Any field that is not present on the page is ``'na'``.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the page cannot be fetched.
    """
    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(url) as html:
        bs_item = BeautifulSoup(html, 'html.parser')

    item_info_list = [
        _itemprop_text(bs_item, itemprop, tag_name)
        for itemprop, tag_name in _LEADING_ITEMPROPS
    ]
    # The last two fields keep their original positions: link, then address.
    item_info_list.append(_carfax_link(bs_item))
    item_info_list.append(_itemprop_text(bs_item, 'address'))

    return item_info_list
    
    
    

In [4]:
# Three empty lists; nothing in the visible cells reads or fills them.
title, price, itemurl = [], [], []

# Site root, prepended to the relative listing links found on result pages.
base_url = 'https://www.kijiji.ca'
# Landing page of the search whose result pages are crawled.
init_url = 'https://www.kijiji.ca/b-cars-vehicles/city-of-toronto/c27l1700273'

# Page count derived from the landing page.
page_num = get_page_num(init_url)

# Echo the value as the cell output.
page_num



In [5]:
# One row (list of field values) per scraped listing.
all_info_list = []
itemlist = []

# Relative links to individual listings; compiled once instead of per page.
listing_href = re.compile('^(/v-cars-trucks/)((?!:).)*$')

# range() excludes its stop value, so add 1 to reach the last page -- and to
# visit page 1 at all when page_num is 1.
for page in range(1, page_num + 1):
    # NOTE(review): the page segment is built as 'page+<n>'; confirm this is
    # the pagination format the site actually expects.
    page_url = 'https://www.kijiji.ca/b-cars-vehicles/city-of-toronto/'+'page+'+ str(page)+'/c27l1700273'
    # Parse inside the context manager so the HTTP response is always closed.
    with urlopen(page_url) as html:
        bs = BeautifulSoup(html, 'html.parser')

    # Every tag matched by the href filter carries an href attribute.
    for link in bs.find_all('a', href=listing_href):
        item_url = base_url + link.attrs['href']
        # Skip links that carry a query string.
        if '?' in item_url:
            continue
        print(item_url)
        try:
            itemlist = get_item_info(item_url)
        except OSError as err:
            # urllib's URLError/HTTPError derive from OSError; one listing
            # that fails to load should not abort the whole crawl.
            print('skipped (request failed):', err)
            continue
        print(itemlist)
        all_info_list.append(itemlist)
            
    

In [6]:
# Preview the first few scraped rows instead of echoing the entire list.
all_info_list[:5]

In [7]:
# Name the columns at construction time: an empty scrape then still yields a
# frame with the expected 14 columns, whereas a bare DataFrame([]) has none
# and a later 14-name column assignment would fail with a length mismatch.
df = pd.DataFrame(
    all_info_list,
    columns=[
        'brand', 'model', 'model_year', 'list_price', 'color',
        'configration', 'condition', 'body_type', 'wheel_config',
        'transmission', 'fuel_type', 'mileage', 'carfax_link',
        'dealer_address',
    ],
)

# Transformation

## Transform all information into a pandas DataFrame

In [8]:
# Show only the first rows rather than rendering the whole frame.
df.head()

In [9]:
# Column labels, in the same order as the values in each scraped row.
columns_name = [
    'brand',
    'model',
    'model_year',
    'list_price',
    'color',
    'configration',
    'condition',
    'body_type',
    'wheel_config',
    'transmission',
    'fuel_type',
    'mileage',
    'carfax_link',
    'dealer_address',
]


In [11]:
# Replace the frame's column labels with the names in columns_name.
df.columns = pd.Index(columns_name)

In [12]:
# First five rows, now with named columns.
df.head(n=5)

In [13]:
# Write the frame to a CSV file in the working directory.
df.to_csv(path_or_buf='kijiji_car.csv')