In [14]:
from selenium import webdriver
from selenium.webdriver import Chrome, ChromeOptions
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import time
import os
import datetime
from glob import glob
from tqdm import tqdm
import re

# Date stamp (yymmdd) embedded in the names of the CSV files this notebook writes.
run_date = datetime.date.today()
today = run_date.strftime("%y%m%d")

# Column order of a single scraped listing row.
cols = [
    'title',
    'fuel_type',
    'mileage',
    'power',
    'origin',
    'city',
    'region',
    'reg_month',
    'reg_year',
    'garanty',
    'price_neg',
    'price',
    'link',
]

In [2]:
def get_driver(driver_path="C:\\chromedriver_win32\\chromedriver.exe"):
    """Start a Chrome WebDriver session configured for scraping.

    The browser is launched in incognito mode with the image content-setting
    preference set to 2 (understood to mean "block images" - confirm against
    the Chrome preference reference).

    Parameters
    ----------
    driver_path : str, optional
        Location of the chromedriver executable. Defaults to the path that
        was previously hard-coded, so existing ``get_driver()`` calls behave
        as before.

    Returns
    -------
    selenium.webdriver.Chrome
        The running driver. The caller is responsible for shutting it down.
    """
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options = ChromeOptions()
    chrome_options.add_experimental_option("prefs", prefs)
    chrome_options.add_argument("--incognito")

    # NOTE(review): passing the executable path positionally is the legacy
    # Selenium call style; newer Selenium releases expect a Service object
    # instead - confirm against the installed Selenium version.
    return Chrome(driver_path, options=chrome_options)

In [None]:
wait_time = 0     # seconds to pause after each page load and again before the next page
max_pages = 500   # hard cap on the number of result pages requested
n_cars = 1000     # provisional total; replaced by the site's own counter on page 1


def _first_text(parent, *args, **kwargs):
    """Return the stripped text of the first node matched by ``parent.find``.

    Returns '' when ``parent`` is None or nothing matches, so an optional
    field that is absent from a listing never raises.
    """
    node = parent.find(*args, **kwargs) if parent is not None else None
    return node.text.strip() if node is not None else ''


def _to_float(text, unit=''):
    """Parse text such as '120 000 km' into a float, ignoring ``unit`` and spaces.

    Returns NaN when the text is empty or is not a valid number.
    """
    if unit:
        text = text.replace(unit, '')
    try:
        return float(text.replace(' ', ''))
    except ValueError:
        return np.nan


rows = []   # one list per listing, values in `cols` order
n = 0       # listings parsed so far
pn = 1      # result page currently being requested

driver = get_driver()
try:
    # Stop as soon as EITHER limit is reached. The previous `|` only stopped
    # once BOTH were exceeded, so the page cap never actually bounded the loop.
    while n < n_cars and pn <= max_pages:
        url = 'https://www.standvirtual.com/carros/?search%5Border%5D=created_at%3Adesc&page='+str(pn)
        driver.get(url)

        time.sleep(wait_time)

        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        page = soup.findAll('div', attrs={'class':'offer-item__content ds-details-container'})

        if pn == 1:
            # The counter text carries surrounding characters and spaces;
            # keep only its digits. If it is missing, keep the provisional total.
            counter_digits = re.sub(r'\D', '', _first_text(soup, 'span', attrs={'class':'counter'}))
            if counter_digits:
                n_cars = int(counter_digits)

        if not page:
            # No listing containers on this page: nothing more to collect.
            print('No listings found on page %d - stopping.'%(pn))
            break

        for p in page:

            # Title and link come from the same anchor; skip listings without one.
            title_link = p.find('a', attrs={'class':'offer-title__link'})
            if title_link is None:
                continue
            title = title_link.get('title', '').strip()
            link = title_link.get('href', '')

            # Price (decimal comma converted to a decimal point)
            price_box = p.find('span', attrs={'class':'offer-price__number ds-price-number'})
            price = _to_float(_first_text(price_box, 'span').replace(',', '.'))

            # Price negotiable
            price_neg = _first_text(p, 'span', attrs={'class':'offer-price__details ds-price-complement'})

            ## Car parameters
            car_params = p.find('ul', attrs={'class':'ds-params-block'})

            fuel_type = _first_text(car_params, attrs={'data-code':'fuel_type'})
            reg_month = _first_text(car_params, attrs={'data-code':'first_registration_month'})
            reg_year = _to_float(_first_text(car_params, attrs={'data-code':'first_registration_year'}))
            mileage = _to_float(_first_text(car_params, attrs={'data-code':'mileage'}), 'km')
            power = _to_float(_first_text(car_params, attrs={'data-code':'power'}), 'cv')

            # City
            city = _first_text(p, 'span', attrs={'class':'ds-location-city'})

            # Region: drop the first and last character of the text
            region = _first_text(p, 'span', attrs={'class':'ds-location-region'})[1:-1]

            # Warranty flag (column name 'garanty' kept as-is): 1 when the tag is present
            garanty = int(p.find('span', attrs={'class':'tag ds-tag'}) is not None)

            # Origin: second space-separated word of the text. When there is
            # only one word, keep it rather than raising IndexError.
            # NOTE(review): confirm this fallback suits the site's wording.
            origin_words = _first_text(p, attrs={'data-code':'origin'}).split(' ')
            origin = origin_words[1] if len(origin_words) > 1 else origin_words[0]

            rows.append([title, fuel_type, mileage, power, origin, city, region, reg_month, reg_year, garanty, price_neg, price, link])

            if n%1000 == 0:
                print('Car %d/%d (%d %%)'%(n,n_cars,n/max(n_cars, 1)*100))

            n += 1

        pn += 1

        time.sleep(wait_time)
finally:
    # quit() ends the whole WebDriver session, even when the loop raised.
    driver.quit()
    # Build the frame once from the accumulated rows (row-by-row
    # DataFrame.append is quadratic and no longer exists in pandas 2.x).
    # Doing it here keeps partial results if the scrape is interrupted.
    df0 = pd.DataFrame(rows, columns=cols)

In [None]:
# Remove exact duplicate rows and renumber the index so that row labels
# equal row positions again.
df0 = df0.drop_duplicates().reset_index(drop=True)

# to_csv does not create missing directories, so make sure data/ exists first.
os.makedirs('data', exist_ok=True)
df0.to_csv('data/standvirtual_cars_%s.csv'%(today), index=False)

## Add features

In [3]:
# glob() returns matches in arbitrary order, so sort before taking the last
# one. The date stamp in the file name is yymmdd, so lexicographic order is
# chronological. Note that the pattern also matches the
# 'standvirtual_cars_updated_*' files written further down, and those sort
# after the plain ones ('u' follows the digits).
# NOTE(review): presumably preferring an "updated" file is intended, so an
# interrupted feature scrape can be resumed - confirm.
csv_candidates = sorted(glob('data/standvirtual_cars_*.csv'))
if not csv_candidates:
    raise FileNotFoundError("No file matching 'data/standvirtual_cars_*.csv' was found")
df0 = pd.read_csv(csv_candidates[-1])

  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
# Rows for which every column after the basic listing columns is empty still
# need their detail page scraped. Index labels (not positions) are collected
# because the loop addresses rows through the label-based `.at` accessor.
needs_details = ~df0[df0.columns[len(cols):]].any(axis=1)
nix = df0.index[needs_details]

driver = get_driver()
try:
    for n, i in enumerate(tqdm(nix, total=len(nix))):
        url = df0.at[i,'link']
        driver.get(url)

        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        for feat in soup.findAll('li', attrs={'class':'offer-params__item'}):
            label_node = feat.find(attrs={'class':'offer-params__label'})
            value_node = feat.find(attrs={'class':'offer-params__value'})
            if label_node is None or value_node is None:
                # Incomplete parameter entry: skip it instead of aborting the run.
                continue

            feat_name = label_node.text.strip()
            if feat_name not in df0:
                # First time this feature is seen: add it as a new, empty column.
                df0[feat_name] = ''
            df0.at[i,feat_name] = value_node.text.strip()

        # Checkpoint every 100 listings so a crash loses little work.
        if n%100 == 0:
            df0.to_csv('data/standvirtual_cars_updated_%s.csv'%(today), index=False)
finally:
    # Always end the browser session and persist whatever was collected,
    # including when the loop stops early on an error.
    driver.quit()
    df0.to_csv('data/standvirtual_cars_updated_%s.csv'%(today), index=False)

100%|██████████████████████████████████████████████████████████████████████████| 12080/12080 [8:22:02<00:00,  2.49s/it]


In [29]:
# Write the feature-enriched table to the dated "updated" CSV, without the index column.
updated_csv_path = 'data/standvirtual_cars_updated_%s.csv' % today
df0.to_csv(updated_csv_path, index=False)