# Notebook Context

This is 2 of 6 Jupyter Notebooks associated with the used car project.

This Notebook covers web scraping of the data to be used in the project from [carsized.com](https://www.carsized.com/en/). This data was used in the second version of the modelling, described in notebook [06_Modelling_version2](https://github.com/rgdavies92/used-car-value/blob/main/06_Modelling_version2.ipynb), and was found to improve model performance!

In [1]:
# Import packages 

import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import itertools

In [2]:
# Scrape the list of cars on carsized.com. This is the pre-scrape.

# I do this because I need to have a list of all different cars available on carsized.com before they
# can be scraped. This is like a pre-scrape before I scrape the dimensions for each of them.
carurl = []
# A small RegEx pattern to be used later
pattern = re.compile(r'https://www.carsized.com/en/cars[a-z-0-9]+')
url = "https://www.carsized.com/en/cars"
# This scrape uses Requests rather than Cloudscraper. No problems encountered.
r = requests.get(url)

# Check that the website responds well
if str(r) == "<Response [200]>":
    # Use BeautifulSoup to parse the HTML into some soup
    soup = BeautifulSoup(r.text, 'html.parser')
    # Obtain a list of all articles (cars) on this carsized.com site
    articles = soup.find_all("div", attrs={"class":"indexcontainer"})
    
    # Iterate over each car in the list and extract the important bit of the URL for that car, then append 
    # to carurl list
    for car in articles:
        y = re.findall(pattern, str(car))
        curl = (str(y).replace('https://www.carsized.com/en/cars',''))
        # Some small string cleaning before appending to the list
        carurl.append(curl.replace('[','').replace(']','').replace("'",""))
        

In [3]:
# Scrape the contents of carsized.com. This is the full web scrape of dimensions for each car documented.

# In order to obtain the car dimensions in a scrapable format I have to review the comparison of the desired
# car with some other car. For this purpose, I'm comparing every car to the 2008 Abarth 500. Completely arbitrary.

# Initialise some empty lists to hold the outputs
results = []
name=[]
wheelbase=[]
length=[]
width=[]
height=[]
groundclearance=[]
cargovol=[]
maxcargovol=[]

# Base url with the 2008 Abarth 500 for comparison.
site = "https://www.carsized.com/en/cars/compare/abarth-500-2008-3-door-hatchback-vs-"

# Iterate over cars in my pre-scraped list
for car in carurl:
    # Make an ampty holder for all attributes of the car being iterated over. This will contain data points
    # not worth scraping, as well as the ones that are worth scraping.
    allatts=[]
    # Use f-string to combine the base url and the car being iterated over. Then Requests again to return the HTML
    # Important to standardise all units when gathering this data! I'm going metric. 
    url = f"{site}{car}/?&units=metric"
    r = requests.get(url)

    # Check that the site responds well before proceeding
    if str(r) == "<Response [200]>":
        # Use BeautifulSoup to parse the HTML
        soup = BeautifulSoup(r.text, 'html.parser')
        # The meat I'm interested in is saved in a matrix-like table. I'm going to call this an article 
        articles = soup.find_all("div", attrs={"class":["dmatrixtitle","dmatrixtitlesup"]})
        
        # Extract the text from each entry in the matrix-like table
        for carsoup in articles:
            allatts.append(carsoup.text)
        
        # Extract important measurements from the allatts list and add them to the initialised holding lists.
        name.append(car)
        wheelbase.append(allatts[2])
        length.append(allatts[5])
        width.append(allatts[8])
        height.append(allatts[14])
        groundclearance.append(allatts[17])
        cargovol.append(allatts[20])
        maxcargovol.append(allatts[23])
    
    # Save the entire allatts list to the results list as a redundancy measure in case any were missed.
    results.append(allatts)

# Form a dataframe of the extracted measurements for each car.
df = pd.DataFrame({'name': name,'wheelbase_cm':wheelbase,'length_cm': length, 'width_cm':width,
                  'height_cm':height, 'ground_clearance_cm':groundclearance, 
                   'cargo_volume_L':cargovol, 'max_cargo_volume_L':maxcargovol})

In [4]:
# Convert continuous variables to float

# A simple function to tidy up units on a variety of columns
def remove_units(x):
    x=x.replace('cm','')
    x=x.replace('l','')
    x=x.replace('--','0')
    x=x.strip()
    x=float(x)
    return x

# Clean up units on extracted data
df.wheelbase_cm = df.wheelbase_cm.apply(lambda x: remove_units(x))
df.length_cm = df.length_cm.apply(lambda x: remove_units(x))
df.width_cm = df.width_cm.apply(lambda x: remove_units(x))
df.height_cm = df.height_cm.apply(lambda x: remove_units(x))
df.ground_clearance_cm = df.ground_clearance_cm.apply(lambda x: remove_units(x))
df.cargo_volume_L = df.cargo_volume_L.apply(lambda x: remove_units(x))
df.max_cargo_volume_L = df.max_cargo_volume_L.apply(lambda x: remove_units(x))


In [5]:
# Make make column - I should probably have called this brand but it's too late now. 

# This has been pasted from a .unique call in anorther cell.
makes = ['Abarth','Aixam','Alfa-Romeo','Alpine','Ariel','Aston-Martin','Audi','Austin',
        'BMW','Beauford','Bentley','Bowler','Bugatti','Buick',
        'CUPRA','Cadillac','Carbodies','Caterham','Chesil','Chevrolet','Chrysler','Citroen','Corvette',
        'DAF','DFSK','DS Automobiles','Dacia','Daewoo','Daihatsu','Daimler','Datsun','Delorean','Dodge',
        'Ferrari','Fiat','Ford','GMC','Great Wall','Infinity',
        'Hillman','Honda','Hummer','Hyundai','Infiniti','Isuzu','Iveco','Jaguar','Jeep','Jensen','KIA',
        'LEVC','Lada','Lamborghini','Lancia','Land Rover','Lexus','Lincoln','London Taxis International','Lotus',
        'MG','MINI','Mahindra','Maserati','Maybach','Mazda','McLaren','Mercedes-Benz','Microcar','Mitsubishi',
        'Mitsuoka','Morgan','Morris','Nissan','Noble','Opel','Packard','Perodua','Peugeot','Pilgrim','Polestar',
        'Pontiac','Porsche','Proton','REO','Radical','Rage','Raptor','Reliant','Renault','Replica','Reva','Riley',
        'Rolls-Royce','Rover','SEAT','SKODA','Saab','Sebring','Singer','Smart','Spyker','Ssangyong','Subaru',
        'Sunbeam','Suzuki','TVR','Tesla','Tiger','Toyota','Triumph','Ultima','Vauxhall','Venturi','Volkswagen',
        'Volvo','Westfield','Wolseley','Yamaha','Zenos','Alvis','Excalibur','Fisker','Genesis','Ssang-yong',
        'AC','BAC']

df['make']=''

# Try to standardise car makes between the AutoTrader data and Carsized. If the Carsized make matches the
# AutoTrader make then use the AutoTrader one.
for index, car in df.iterrows():
    for make in makes:
        if make.lower()+'-' in str(car[0]):
            df.loc[index,'make'] = make

In [6]:
# Make year column using RegEx

# Set all years to 0
df['year'] = '[0]'

# Extract a year from the car name from Carsized. Note that I don't care about cars with years from a long
# time ago so was able to limit the RegEx expression. Insert NaN if no year. 
for index, car in df.iterrows():
    if len(re.findall(r"[12][09][0-9][0-9]", car[0]))>0:
        df.loc[index,'year'] = int(re.findall(r"[12][09][0-9][0-9]", car[0])[0])
    else:
        df.loc[index,'year'] = np.nan


In [7]:
# Make body column 

# List of Bodies from both AutoTrader and Carsized. Note that there is UK and US english being used here!
bodies = ['Hatchback', 'SUV', 'Van', 'Saloon', 'Convertible', 'MPV', 'Coupe',
       'Estate', 'Pickup', 'Sedan', 'Cabriolet','Liftback','Roadster','Offroader','Fastback','double-cab','pick-up']

# Make the body column empty
df['body']=''

# If any of these bodies above are in the car name, then assing the UK enlish equivalent to the df.body column.
for index, car in df.iterrows():
    for body in bodies:
        if body.lower() in str(car[0]):
            df.loc[index,'body'] = body
        if df.loc[index,'body'] == 'Sedan':
            df.loc[index,'body'] = 'Saloon'
        if df.loc[index,'body'] == 'Cabriolet':
            df.loc[index,'body'] = 'Convertible'
        if df.loc[index,'body'] == 'Roadster':
            df.loc[index,'body'] = 'Convertible'
        if df.loc[index,'body'] == 'Liftback':
            df.loc[index,'body'] = 'Hatchback'
        if df.loc[index,'body'] == 'Offroader':
            df.loc[index,'body'] = 'SUV'
        if df.loc[index,'body'] == 'Fastback':
            df.loc[index,'body'] = 'Coupe'
        if df.loc[index,'body'] == 'double-cab':
            df.loc[index,'body'] = 'Pickup'
        if df.loc[index,'body'] == 'pick-up':
            df.loc[index,'body'] = 'Pickup'

In [8]:
# Make name column

# Save off the original name and initialise a new blank name column
df['original_name'] = df.name
df['name'] = ''

# Start from the original_name and make some small string adjustments and translations to finish with a UK enlgish
# name column which will correspond with the AutoTrader data
for index, car in df.iterrows():
    n = df.loc[index,'original_name']
    n = n.split(str(df.loc[index,'year']))[0]
    n = n.replace('-',' ')
    n = re.sub('alfa romeo','Alfa-Romeo',n)
    n = re.sub('aston martin','Aston-Martin',n)
    n = re.sub('rolls royce','Rolls-Royce',n)
    n = re.sub('mercedes benz','Mercedes-Benz',n)    
    n = re.sub('opel','Vauxhall',n)
    df.loc[index,'name']=n.title().strip().lower()

# A small number of manual fixes for cars with digits in their names which look like years!    
df.loc[756,'name'] = 'peugeot 1007'
df.loc[758,'name'] = 'peugeot 2008'
df.loc[759,'name'] = 'peugeot 2008'

In [9]:
# Write to csv - note that this data aren't ready for merging yet

# abspath = r'merge_data/01_carsized_dims.csv'
# df.to_csv(abspath, index=False, header=df.columns )

In [10]:
# Format the csv with ffill/bfill to be able to merge with cars dataframe on name, body, year 

# Make df with NaN where no volumetric data available for each year 2000-2022
carbodies = df.groupby(by=['name','body']).mean().reset_index()[['name','body']]

# Initialise some empty lists
name=[]
body=[]
year=[]

# For evert car name and car body combination, create 23 rows, one for each year between 2000 and 2022.
for index,row in carbodies.iterrows():
    for yr in range(2000,2023):
        name.append(row[0])
        body.append(row[1])
        year.append(yr)

# Make this into a df - still missing the measurments here        
allyeardf = pd.DataFrame({'name':name,'body':body,'year':year})

# The mesurements to be added to this dataframe
toadd = ['wheelbase_cm', 'length_cm', 'width_cm', 'height_cm',
       'ground_clearance_cm', 'cargo_volume_L', 'max_cargo_volume_L']

# Add columns of NaN for all elements of toadd
for name in toadd:
    allyeardf[name]=np.nan 
    
#Reorder the original df to match output of groupby function later
df1 = df[['name','body','year','wheelbase_cm','length_cm','width_cm','height_cm',
               'ground_clearance_cm','cargo_volume_L','max_cargo_volume_L']]

#If df1 has an entry where allyeardf is empty, take the entry with data! Makes df_tofill with ffill/bfill.
results=[]
for index, row in allyeardf.iterrows():
    name = row[0]
    body = row[1]
    year = row[2]
    realcar=df1[(df1.name==name) & (df1.body==body) & (df1.year==year)]
    if len(realcar==1):
        results.append(realcar.iloc[0,0:10])
    else:
        results.append(row)

df_tofill = pd.DataFrame(results, columns = allyeardf.columns)

#Ffill and bfill, improtantly ffill first
for col in toadd:
    df_tofill[col] = df_tofill.groupby(by=['name','body'])[col].transform(lambda x: x.ffill())
    df_tofill[col] = df_tofill.groupby(by=['name','body'])[col].transform(lambda x: x.bfill())
    
dfmerge=df_tofill.copy()

In [11]:
# Write to csv

# abspath = r'merge_data/02_carsized_dims_for_merge.csv'
# dfmerge.to_csv(abspath, index=False, header=dfmerge.columns )

In [12]:
# abspath = r'merge_data/02_carsized_dims_for_merge.csv'
# cars_size_to_merge = pd.read_csv(abspath)
# print('all',cars_size_to_merge.shape)

In [13]:
dfmerge.head(50)

Unnamed: 0,name,body,year,wheelbase_cm,length_cm,width_cm,height_cm,ground_clearance_cm,cargo_volume_L,max_cargo_volume_L
0,abarth 500,Hatchback,2000,230.0,365.7,162.7,148.5,10.4,185.0,610.0
1,abarth 500,Hatchback,2001,230.0,365.7,162.7,148.5,10.4,185.0,610.0
2,abarth 500,Hatchback,2002,230.0,365.7,162.7,148.5,10.4,185.0,610.0
3,abarth 500,Hatchback,2003,230.0,365.7,162.7,148.5,10.4,185.0,610.0
4,abarth 500,Hatchback,2004,230.0,365.7,162.7,148.5,10.4,185.0,610.0
5,abarth 500,Hatchback,2005,230.0,365.7,162.7,148.5,10.4,185.0,610.0
6,abarth 500,Hatchback,2006,230.0,365.7,162.7,148.5,10.4,185.0,610.0
7,abarth 500,Hatchback,2007,230.0,365.7,162.7,148.5,10.4,185.0,610.0
0,abarth 500,Hatchback,2008,230.0,365.7,162.7,148.5,10.4,185.0,610.0
9,abarth 500,Hatchback,2009,230.0,365.7,162.7,148.5,10.4,185.0,610.0
