In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import datetime
import time
from fake_useragent import UserAgent 
ua = UserAgent()
headers = {'user-agent': ua.random}

#data = requests.get(url, headers=headers)

#### Get Monthly Sales for 2019 & 2020

In [2]:
def get_monthly_sales_df(url, year_str, tbl_num):
    '''
    Return a time series df of monthly US car sales for every make and model 
    '''
    response = requests.get(url, headers=headers)
    page= response.text
    soup = BeautifulSoup(page, "lxml")
    
    #find car sales data and turn it into a list
    tbl_str = soup.find_all('tbody')[tbl_num].text
    tbl_list = tbl_str.split('\n')
    tbl_list = [i for i in tbl_list if i] 
    
    #create empty time series df for specified year
    date_time_str = year_str + '-01'
    start_date = datetime.datetime.strptime(date_time_str, '%Y-%m')
    index = pd.date_range(start_date, periods=12, freq='m')
    df = pd.DataFrame(index=index)
    
    #fill empty df with monthly sales for all makes and models
    col_name = ''
    idx = 0
    for x in range(0,len(tbl_list)//13):
        col_name = tbl_list[x+idx]
        list_vals = []
        for val in range(x+1,x+13):
            list_vals.append(tbl_list[idx+val])
        df[col_name]=list_vals
        idx = idx+12
    return df

new df called monthly_sales_df

In [3]:
url = 'https://www.goodcarbadcar.net/2019-us-vehicle-sales-figures-by-model/' 
df_sales = get_monthly_sales_df(url, '2019', 2)
url = 'https://www.goodcarbadcar.net/2020-us-vehicle-sales-figures-by-model/'
df = get_monthly_sales_df(url, '2020', 1)
monthly_sales_df = pd.concat((df_sales, df))
#monthly_sales_df.columns

#### Get All Model Yearly Sales for 2005-2020

In [4]:
def get_model_sales_df(url, year):
    '''
    Return a df of yearly US car sales for every make and model 
    columns = Model, Year, and Total_Sales
    '''
    #read url page into list of pandas dfs 
    response = requests.get(url, headers=headers)
    page = response.text
    df_list = pd.read_html(page)
    
    #find correct df based on number of data frames on url page and 
    if len(df_list) == 1 or year == '2019':
        df = pd.DataFrame(df_list[0])
    elif len(df_list) == 2:
        df = pd.DataFrame(df_list[1])
    else:
        df = pd.DataFrame(df_list[len(df_list)-1])
    
    #special case for 2020 data because sales data is in monthly sales format for each model
    if year == '2020':  
        soup = BeautifulSoup(page, "lxml")
    
        #find car sales data and turn it into a list
        tbl_str = soup.find_all('tbody')[1].text
        tbl_list = tbl_str.split('\n')
        tbl_list = [i for i in tbl_list if i] 
        
        columns = ('Model', 'Year', 'Total_Sales')
        df = pd.DataFrame(columns=columns)

        #fill empty df with monthly sales for all makes and models
        model_name = ''
        idx = 0
        index = 0
        for x in range(0,len(tbl_list)//13):
            model_name = tbl_list[x+idx]
            year_sum = 0
            for val in range(x+1,x+13):
                month_total = tbl_list[idx+val].replace(',','')
                month_total = int(month_total)
                year_sum = year_sum + month_total
            #df1 = pd.DataFrame([model_name, year_sum], columns = ['Model', year])
            df = df.append(pd.DataFrame({'Model': model_name, 'Year': year, 'Total_Sales': year_sum}, index=[index]), 
                           ignore_index=True)
            index = index+1
            #df.append(df1)
            idx = idx+12
        return df
    
    if year == '2012':
        df = df.iloc[:, [2,3]]
    elif year == '2005':
        df = df.iloc[:, [1,3]]
    elif year in ['2017','2018', '2019', '2020']:
        df = df.iloc[:, [0,4]]
    else:
        df = df.iloc[:, [1,2]]
    df.columns = ['Model', 'Total_Sales']
    df['Year'] = year
    
    #* indicate further breakdown of sum totals, overall totals be removed further down when duplicates are removed
    #other symbols refer to subnotes in the tables and are not apart of model names
    symbols = ['*', '²', '¹', '^', '†', '‡']
    
    for s in symbols:
        df['Model'] = df['Model'].str.replace(s,'')

    df.dropna(subset=['Model'], inplace=True)
    
    df['Model'] = df['Model'].str.rstrip()
    df['Model'] = df['Model'].str.lstrip()
    
    #remove first in set of duplicates b/c first is a sum of a car and the hybrid model
    
    df.drop_duplicates(subset='Model', keep='last', inplace=True)
    
    df['Total_Sales'] = df['Total_Sales'].apply(pd.to_numeric, errors='coerce')
    
    return df

In [5]:
url = 'https://www.goodcarbadcar.net/2006-usa-auto-sales-rankings-by-mode/'
yearly_sales_df= get_model_sales_df(url = url, year = '2005')

  df['Model'] = df['Model'].str.replace(s,'')


In [6]:
url_dict = {'2006': 'https://www.goodcarbadcar.net/2006-usa-auto-sales-rankings-by-mode/',
            '2007': 'https://www.goodcarbadcar.net/usa-2007-vehicle-sales-rankings-by-mode/',
            '2008': 'https://www.goodcarbadcar.net/2008-america-auto-sales-rankings-by-mode/',
            '2009': 'https://www.goodcarbadcar.net/usa-auto-sales-rankings-by-model-2009/',
            '2010': 'https://www.goodcarbadcar.net/2010-america-auto-sales-rankings-by-mode/',
            '2011': 'https://www.goodcarbadcar.net/top-268-best-selling-vehicles-2011-year/',
            '2012': 'https://www.goodcarbadcar.net/2012-usa-auto-sales-rankings-by-model7/',
            '2013': 'https://www.goodcarbadcar.net/usa-vehicle-sales-rankings-by-model-december-2013-year-end/',
            '2014': 'https://www.goodcarbadcar.net/usa-all-cars-sales-figures-2014-december-year-end/',
            '2015': 'https://www.goodcarbadcar.net/usa-car-sales-by-model-2015-year-end-december/',
            '2016': 'https://www.goodcarbadcar.net/usa-2016-vehicle-sales-by-model-manufacturer-brand/',
            '2017': 'https://www.goodcarbadcar.net/december-2017-year-end-u-s-passenger-car-sales-rankings-top-171-best-selling-cars-america-every-car-ranked/',
            '2018': 'https://www.goodcarbadcar.net/december-2018-the-best-selling-vehicles-in-america-every-vehicle-ranked/',
            '2019': 'https://www.goodcarbadcar.net/2019-us-vehicle-sales-figures-by-model/',
            '2020': 'https://www.goodcarbadcar.net/2020-us-vehicle-sales-figures-by-model/'
           }

years = ['2006']
for key, value in url_dict.items():
    #print(key)
    years.append(key)
    df= get_model_sales_df(url = value, year = key)
    #print(df)
    yearly_sales_df = pd.concat([yearly_sales_df, df], axis=0)
    #yearly_sales_df = pd.merge(yearly_sales_df, df, on='Model', how='outer')
    #print(yearly_sales_df[yearly_sales_df.Model.str.contains("Toyota")])

print(yearly_sales_df)

  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')


                     Model Total_Sales  Year
0            Ford F-Series    901463.0  2005
1      Chevrolet Silverado    705980.0  2005
2             Toyota Camry    431703.0  2005
3    Toyota Corolla/Matrix    341290.0  2005
4                Dodge Ram    400543.0  2005
..                     ...         ...   ...
298        Volvo 60-Series       15729  2020
299        Volvo 90-Series        3195  2020
300             Volvo XC40       23778  2020
301             Volvo XC60       32078  2020
302             Volvo XC90       34251  2020

[4518 rows x 3 columns]


In [7]:
# columns = ('Model','Total_Sales')
# total_sales_df = pd.DataFrame(columns=columns)
# total_sales_df = total_sales_df.fillna(0)
# total_sales_df['Model'] = yearly_sales_df['Model']

# total_sales_df['Total_Sales'] = yearly_sales_df[years].sum(axis=1)
# #print(total_sales_df)

In [8]:
# total_sales_df.sort_values('Total_Sales', ascending=False, inplace=True)
yearly_sales_df = yearly_sales_df[yearly_sales_df.Total_Sales != 0]

remove_strings = ["Market",'Total','Family','Brand','Passenger Cars, SUVs, Crossovers','Minivans','Pickup Trucks',
                 'Commercial Vans', 'COMPANY', 'MOTOR', 'GROUP', 'AMERICAN', 'AUTOMOBILES', 'JAGUAR', 'DAIMLER']

for string in remove_strings:
    yearly_sales_df = yearly_sales_df[~yearly_sales_df.Model.str.contains(string)]

#total_sales_df = total_sales_df[total_sales_df.Model != '*Market*']
yearly_sales_df = yearly_sales_df.reset_index(drop=True)
yearly_sales_df

Unnamed: 0,Model,Total_Sales,Year
0,Ford F-Series,901463.0,2005
1,Chevrolet Silverado,705980.0,2005
2,Toyota Camry,431703.0,2005
3,Toyota Corolla/Matrix,341290.0,2005
4,Dodge Ram,400543.0,2005
...,...,...,...
4388,Volvo 60-Series,15729,2020
4389,Volvo 90-Series,3195,2020
4390,Volvo XC40,23778,2020
4391,Volvo XC60,32078,2020


In [13]:
def get_brand_links(org_url):
    '''
    Return a df of yearly US car sales for every make and model 
    columns = Model, Year, and Total_Sales
    '''
    #org_url = 'https://www.carspecs.us/'
    response = requests.get(org_url, headers=headers)
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    #tbl = soup.find_all(class_='modelnamesandmakes_item')
    #link_list = soup.find_all(id_ = 'homepage-browsemakes')
    brand_link_dict = dict()
    link_list = soup.find_all('li')
    for x in link_list[7:]:
        brand = x.text
        link = 'https://www.carspecs.us' + x.find('a')['href']
        brand_link_dict[brand] = link
        time.sleep(0.5)
    return brand_link_dict

In [14]:
def get_model_links(brand, url):
    '''
    Return a df of yearly US car sales for every make and model 
    columns = Model, Year, and Total_Sales
    '''
    #load url page with all car models for specified brand url
    response = requests.get(url, headers=headers) #random user agent
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    
    #find first div with all model links
    div_list = soup.find_all('div', class_='pure-u-1 pure-u-md-1-2')
    
    #new df to store all makes and models for all years past 2004
#     columns = ('Model', 'Year', 'Model_url')
#     df = pd.DataFrame(columns=columns)
    model_link_dict = dict()
    
    #loop through all model links for specified brand 
    for x in div_list[1].find_all('li'):
        model = brand + ' ' + x.text 
        link = org_url + x.find('a')['href'] #model url
        
        #load model url to get list of model year urls
        response = requests.get(link)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        year_links = soup.find_all('li') #list of all year urls for specified model
        
        #loop through all year links starting at index 7
        idx = 7
#         index = 0
        for y in year_links[7:]:
            year = int(year_links[idx].text)
            if year > 2004: #only add year links that are 2005 to 2020
                #print(year)
                model_link = 'https://www.carspecs.us' + year_links[idx].find('a')['href']
#                 df = df.append(pd.DataFrame({'Model': model, 'Year': year, 'Model_url': model_link}, index=[index]), 
#                                ignore_index=True)
#                 index = index+1
                model_link_dict[model] = (year, model_link)
                idx = idx+1
        time.sleep(0.5) #pause 
            

    return model_link_dict

In [15]:
def get_model_specs(url):
#url = 'https://www.carspecs.us//cars/2020/acura/mdx'
    response = requests.get(url, headers=headers)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    doors =passengers =speed =horsepower =drive =engine =tank =volume =length =width =height = float('NaN')
    mpg = ''
    try:
        price = soup.find(text=re.compile('starting from'))
        price = price.findNext().text
    
    except:
        price = float('NaN')
    
    div_list = soup.find('div', class_='car-details').find_all('div')
    #print(div_list)

    for div in div_list[1:]:
        spec_list = div.text.split('\n')
        spec_list = [i for i in spec_list if i]
        #print(spec_list)
        if spec_list:
            if 'RPM' not in spec_list[0]:
                if 'Passenger Doors' in spec_list[0]:
                    doors = int(spec_list[-1])
                if 'Passenger Capacity' in spec_list[0]:
                    passengers = int(spec_list[-1])
                if 'mph' in spec_list[0]:
                    speed = spec_list[-1]
                if 'Horsepower' in spec_list[0]:
                    horsepower = spec_list[-1]
                if 'Drive type' in spec_list[0]:
                    drive = spec_list[-1]
                if 'combined' in spec_list[0]:
                    mpg = spec_list[-1]
                if 'Combined' in spec_list[0]:
                    mpg = spec_list[-1]
                if 'Engine type' in spec_list[0]:
                    engine = spec_list[-1]
                    engine = engine.replace('\t', '')
                if 'tank capacity' in spec_list[0]:
                    tank = spec_list[-1]
                if 'EPA interior' in spec_list[0]:
                    volume = spec_list[-1]
                if 'Length' in spec_list[0]:
                    length = spec_list[-1]
                if 'Width' in spec_list[0]:
                    width = spec_list[-1]
                if 'Height' in spec_list[0]:
                    height = spec_list[-1]
        
        if mpg == '':
            mpg = soup.find(text=re.compile('highway mpg'))
            try:
                mpg_list = mpg.split('/ ')
                city = mpg_list[0].lstrip('\r\n ')
                highway = mpg_list[1]
                mpg = (int(city[0:2])+int(highway[0:2]))/2
            except:
                mpg = 0
    
    return [price, doors, passengers, speed, horsepower, drive, mpg, engine, tank, volume, length, width, height]

In [None]:
# columns = ('Model', 'Year', 'url', 'price', 'doors', 'passengers', 'speed_sec', 'horsepower_hp', 'drive', 'mpg', 'engine', 
#            'tank_gal', 'volume_cuft', 'length_in', 'width_in', 'height_in')
# model_spec_df = pd.DataFrame(columns=columns) 

org_url = 'https://www.carspecs.us/'
brand_links = get_brand_links(org_url)
#print(brand_links)
all_model_links = dict()
for key, value in brand_links.items():
    all_model_links.update(get_model_links(key, brand_links[key]))
    #print(all_model_links)
#rint(all_model_links)

In [None]:
columns = ('Model', 'url', 'price', 'doors', 'passengers', 'speed_sec', 'horsepower_hp', 'drive', 'mpg', 'engine', 
           'tank_gal', 'volume_cuft', 'length_in', 'width_in', 'height_in')
model_spec_df = pd.DataFrame(columns=columns) 
idx = 0
for key, value in all_model_links.items():
    print(value)
    specs = get_model_specs(value[1])
    print(specs)
    print(value[0])
    print(value[1])
    model_spec_df = model_spec_df.append(pd.DataFrame({'Model': key,'Year':value[0], 'url': value[1], 'price': specs[0],'doors': specs[1],
                                                      'passengers': specs[2],'speed_sec': specs[3], 'horsepower_hp': specs[4],
                                                       'drive': specs[5],'mpg': specs[6],'engine': specs[7],
                                                       'tank_gal': specs[8], 'volume_cuft': specs[9], 'length_in': specs[10],
                                                      'width_in': specs[11],'height_in': specs[12]}, index=[idx]), ignore_index=True)
    idx = idx +1
    #print(model_spec_df)

In [None]:
model_spec_df['Year'] = model_spec_df['Year'].astype(str)
yearly_sales_df['Year'] = yearly_sales_df['Year'].astype(str)
# yearly_sales_df['Model_Merged'] = yearly_sales_df['Model'].str.strip() + yearly_sales_df['Year'].str.strip()
# yearly_sales_df['Model_Merged'] = yearly_sales_df['Model_Merged'].str.lower()
# yearly_sales_df['Model_Merged'] = yearly_sales_df['Model_Merged'].str.replace('-','')
# yearly_sales_df['Model_Merged'] = yearly_sales_df['Model_Merged'].str.replace('/','')
# model_spec_df['Model_Merged'] = model_spec_df['Model'].str.strip() + model_spec_df['Year'].str.strip()
# model_spec_df['Model_Merged'] = model_spec_df['Model_Merged'].str.lower()
# model_spec_df['Model_Merged'] = model_spec_df['Model_Merged'].str.replace('-','')
# model_spec_df['Model_Merged'] = model_spec_df['Model_Merged'].str.replace('/','')
model_spec_df.info()
# model_spec_df['Year'] = model_spec_df['Year'].astype(str)
# yearly_sales_df['Model_Merged'] = yearly_sales_df['Model'].str.strip() + yearly_sales_df['Year'].str.strip()
# yearly_sales_df['Model_Merged'] = yearly_sales_df['Model_Merged'].str.lower()
# yearly_sales_df['Model_Merged'] = yearly_sales_df['Model_Merged'].str.replace('-','')
# yearly_sales_df['Model_Merged'] = yearly_sales_df['Model_Merged'].str.replace('/','')
# model_spec_df['Model_Merged'] = model_spec_df['Model'].str.strip() + model_spec_df['Year'].str.strip()
# model_spec_df['Model_Merged'] = model_spec_df['Model_Merged'].str.lower()
# model_spec_df['Model_Merged'] = model_spec_df['Model_Merged'].str.replace('-','')
# model_spec_df['Model_Merged'] = model_spec_df['Model_Merged'].str.replace('/','')
model_spec_df.head()

In [None]:
#model_spec_df['Year'] = model_spec_df['Year'].astype(str)
#model_spec_sales_df = yearly_sales_df.merge(model_spec_df, how = 'left', on = ['Model_Merged'])
model_spec_sales_df = yearly_sales_df.merge(model_spec_df, how = 'left',on=["Model"])
model_spec_sales_df.info()

drop uneeded columns and remove rows with a lot of na values

In [None]:
model_spec_sales_df.drop(labels=['Model_Merged', 'Model_y', 'url'], axis=1, inplace=True)
# model_spec_sales_df.dropna(how = ['price', 'doors', 'passengers', 'speed_sec', 'horsepower_hp', 'drive', 'mpg', 
#                                   'engine', 'tank_gal', 'volume_cuft', 'length_in', 'width_in', 'height_in'], inplace=True)

Clean drive feature

In [None]:
model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.lower()
model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.lstrip()
model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.rstrip()
#model_spec_sales_df = model_spec_sales_df.dropna()
model_spec_sales_df['drive'].value_counts()

In [None]:
model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.replace('rear-wheel','rear wheel drive')
model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.replace('front-wheel','front wheel drive')
replace = ['full-time all wheel', 'attesa e-ts full-time all wheel', 'automatic full-time all wheel']

model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.replace(replace[1],'all wheel drive')
model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.replace(replace[2],'all wheel drive')
model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.replace(replace[2],'all wheel drive')
# for s in replace:
#     model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.replace(s,'all wheel drive')
model_spec_sales_df['drive'].value_counts()

In [None]:
model_spec_sales_df.info()
model_spec_sales_df.head()

In [None]:
replace_list = ['hp', 'mpg', 'gal.', 'cu.ft.', 'in.', ',', 'sec', '$', '$']
for s in replace_list:
    model_spec_sales_df = model_spec_sales_df.replace(s,'', regex=True) 
model_spec_sales_df['price'] = model_spec_sales_df['price'].str.replace('$','')

In [None]:
num_cols = ['price',  'speed_sec', 'horsepower_hp', 'mpg', 'tank_gal', 'volume_cuft', 
            'width_in', 'length_in','height_in']
for col in num_cols:
    model_spec_sales_df[col] = model_spec_sales_df[col].apply (pd.to_numeric, errors='coerce')
model_spec_sales_df['doors'] = model_spec_sales_df['doors'].astype(str)
model_spec_sales_df['passengers'] = model_spec_sales_df['passengers'].astype(str)

In [None]:
model_spec_sales_df.info()
model_spec_sales_df.sample(10)

In [None]:
model_spec_sales_df.to_pickle('../data/model_spec_sales_df.pkl')