In [23]:
import math
import numpy as np
from scipy import stats
import pandas as pd
import re
import datetime

# web scraping
from bs4 import BeautifulSoup
import requests
import time
import random
from fake_useragent import UserAgent 

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

ua = UserAgent()
headers = {'user-agent': ua.random}

#### Get Monthly Sales for 2019 & 2020

In [4]:
def get_monthly_sales_df(url, year_str, tbl_num):
    '''
    A function that gets monthly sales of every make and model sold in the us for a specified year

    Parameters
    ----------
    url : url containg monthly sales data, in string format 
    year_str : year in string format
    tbl_num : index number corresponding to the location of the table on the webpage (1st table is at index 0)

    Returns
    -------
    Return a df of monthly US car sales for every make and model, months as columns
    '''
    response = requests.get(url, headers=headers)
    page= response.text
    soup = BeautifulSoup(page, "lxml")
    
    #find car sales data and turn it into a list
    tbl_str = soup.find_all('tbody')[tbl_num].text
    tbl_list = tbl_str.split('\n')
    tbl_list = [i for i in tbl_list if i] 
    
    #create empty time series df for specified year
    date_time_str = year_str + '-01'
    start_date = datetime.datetime.strptime(date_time_str, '%Y-%m')
    index = pd.date_range(start_date, periods=12, freq='m')
    df = pd.DataFrame(index=index)
    
    #fill empty df with monthly sales for all makes and models
    col_name = ''
    idx = 0
    for x in range(0,len(tbl_list)//13):
        col_name = tbl_list[x+idx]
        list_vals = []
        for val in range(x+1,x+13):
            list_vals.append(tbl_list[idx+val])
        df[col_name]=list_vals
        idx = idx+12
    return df

new df called monthly_sales_df

In [11]:
#only monthly sales data for years 2019 and 2020
url = 'https://www.goodcarbadcar.net/2019-us-vehicle-sales-figures-by-model/' 
df_2019 = get_monthly_sales_df(url, '2019', 2)
url = 'https://www.goodcarbadcar.net/2020-us-vehicle-sales-figures-by-model/'
df_2020 = get_monthly_sales_df(url, '2020', 0)
monthly_sales_df = pd.concat((df_2019, df_2020))

pickle df of monthly sales for 2019 and 2020 to data folder

In [14]:
monthly_sales_df.tail()

Unnamed: 0,Acura ILX,Acura MDX,Acura NSX,Acura RDX,Acura RLX,Acura TLX,Alfa Romeo 4C,Alfa Romeo Giulia,Alfa Romeo Stelvio,Audi A3,...,Ford Mustang Mach E,Genesis GV80,Kia K5,Kia Seltos,Land Rover Defender,Mazda 2,Tesla Model Y,Volkswagen Atlas Sport,Volkswagen ID.4,Volvo 40 Series
2020-08-31,1331,5165,3,5057,118,1973,7,683,886,796,...,0,0,3631,5314,405,0,10909,0,0,0
2020-09-30,1377,4920,10,4664,122,1848,7,740,960,863,...,0,0,5763,5613,439,0,11818,0,0,0
2020-10-31,1523,4732,22,5022,126,2365,2,864,1189,1094,...,0,0,7528,5542,2429,0,1687,3737,0,0
2020-11-30,1141,4491,9,4155,81,2014,2,709,977,899,...,0,58,7437,5122,1995,0,1386,3070,0,0
2020-12-31,1417,6040,14,5820,72,2285,3,987,1359,1250,...,3,1459,6029,6107,2776,3256,1928,4271,0,0


In [15]:
monthly_sales_df.to_pickle('../data/monthly_sales_df.pkl')

#### Get All Model Yearly Sales for 2005-2020

In [32]:
def get_model_sales_df(url, year):
    '''
    A function that gets all year end model sales for a specified year

    Parameters
    ----------
    url : url containg year end sales data for every make and model sold in the US, in string format
    year : year in string format

    Returns
    -------
    Return a df of yearly US car sales for every make and model 
    columns = Model, Year, and Total_Sales
    '''
    #read url page into list of pandas dfs 
    response = requests.get(url, headers=headers)
    page = response.text
    df_list = pd.read_html(page)
    
    #find correct df based on number of data frames on url page and 
    if len(df_list) == 1 or year == '2019':
        df = pd.DataFrame(df_list[0])
    elif len(df_list) == 2:
        df = pd.DataFrame(df_list[1])
    else:
        df = pd.DataFrame(df_list[len(df_list)-1])
    
    #special case for 2020 data because sales data is in monthly sales format for each model
    if year == '2020':  
        soup = BeautifulSoup(page, "lxml")
    
        #find car sales data and turn it into a list
        tbl_str = soup.find_all('tbody')[0].text
        tbl_list = tbl_str.split('\n')
        tbl_list = [i for i in tbl_list if i] 
        
        columns = ('Model', 'Year', 'Total_Sales')
        df = pd.DataFrame(columns=columns)

        #fill empty df with monthly sales for all makes and models
        model_name = ''
        idx = 0
        index = 0
        for x in range(0,len(tbl_list)//13):
            model_name = tbl_list[x+idx]
            year_sum = 0
            for val in range(x+1,x+13):
                month_total = tbl_list[idx+val].replace(',','')
                month_total = int(month_total)
                year_sum = year_sum + month_total
            #df1 = pd.DataFrame([model_name, year_sum], columns = ['Model', year])
            df = df.append(pd.DataFrame({'Model': model_name, 'Year': year, 'Total_Sales': year_sum}, index=[index]), 
                           ignore_index=True)
            index = index+1
            #df.append(df1)
            idx = idx+12
        return df
    
    #special cases for finding the location of the total sales based on years
    if year == '2012':
        df = df.iloc[:, [2,3]]
    elif year == '2005':
        df = df.iloc[:, [1,3]]
    elif year in ['2017','2018', '2019', '2020']:
        df = df.iloc[:, [0,4]]
    else:
        df = df.iloc[:, [1,2]]
    
    #add columns to df and year column
    df.columns = ['Model', 'Total_Sales']
    df['Year'] = year
    
    #* indicate further breakdown of sum totals, overall totals be removed further down when duplicates are removed
    #other symbols refer to subnotes in the tables and are not apart of model names
    symbols = ['*', '²', '¹', '^', '†', '‡']
    for s in symbols:
        df['Model'] = df['Model'].str.replace(s,'')

    #clean model and Total_Sales column
    df.dropna(subset=['Model'], inplace=True)
    df['Model'] = df['Model'].str.rstrip()
    df['Model'] = df['Model'].str.lstrip()
    df['Total_Sales'] = df['Total_Sales'].apply(pd.to_numeric, errors='coerce') #to numeric
    
    #remove first in set of duplicates b/c first is a sum of a car and the hybrid model
    df.drop_duplicates(subset='Model', keep='last', inplace=True)
    
    return df

In [139]:
#dictionary of all urls containing all year end sales data with year as the key
url_dict = {'2006': 'https://www.goodcarbadcar.net/2006-usa-auto-sales-rankings-by-mode/',
            '2007': 'https://www.goodcarbadcar.net/usa-2007-vehicle-sales-rankings-by-mode/',
            '2008': 'https://www.goodcarbadcar.net/2008-america-auto-sales-rankings-by-mode/',
            '2009': 'https://www.goodcarbadcar.net/usa-auto-sales-rankings-by-model-2009/',
            '2010': 'https://www.goodcarbadcar.net/2010-america-auto-sales-rankings-by-mode/',
            '2011': 'https://www.goodcarbadcar.net/top-268-best-selling-vehicles-2011-year/',
            '2012': 'https://www.goodcarbadcar.net/2012-usa-auto-sales-rankings-by-model7/',
            '2013': 'https://www.goodcarbadcar.net/usa-vehicle-sales-rankings-by-model-december-2013-year-end/',
            '2014': 'https://www.goodcarbadcar.net/usa-all-cars-sales-figures-2014-december-year-end/',
            '2015': 'https://www.goodcarbadcar.net/usa-car-sales-by-model-2015-year-end-december/',
            '2016': 'https://www.goodcarbadcar.net/usa-2016-vehicle-sales-by-model-manufacturer-brand/',
            '2017': 'https://www.goodcarbadcar.net/december-2017-year-end-u-s-passenger-car-sales-rankings-top-171-best-selling-cars-america-every-car-ranked/',
            '2018': 'https://www.goodcarbadcar.net/december-2018-the-best-selling-vehicles-in-america-every-vehicle-ranked/',
            '2019': 'https://www.goodcarbadcar.net/2019-us-vehicle-sales-figures-by-model/',
            '2020': 'https://www.goodcarbadcar.net/2020-us-vehicle-sales-figures-by-model/'
           }

url = 'https://www.goodcarbadcar.net/2006-usa-auto-sales-rankings-by-mode/'
yearly_sales_df= get_model_sales_df(url = url, year = '2005') #new yearl_sales_df to store all yearly sales data

years = ['2005']
#loop through dictionary with urls and stack data frames 
for key, value in url_dict.items():
    years.append(key)
#     print(years)
    df = get_model_sales_df(url = value, year = key) #get df of total sales
    yearly_sales_df = pd.concat([yearly_sales_df, df], axis=0)

print(yearly_sales_df.tail())

               Model Total_Sales  Year
299  Volvo 60-Series       15729  2020
300  Volvo 90-Series        3195  2020
301       Volvo XC40       23778  2020
302       Volvo XC60       32078  2020
303       Volvo XC90       34251  2020


In [140]:
#remove rows with no sales
yearly_sales_df = yearly_sales_df[yearly_sales_df.Total_Sales != 0]
yearly_sales_df.reset_index(drop=True, inplace=True) #reset index after stacking dfs

#Clean total sales dataframe by removing rows containing certain strings
remove_strings = ["Market",'Total','Family','Brand','Passenger Cars, SUVs, Crossovers','Minivans','Pickup Trucks',
                 'Commercial Vans', 'COMPANY', 'MOTOR', 'GROUP', 'AMERICAN', 'AUTOMOBILES', 'JAGUAR', 'DAIMLER']
for string in remove_strings:
    yearly_sales_df = yearly_sales_df[~yearly_sales_df.Model.str.contains(string)]
    
print(yearly_sales_df.shape)
print(yearly_sales_df.tail())
yearly_sales_df.to_pickle('../data/yearly_sales_df.pkl')

(4394, 3)
                Model Total_Sales  Year
4512  Volvo 60-Series       15729  2020
4513  Volvo 90-Series        3195  2020
4514       Volvo XC40       23778  2020
4515       Volvo XC60       32078  2020
4516       Volvo XC90       34251  2020


### Get all Make and Model Specs for Every Car Year Since 2005

In [37]:
def get_brand_links(org_url):
    '''
    A function that gets all links to makes and models for every car brand

    Parameters
    ----------
    url : url containg links to all brands

    Returns
    -------
    Return a dictionary of links to every make and model for every brand 
    Key = brand, value = brand url
    '''
    response = requests.get(org_url, headers=headers)
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    
    brand_link_dict = dict() #empty dict for all urls
    link_list = soup.find_all('li') #get all list objects on the url page
    #loop through all list objects, starting at index 7 where the brand links start
    for x in link_list[7:]:
        brand = x.text
        link = 'https://www.carspecs.us' + x.find('a')['href']
        brand_link_dict[brand] = link
        time.sleep(0.5) #pause
    return brand_link_dict

In [38]:
def get_model_links(brand, url):
    '''
    A function that gets all links to every make, model, and year greater than 2005 ever sold by a specified car brand

    Parameters
    ----------
    url : url containg links to every make and model ever sold by a specified car brand, string format
    brand : brand name in string format

    Returns
    -------
    Return a dataframe of links to every make, model, and year for brand 
    columns = Model, Year, and Model url 
    '''
    #load url page with all car models for specified brand url
    response = requests.get(url, headers=headers) #random user agent
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    
    #find first div with all model links
    div_list = soup.find_all('div', class_='pure-u-1 pure-u-md-1-2')
    
    #new df to store all makes and models for all years past 2004
    columns = ('Model', 'Year', 'Model_url')
    df = pd.DataFrame(columns=columns)
    
    #loop through all model links for specified brand 
    for x in div_list[1].find_all('li'):
        model = brand + ' ' + x.text #brand and model name
        link = org_url + x.find('a')['href'] #model url
        
        #load model url to get list of model year urls
        response = requests.get(link)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        year_links = soup.find_all('li') #list of all year urls for specified model
        
        #loop through all year links starting at index 7
        idx = 7
        index = 0
        for y in year_links[7:]:
            year = year_links[idx].text
            if year in years: #only add year links that are 2005 to 2020
                model_link = 'https://www.carspecs.us' + year_links[idx].find('a')['href'] #add orginal url to string
                df = df.append(pd.DataFrame({'Model': model, 'Year': year, 'Model_url': model_link}, index=[index]), 
                                ignore_index=True)
                index = index+1
                idx = idx+1
        time.sleep(0.5) #pause 
            
    return df

In [39]:
def get_model_specs(url,head):
    '''
    A function that gets all specs for a specified make, model, and year of a car

    Parameters
    ----------
    url : url containg links to every year make and model ever sold by a specified car brand, string format
    hear : header for a page request, in dictioanry format

    Returns
    -------
    Return a list of model specs 
    '''
    response = requests.get(url, headers=head)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    #set all values to nan
    doors =passengers =speed =horsepower =drive =engine =tank =volume =length =width =height = wheelbase =float('NaN')
    mpg = ''
    
    #try to find price
    try:
        price = soup.find(text=re.compile('starting from'))
        price = price.findNext().text
    
    except:
        price = float('NaN')
    
    #list of all divs containg car specs
    div_list = soup.find('div', class_='car-details').find_all('div')

    #loop through all divs and search for strings to find certain car specs and assign to varibales if found
    for div in div_list[1:]:
        #create list, index 0 containg the spec, and index 1 containg the spec value
        spec_list = div.text.split('\n')
        spec_list = [i for i in spec_list if i] #remove empty values in list
        if spec_list:
            if 'RPM' not in spec_list[0]:
                if 'Passenger Doors' in spec_list[0]:
                    doors = int(spec_list[-1])
                if 'Passenger Capacity' in spec_list[0]:
                    passengers = int(spec_list[-1])
                if 'mph' in spec_list[0]:
                    speed = spec_list[-1]
                if 'Horsepower' in spec_list[0]:
                    horsepower = spec_list[-1]
                if 'Drive type' in spec_list[0]:
                    drive = spec_list[-1]
                if 'combined' in spec_list[0]:
                    mpg = spec_list[-1]
                if 'Combined' in spec_list[0]:
                    mpg = spec_list[-1]
                if 'Engine type' in spec_list[0]:
                    engine = spec_list[-1]
                    engine = engine.replace('\t', '')
                if 'tank capacity' in spec_list[0]:
                    tank = spec_list[-1]
                if 'EPA interior' in spec_list[0]:
                    volume = spec_list[-1]
                if 'Length' in spec_list[0]:
                    length = spec_list[-1]
                if 'Width' in spec_list[0]:
                    width = spec_list[-1]
                if 'Height' in spec_list[0]:
                    height = spec_list[-1]
                if 'Wheelbase' in spec_list[0]:
                    wheelbase = spec_list[-1]
        
        #if mpg not found in spec lists, find combined mpg by averaged highway and city mpg by searching for strings
        if mpg == '':
            mpg = soup.find(text=re.compile('highway mpg'))
            try:
                mpg_list = mpg.split('/ ')
                city = mpg_list[0].lstrip('\r\n ')
                highway = mpg_list[1]
                mpg = (int(city[0:2])+int(highway[0:2]))/2
            except:
                mpg = float('NaN') #return nan if not found
                
    #time.sleep(.1+.5*random.random()) #random pause 
    
    return [price, doors, passengers, speed, horsepower, drive, mpg, engine, tank, volume, length, width, height,wheelbase]

#### Get a dictionary of links to all models of a car brand 

In [40]:
org_url = 'https://www.carspecs.us/'
brand_links = get_brand_links(org_url)

#### Get a dictionary of links to all years of a specific make and model 

In [41]:
columns = ('Model', 'Year', 'Model_url')
model_links_df = pd.DataFrame(columns=columns)
for key, value in brand_links.items():
    df = get_model_links(key, brand_links[key])
    model_links_df = model_links_df.append(df, ignore_index=True)
print(model_links_df)

           Model  Year                                     Model_url
0      Acura ILX  2020   https://www.carspecs.us/cars/2020/acura/ilx
1      Acura ILX  2019   https://www.carspecs.us/cars/2019/acura/ilx
2      Acura ILX  2018   https://www.carspecs.us/cars/2018/acura/ilx
3      Acura ILX  2017   https://www.carspecs.us/cars/2017/acura/ilx
4      Acura ILX  2016   https://www.carspecs.us/cars/2016/acura/ilx
...          ...   ...                                           ...
7020  Volvo XC90  2009  https://www.carspecs.us/cars/2009/volvo/xc90
7021  Volvo XC90  2008  https://www.carspecs.us/cars/2008/volvo/xc90
7022  Volvo XC90  2007  https://www.carspecs.us/cars/2007/volvo/xc90
7023  Volvo XC90  2006  https://www.carspecs.us/cars/2006/volvo/xc90
7024  Volvo XC90  2005  https://www.carspecs.us/cars/2005/volvo/xc90

[7025 rows x 3 columns]


#### Create new df containging specs of every make, model, and year of a car

In [45]:
columns = ('Model', 'url',  'drive','engine','price','doors', 'passengers', 'speed_sec', 'horsepower_hp', 'mpg',  
           'tank_gal', 'volume_cuft', 'length_in', 'width_in', 'height_in','wheelbase_in')
model_spec_df = pd.DataFrame(columns=columns) 

#loop through all rows to get all links for every make, model, and year and send to function to get specs
#append specs to model spec dataframe
idx = 0
for index, row in model_links_df.iterrows():
    specs = get_model_specs(row[2],headers)
    model_spec_df = model_spec_df.append(pd.DataFrame({'Model': row[0],'Year':row[1], 'url': row[2], 'drive': specs[5], 'engine': specs[7],
                                                       'price': specs[0],'doors': specs[1],'passengers': specs[2],'speed_sec': specs[3], 
                                                       'horsepower_hp': specs[4],'mpg': specs[6],
                                                       'tank_gal': specs[8], 'volume_cuft': specs[9], 'length_in': specs[10],
                                                      'width_in': specs[11],'height_in': specs[12],'wheelbase_in':specs[13]}, index=[idx]), 
                                                         ignore_index=True)
    idx = idx +1 #update index of spec df
    
    #pause after every 10 loops
    #change user agent, chosen randomly every 10 loops
    if idx%2 == 0:
        time.sleep(0.5+1*random.random())
        headers = {'user-agent': ua.random}

In [55]:
model_spec_df.to_pickle('../data/model_spec_df.pkl')
model_spec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7025 entries, 0 to 7024
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Model          7025 non-null   object
 1   url            7025 non-null   object
 2   drive          7022 non-null   object
 3   engine         5818 non-null   object
 4   price          5278 non-null   object
 5   doors          3999 non-null   object
 6   passengers     3999 non-null   object
 7   speed_sec      5590 non-null   object
 8   horsepower_hp  6946 non-null   object
 9   mpg            6416 non-null   object
 10  tank_gal       6879 non-null   object
 11  volume_cuft    1765 non-null   object
 12  length_in      5639 non-null   object
 13  width_in       5916 non-null   object
 14  height_in      5937 non-null   object
 15  wheelbase_in   5961 non-null   object
 16  Year           7025 non-null   object
dtypes: object(17)
memory usage: 933.1+ KB


## Clean Data

Remove symbols and units from data in df so data can be changed to numeric datatype

In [247]:
model_spec_df2 = model_spec_df.copy()
yearly_sales_df2 = yearly_sales_df.copy()
replace_list = ['hp', 'mpg', 'gal.', 'cu.ft.', 'in.', ',', 'sec', '$', '$']

#loop through strings to remove in df
for s in replace_list:
    model_spec_df2 = model_spec_df2.replace(s,'', regex=True) 
model_spec_df2['price'] = model_spec_df2['price'].str.replace('$','') #endure dollar sign is removed

In [248]:
model_spec_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7025 entries, 0 to 7024
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Model          7025 non-null   object 
 1   url            7025 non-null   object 
 2   drive          7022 non-null   object 
 3   engine         5818 non-null   object 
 4   price          5278 non-null   object 
 5   doors          3999 non-null   float64
 6   passengers     3999 non-null   float64
 7   speed_sec      5590 non-null   object 
 8   horsepower_hp  6946 non-null   object 
 9   mpg            6416 non-null   object 
 10  tank_gal       6879 non-null   object 
 11  volume_cuft    1765 non-null   object 
 12  length_in      5639 non-null   object 
 13  width_in       5916 non-null   object 
 14  height_in      5937 non-null   object 
 15  wheelbase_in   5961 non-null   object 
 16  Year           7025 non-null   object 
dtypes: float64(2), object(15)
memory usage: 933.1+ KB


Change all columns to numeric except for drive, engine, doors, and passengers

In [249]:
num_cols = ['price',  'speed_sec', 'horsepower_hp', 'mpg', 'tank_gal', 'volume_cuft', 
            'width_in', 'length_in','height_in','wheelbase_in']
for col in num_cols:
    model_spec_df2[col] = model_spec_df2[col].apply(pd.to_numeric, errors='coerce')

#doors and passengers as strings so that it can be treated as categorical data
# model_spec_df2['doors'] = model_spec_df2['doors'].astype(str)
# model_spec_df2['passengers'] = model_spec_df2['passengers'].astype(str)
model_spec_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7025 entries, 0 to 7024
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Model          7025 non-null   object 
 1   url            7025 non-null   object 
 2   drive          7022 non-null   object 
 3   engine         5818 non-null   object 
 4   price          5278 non-null   float64
 5   doors          3999 non-null   float64
 6   passengers     3999 non-null   float64
 7   speed_sec      5587 non-null   float64
 8   horsepower_hp  6946 non-null   float64
 9   mpg            6416 non-null   float64
 10  tank_gal       6879 non-null   float64
 11  volume_cuft    1765 non-null   float64
 12  length_in      5639 non-null   float64
 13  width_in       5916 non-null   float64
 14  height_in      5935 non-null   float64
 15  wheelbase_in   5961 non-null   float64
 16  Year           7025 non-null   object 
dtypes: float64(12), object(5)
memory usage: 933.1+ KB


In [250]:
yearly_sales_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4394 entries, 0 to 4516
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Model        4394 non-null   object
 1   Total_Sales  4311 non-null   object
 2   Year         4394 non-null   object
dtypes: object(3)
memory usage: 137.3+ KB


#### Left merge Sales df to specs df and find info

Clean Model Names in each Dataframe before merging

In [251]:
import re

def clean_name(name):
    name = name.lower()
    name = re.sub(r'[^\w\s]', ' ', name)
    
#     word_list = name.split()
#     word_list  = [word for word in word_list if word not in stopwords]
#     name = ' '.join(word_list)
    return name

def find_partial_match(name, name_list):
    if 'ford' in name:
        name = re.sub('[0-9\n]','series',name)
    for n in name_list:
        if n in name:
            return n
    return name

In [253]:
model_spec_df2['cleaned'] = model_spec_df2.apply(lambda x: clean_name(x['Model']), axis=1)
yearly_sales_df2['cleaned_name'] = yearly_sales_df2.apply(lambda x: clean_name(x['Model']), axis=1)


name_list = yearly_sales_df2.cleaned_name.unique()
model_spec_df2['cleaned_name'] = model_spec_df2.apply(lambda x: find_partial_match(x['cleaned'], name_list), axis=1)

yearly_sales_df2['Total_Sales'] = yearly_sales_df2['Total_Sales'].apply(pd.to_numeric, errors='coerce')

perform merge on Model and Year columns

In [254]:
model_spec_sales = yearly_sales_df2.merge(model_spec_df2, how = 'left',on=["cleaned_name",'Year'])
                                    
engine = model_spec_sales.groupby(['cleaned_name','Year'])['engine'].agg(pd.Series.mode)
engine = engine.apply(lambda y: float('NaN') if len(y)==0 else y)

drive = model_spec_sales.groupby(['cleaned_name','Year'])['drive'].agg(pd.Series.mode)
drive = drive.apply(lambda y: float('NaN') if len(y)==0 else y)
 
model_spec_sales_df = model_spec_sales.groupby(['cleaned_name','Year']).mean().reset_index()
    
model_spec_sales_df['engine'] = engine[0]
model_spec_sales_df['drive'] = drive[0]

# http://www.carqueryapi.com/
# https://www.fueleconomy.gov/feg/download.shtml
# https://www.reddit.com/r/datasets/comments/b6rcwv/i_scraped_32000_cars_including_the_price_and_115/
model_spec_sales_df.info()
# print(model_spec_sales_df[2000:3200])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4394 entries, 0 to 4393
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cleaned_name   4394 non-null   object 
 1   Year           4394 non-null   object 
 2   Total_Sales    4311 non-null   float64
 3   price          2576 non-null   float64
 4   doors          2072 non-null   float64
 5   passengers     2072 non-null   float64
 6   speed_sec      2397 non-null   float64
 7   horsepower_hp  2844 non-null   float64
 8   mpg            2806 non-null   float64
 9   tank_gal       2823 non-null   float64
 10  volume_cuft    956 non-null    float64
 11  length_in      2682 non-null   float64
 12  width_in       2777 non-null   float64
 13  height_in      2779 non-null   float64
 14  wheelbase_in   2788 non-null   float64
 15  engine         4394 non-null   object 
 16  drive          4394 non-null   object 
dtypes: float64(13), object(4)
memory usage: 583.7+ KB


We see that some of the cars with the top sales did not get any spec data. This needs to be further investigated to see if further merging could be done

Find columns total sales df and in specs df that did not merge

In [246]:
unique_sales_df = (yearly_sales_df2.merge(model_spec_df2, on='cleaned_name', how='outer', indicator=True)
            .query('_merge != "both"').drop(columns='_merge'))
unique_sales_df['cleaned_name'].value_counts().head(20) #find top rows that did not merge

honda ridgeline               13
chevrolet equinox             13
lincoln navigator             13
nissan pathfinder             13
honda fcx                     12
lincoln mkz                   12
gmc yukon xl                  10
toyota corolla matrix         10
lincoln mks                   10
lincoln mkx                   10
suzuki forenza reno            9
lincoln mkt                    9
lexus sc430                    9
lincoln town car               9
infiniti fx                    8
fiat 500l                      8
mitsubishi outlander sport     8
infiniti q50                   8
infiniti qx56                  8
alfa romeo 4c                  7
Name: cleaned_name, dtype: int64

Fill in null values, using previous values of the same model

In [285]:
model_spec_sales_df2 = model_spec_sales_df.groupby('cleaned_name').ffill().reindex(model_spec_sales_df.columns, axis=1)
model_spec_sales_df2['cleaned_name'] = model_spec_sales_df['cleaned_name']
# model_spec_sales_df2[2100:2150]
model_spec_sales_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4394 entries, 0 to 4393
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cleaned_name   4394 non-null   object 
 1   Year           4394 non-null   object 
 2   Total_Sales    4352 non-null   float64
 3   price          3193 non-null   float64
 4   doors          2780 non-null   float64
 5   passengers     2780 non-null   float64
 6   speed_sec      3051 non-null   float64
 7   horsepower_hp  3214 non-null   float64
 8   mpg            3205 non-null   float64
 9   tank_gal       3193 non-null   float64
 10  volume_cuft    1659 non-null   float64
 11  length_in      3140 non-null   float64
 12  width_in       3187 non-null   float64
 13  height_in      3185 non-null   float64
 14  wheelbase_in   3187 non-null   float64
 15  engine         4394 non-null   object 
 16  drive          4394 non-null   object 
dtypes: float64(13), object(4)
memory usage: 583.7+ KB


Even after filling null values there is a lot of missing spec data, maybe try finding more spec data

## Load more spec data

In [341]:
specs_df = pd.read_csv('/Users/racheldilley/Documents/car-sales-predictor-data/fullspecs.csv')

In [342]:
specs_df2 = specs_df.transpose()
specs_df2.columns = specs_df2.iloc[0]
specs_df2.drop(specs_df2.index[0], inplace=True)
specs_df3 = specs_df2[['MSRP','Gas Mileage', 'Engine', 'EPA Class', 'Drivetrain', 'Passenger Capacity', 'Passenger Doors',
                     'Body Style', 'Transmission','EPA Classification', 'Base Curb Weight (lbs)', 'Wheelbase (in)',
                      'Min Ground Clearance (in)', 'Height, Overall (in)', 'Fuel Tank Capacity, Approx (gal)',
                      'Fuel Economy Est-Combined (MPG)', 'Fuel System', 'Engine Type', 'Trans Description Cont.',
                      'Passenger Volume (ft³)']].reset_index()
list(specs_df2.columns)

['MSRP',
 'Gas Mileage',
 'Engine',
 'EPA Class',
 'Style Name',
 'Drivetrain',
 'Passenger Capacity',
 'Passenger Doors',
 'Body Style',
 'Transmission',
 'EPA Classification',
 'Base Curb Weight (lbs)',
 'Front Hip Room (in)',
 'Front Leg Room (in)',
 'Second Shoulder Room (in)',
 'Passenger Volume (ft³)',
 'Second Head Room (in)',
 'Front Shoulder Room (in)',
 'Second Hip Room (in)',
 'Front Head Room (in)',
 'Second Leg Room (in)',
 'Wheelbase (in)',
 'Min Ground Clearance (in)',
 'Track Width, Front (in)',
 'Width, Max w/o mirrors (in)',
 'Track Width, Rear (in)',
 'Height, Overall (in)',
 'Cargo Volume to Seat 1 (ft³)',
 'Cargo Volume to Seat 2 (ft³)',
 'Cargo Volume to Seat 3 (ft³)',
 'Fuel Tank Capacity, Approx (gal)',
 'Fuel Economy Est-Combined (MPG)',
 'EPA Fuel Economy Est - City (MPG)',
 'EPA Fuel Economy Est - Hwy (MPG)',
 'Engine Order Code',
 'SAE Net Torque @ RPM',
 'Fuel System',
 'Engine Type',
 'SAE Net Horsepower @ RPM',
 'Displacement',
 'First Gear Ratio (:1)',
 

In [343]:
col_names = [ 'name','MSRP','mpg', 'engine', 'class', 'drivetrain', 'capacity', 'doors',
                     'body_style', 'transmission','EPA_class', 'curb_weight_lbs', 'wheelbase_in',
                      'ground_clearance_in', 'height_in', 'fuel_tank_cap_gal',
                      'combined_mpg', 'fuel_system', 'engine_type', 'trans_descr',
                      'passenger_volume_cubft']
specs_df3.columns = col_names
specs_df3.head()

Unnamed: 0,name,MSRP,mpg,engine,class,drivetrain,capacity,doors,body_style,transmission,...,curb_weight_lbs,wheelbase_in,ground_clearance_in,height_in,fuel_tank_cap_gal,combined_mpg,fuel_system,engine_type,trans_descr,passenger_volume_cubft
0,2019 Acura RDX Specs: FWD w/Technology Pkg,"$40,600",22 mpg City/28 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,Front Wheel Drive,5,4,Sport Utility,Transmission: 10-Speed Automatic -inc: sequent...,...,3790,108.3,5.7,65.7,17.1,24.0,Gasoline Direct Injection,Turbo Premium Unleaded I-4,Automatic w/OD,104
1,2019 Acura RDX Specs: FWD w/Advance Pkg,"$45,500",22 mpg City/28 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,Front Wheel Drive,5,4,Sport Utility,Transmission: 10-Speed Automatic -inc: sequent...,...,3829,108.3,5.7,65.7,17.1,24.0,Gasoline Direct Injection,Turbo Premium Unleaded I-4,Automatic w/OD,104
2,2019 Acura RDX Specs: FWD w/A-Spec Pkg,"$43,600",22 mpg City/27 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,Front Wheel Drive,5,4,Sport Utility,Transmission: 10-Speed Automatic -inc: sequent...,...,3821,108.3,5.7,65.7,17.1,24.0,Gasoline Direct Injection,Turbo Premium Unleaded I-4,Automatic w/OD,104
3,2019 Acura RDX Specs: FWD,"$37,400",22 mpg City/28 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 2WD,Front Wheel Drive,5,4,Sport Utility,Transmission: 10-Speed Automatic -inc: sequent...,...,3783,108.3,5.7,65.7,17.1,24.0,Gasoline Direct Injection,Turbo Premium Unleaded I-4,Automatic w/OD,104
4,2019 Acura RDX Specs: AWD w/Technology Pkg,"$42,600",21 mpg City/27 mpg Hwy,"Turbo Premium Unleaded I-4, 2.0 L",Small Sport Utility Vehicles 4WD,All Wheel Drive,5,4,Sport Utility,Transmission: 10-Speed Automatic -inc: sequent...,...,4026,108.3,5.7,65.7,17.1,23.0,Gasoline Direct Injection,Turbo Premium Unleaded I-4,Automatic w/OD,104


### Clean new spec data

Clean car name

In [344]:
def get_year(name):
    return name[0:4]

def get_name(name):
    name = name[5:]
    sep = 'Specs:'
    stripped = name.split(sep, 1)[0]
    name = stripped.replace(' Specs:', '')
    return name

specs_df3['Year'] = specs_df3.apply(lambda x: get_year(x['name']), axis=1)
specs_df3['Model'] = specs_df3.apply(lambda x: get_name(x['name']), axis=1)
specs_df3.drop('name', axis=1, inplace=True)
specs_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32316 entries, 0 to 32315
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   MSRP                    32262 non-null  object
 1   mpg                     26292 non-null  object
 2   engine                  30341 non-null  object
 3   class                   30340 non-null  object
 4   drivetrain              30600 non-null  object
 5   capacity                32316 non-null  object
 6   doors                   32316 non-null  object
 7   body_style              30600 non-null  object
 8   transmission            29602 non-null  object
 9   EPA_class               19918 non-null  object
 10  curb_weight_lbs         19707 non-null  object
 11  wheelbase_in            30345 non-null  object
 12  ground_clearance_in     17826 non-null  object
 13  height_in               30284 non-null  object
 14  fuel_tank_cap_gal       30179 non-null  object
 15  co

In [345]:
# remove characters before chaning columns to numeric
specs_df3['MSRP'] = specs_df3['MSRP'].str.replace('$','') 
specs_df3['MSRP'] = specs_df3['MSRP'].str.replace(',','') 

#change columns to numeric
num_cols = [ 'MSRP',  'capacity', 'doors','curb_weight_lbs', 'wheelbase_in','ground_clearance_in', 'height_in',
            'fuel_tank_cap_gal','combined_mpg', 'passenger_volume_cubft']

for col in num_cols:
    specs_df3[col] = specs_df3[col].apply(pd.to_numeric, errors='coerce')

In [348]:
specs_df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32316 entries, 0 to 32315
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   MSRP                    32262 non-null  float64
 1   mpg                     26292 non-null  object 
 2   engine                  30341 non-null  object 
 3   class                   30340 non-null  object 
 4   drivetrain              30600 non-null  object 
 5   capacity                32316 non-null  int64  
 6   doors                   32316 non-null  int64  
 7   body_style              30600 non-null  object 
 8   transmission            29602 non-null  object 
 9   EPA_class               19918 non-null  object 
 10  curb_weight_lbs         19392 non-null  float64
 11  wheelbase_in            30301 non-null  float64
 12  ground_clearance_in     13899 non-null  float64
 13  height_in               30143 non-null  float64
 14  fuel_tank_cap_gal       30145 non-null

In [None]:
# group and find average of numerical columns
model_spec_sales_df = model_spec_sales.groupby(['Model','Year']).mean().reset_index()

str_cols = [ 'engine', 'class', 'drivetrain', 'body_style', 'transmission','EPA_class', 
             'fuel_system', 'engine_type', 'trans_descr',]

engine = model_spec_sales.groupby(['Model','Year'])['engine'].agg(pd.Series.mode)
engine = engine.apply(lambda y: float('NaN') if len(y)==0 else y)

drive = model_spec_sales.groupby(['cleaned_name','Year'])['drive'].agg(pd.Series.mode)
drive = drive.apply(lambda y: float('NaN') if len(y)==0 else y)
 
model_spec_sales_df = model_spec_sales.groupby(['cleaned_name','Year']).mean().reset_index()
    
model_spec_sales_df['engine'] = engine[0]
model_spec_sales_df['drive'] = drive[0]

Identify mismatches with largest value counts and replace strings

In [32]:
#drop rows with null Total Sales values
model_spec_sales_df2 = model_spec_sales_df2[model_spec_sales_df2['Total_Sales'].notna()]

fill in na values with existing model data from another year

In [33]:
all_model_specs = model_spec_df2.copy() 
all_model_specs['passengers'] = all_model_specs['passengers'].apply (pd.to_numeric, errors='coerce')
all_model_specs['doors'] = all_model_specs['doors'].apply (pd.to_numeric, errors='coerce')

In [34]:
models_grouped = all_model_specs.groupby('Model').mean().reset_index()
models_grouped

Unnamed: 0,Model,price,doors,passengers,speed_sec,horsepower_hp,mpg,tank_gal,volume_cuft,length_in,width_in,height_in,wheelbase_in
0,Acura ILX,22210.400000,4.0,5.000000,7.256250,188.250000,28.375000,13.200000,93.416667,180.925000,70.600000,55.600000,105.100000
1,Acura ILX Hybrid,,,,11.400000,111.000000,38.500000,13.200000,99.300000,179.100000,70.600000,55.600000,105.100000
2,Acura MDX,22319.818182,4.0,7.000000,6.763125,290.500000,19.968750,20.268750,137.520000,192.593750,77.920000,67.806250,108.981250
3,Acura NSX,157500.000000,2.0,2.000000,3.518000,516.400000,20.600000,16.180000,45.980000,175.660000,79.700000,47.460000,102.720000
4,Acura RDX,20406.000000,4.0,5.000000,6.921429,260.000000,21.821429,17.014286,103.750000,183.214286,73.814286,65.400000,105.471429
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1223,Volvo XC60,26976.000000,4.0,5.000000,7.515000,247.727273,22.900000,18.500000,98.600000,66.930000,52.570000,47.770000,110.180000
1224,Volvo XC70,14240.833333,4.0,5.000000,7.310833,239.583333,21.416667,18.375000,116.250000,69.540000,64.640000,44.910000,110.080000
1225,Volvo XC90,23963.636364,4.0,6.333333,8.014667,259.666667,20.133333,19.960000,102.833333,83.208333,58.275000,47.266667,114.641667
1226,smart fortwo,9945.900000,2.0,2.000000,9.536364,74.833333,47.416667,8.600000,,106.133333,64.266667,52.750000,73.566667


In [35]:
#loop through 
for index, row in all_model_specs.iterrows():
    i = models_grouped[models_grouped['Model']==row['Model']].index.values
    if i:
        specs = list(models_grouped.iloc[i[0],:])
        specs2 = specs +[float('Nan'),float('Nan'),float('Nan'),float('Nan'),float('Nan')]
        r = list(row) 
        idx = 6
        ix = 1
        try:
            for x in r[4:]:
                    if math.isnan(x):
                        if pd.notna(specs2[ix]):
                            model_spec_sales_df2.iloc[index,idx] = math.floor(specs2[ix])
                            
        except:
            try:
                for x in r[6:]:
                        if math.isnan(int(x)): 
                            if pd.notna(specs2[ix]):
                                model_spec_sales_df2.iloc[index,idx] = math.floor(specs2[ix])
            except:
                continue
                
            idx = idx+1
            ix = ix+1


In [36]:
model_spec_sales_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4362 entries, 0 to 4444
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Model          4362 non-null   object 
 1   Total_Sales    4362 non-null   object 
 2   Year           4362 non-null   object 
 3   url            3151 non-null   object 
 4   drive          3149 non-null   object 
 5   engine         2963 non-null   object 
 6   price          4195 non-null   float64
 7   doors          3151 non-null   object 
 8   passengers     3151 non-null   object 
 9   speed_sec      2472 non-null   float64
 10  horsepower_hp  3122 non-null   float64
 11  mpg            3050 non-null   float64
 12  tank_gal       3100 non-null   float64
 13  volume_cuft    991 non-null    float64
 14  length_in      2894 non-null   float64
 15  width_in       3023 non-null   float64
 16  height_in      3023 non-null   float64
 17  wheelbase_in   3037 non-null   float64
dtypes: float

drop volume_cuft from model_specs_df because not enough data points and cant be easily estimated

In [37]:
model_spec_sales_df2['Total_Sales'] = model_spec_sales_df2['Total_Sales'].apply(pd.to_numeric, errors='coerce')

In [39]:
#remove cars with total sales below 25
model_spec_sales_dfo = model_spec_sales_df2[model_spec_sales_df2['Total_Sales'] > 25]
model_spec_sales_dfo.describe()

Unnamed: 0,Total_Sales,price,speed_sec,horsepower_hp,mpg,tank_gal,volume_cuft,length_in,width_in,height_in,wheelbase_in
count,4199.0,4047.0,2471.0,3121.0,3049.0,3099.0,991.0,2894.0,3023.0,3023.0,3037.0
mean,54266.792332,27545.468248,7.203318,251.332265,24.422926,18.542959,105.434712,172.725954,69.334969,58.668177,110.440319
std,88536.38317,33186.154258,1.598015,97.171802,11.494455,4.914443,21.987676,41.96475,9.870447,12.036324,10.06864
min,26.0,1989.0,2.51,66.0,10.5,1.9,42.0,9.5,7.5,21.1,78.7
25%,6561.0,11359.5,6.195,178.0,19.0,15.3,94.1,172.3,69.3,55.8,104.3
50%,22668.0,18249.0,7.18,240.0,22.5,18.0,102.9,183.9,72.2,58.1,109.3
75%,62338.5,31154.5,8.31,300.0,26.5,21.1,113.95,193.4,74.6,67.0,114.7
max,909330.0,334655.0,14.03,887.0,141.0,44.0,211.9,266.0,87.3,105.0,172.0


Remove outliers, more than 3 std dev from mean

In [40]:
model_spec_sales_dfo2 = model_spec_sales_dfo[(np.abs(stats.zscore(model_spec_sales_dfo[['Total_Sales','price', 'speed_sec', 
                                                                                      'horsepower_hp', 'mpg', 
                                                                                      'tank_gal', 'length_in', 
                                                                                      'width_in', 'height_in','wheelbase_in']])) < 2.5)] #filtering

In [41]:
model_spec_sales_dfo2.describe()

Unnamed: 0,Total_Sales,price,speed_sec,horsepower_hp,mpg,tank_gal,volume_cuft,length_in,width_in,height_in,wheelbase_in
count,4065.0,3917.0,2424.0,3007.0,2948.0,2983.0,951.0,2815.0,2914.0,2914.0,2928.0
mean,42095.533579,27698.60531,7.189583,251.786831,24.359057,18.441948,105.503785,172.329087,69.520031,58.462752,110.03154
std,52750.713171,33446.965676,1.60251,97.789202,11.568736,4.748522,22.392885,42.124811,9.795075,12.021664,9.235643
min,26.0,2052.0,2.51,66.0,12.0,1.9,42.0,9.5,7.5,21.1,78.7
25%,6152.0,11447.0,6.1775,178.0,19.0,15.5,94.0,172.1,69.4,55.7,103.9
50%,21342.0,18247.0,7.16,241.0,22.5,18.0,103.0,183.9,72.2,58.1,109.3
75%,55654.0,31198.0,8.3,300.0,26.0,21.0,114.45,193.45,74.6,66.875,114.6
max,273060.0,334655.0,14.03,887.0,141.0,44.0,211.9,241.25,87.3,105.0,160.5


Remove volume_cuft because not enough data points and url because not needed

In [42]:
model_spec_sales_dfo2.drop(labels=['volume_cuft', 'url'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Clean drive feature

In [43]:
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.lower()
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.lstrip()
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.rstrip()
model_spec_sales_dfo2['drive'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.lstrip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_spec_sales_dfo2['drive'] = model_spec_sales_df

front wheel drive                               1240
rear wheel drive                                 730
all wheel drive                                  671
four wheel drive                                 291
front-wheel                                       41
rear-wheel                                        26
autotrac automatic full-time four-wheel            8
full-time instant traction(tm) all wheel           4
full-time all wheel                                4
part-time four-wheel                               3
full-time quattro all wheel                        3
automatic full-time all wheel                      2
all-mode 4wd part and full-time four-wheel         2
automatic full-time four-wheel                     2
part and full-time four-wheel                      1
real time(tm) automatic full-time four-wheel       1
full-time all4 all wheel                           1
versatrak automatic full-time all wheel            1
multi-mode part-time four-wheel               

In [46]:
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('rear-wheel','rear wheel drive')
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('front-wheel','front wheel drive')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('rear-wheel','rear wheel drive')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('front-wheel','front wheel drive')


In [49]:
#change values to other is value count is below 10
c = model_spec_sales_dfo2['drive'].value_counts()
model_spec_sales_dfo2['drive'] = np.where(model_spec_sales_dfo2['drive'].isin(c.index[c<10]), 'other',
                                          model_spec_sales_dfo2['drive'])

model_spec_sales_dfo2['drive'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_spec_sales_dfo2['drive'] = np.where(model_spec_sales_dfo2['drive'].isin(c.index[c<10]), 'other',


front wheel drive    1281
rear wheel drive      756
all wheel drive       671
four wheel drive      291
other                  33
Name: drive, dtype: int64

investigate engine column

In [51]:
model_spec_sales_dfo2['engine'].value_counts()

Gas          2736
Electric       50
Flex Fuel      33
Hybrid         23
Diesel         12
Name: engine, dtype: int64

investigate passenger column

In [52]:
model_spec_sales_dfo2['passengers'].value_counts()

5.0    1333
nan     945
4.0     260
7.0     196
2.0     135
8.0      75
6.0      53
3.0      24
9.0      12
Name: passengers, dtype: int64

investigate doors column

In [53]:
model_spec_sales_dfo2['doors'].value_counts()

4.0    1673
nan     945
2.0     400
3.0      15
Name: doors, dtype: int64

Investigate duplicates

In [55]:
model_spec_sales_dfo2d = model_spec_sales_dfo2.dropna()
model_spec_sales_dfo2d = model_spec_sales_dfo2d[model_spec_sales_dfo2d.duplicated(subset=['Model','Year'], keep=False)]
model_spec_sales_dfo2d

Unnamed: 0,Model,Total_Sales,Year,drive,engine,price,doors,passengers,speed_sec,horsepower_hp,mpg,tank_gal,length_in,width_in,height_in,wheelbase_in


In [58]:
#remove rowws with nan doors or passengers columns
model_spec_sales_dfo3 = model_spec_sales_dfo2.copy()
model_spec_sales_dfo3 = model_spec_sales_dfo3[model_spec_sales_dfo3.doors != 'nan']
model_spec_sales_dfo3 = model_spec_sales_dfo3[model_spec_sales_dfo3.passengers != 'nan']

Add car classifications

In [None]:
def get_classification(row):
    l = row[12]
    wb = row[15]
    if math.isnan(l) or math.isnan(wb):
        return float('NaN')
    clas = ''
    if l > 195 and wb > 110:
        clas = 'large'
    elif l >= 180 and wb>= 105:
        clas = 'midsize'
    else:
        clas = 'small'
        
    return clas

In [None]:
model_spec_sales_df4 = model_spec_sales_dfo3.copy()
model_spec_sales_df4['Class'] = model_spec_sales_df4.apply(get_classification, axis=1)

pickle model sales and specs df to data folder

In [None]:
model_spec_sales_df4.to_pickle('../data/model_spec_sales_df.pkl')