In [19]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import datetime
import time
import random
from fake_useragent import UserAgent 
import math
import numpy as np
from scipy import stats

ua = UserAgent()
user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]
headers = {'user-agent': ua.random}
print(ua.random)

Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36


#### Get Monthly Sales for 2019 & 2020

In [2]:
def get_monthly_sales_df(url, year_str, tbl_num):
    '''
    A function that gets monthly sales of every make and model sold in the us for a specified year

    Parameters
    ----------
    url : url containg monthly sales data, in string format 
    year_str : year in string format
    tbl_num : index number corresponding to the location of the table on the webpage (1st table is at index 0)

    Returns
    -------
    Return a df of monthly US car sales for every make and model, months as columns
    '''
    response = requests.get(url, headers=headers)
    page= response.text
    soup = BeautifulSoup(page, "lxml")
    
    #find car sales data and turn it into a list
    tbl_str = soup.find_all('tbody')[tbl_num].text
    tbl_list = tbl_str.split('\n')
    tbl_list = [i for i in tbl_list if i] 
    
    #create empty time series df for specified year
    date_time_str = year_str + '-01'
    start_date = datetime.datetime.strptime(date_time_str, '%Y-%m')
    index = pd.date_range(start_date, periods=12, freq='m')
    df = pd.DataFrame(index=index)
    
    #fill empty df with monthly sales for all makes and models
    col_name = ''
    idx = 0
    for x in range(0,len(tbl_list)//13):
        col_name = tbl_list[x+idx]
        list_vals = []
        for val in range(x+1,x+13):
            list_vals.append(tbl_list[idx+val])
        df[col_name]=list_vals
        idx = idx+12
    return df

new df called monthly_sales_df

In [3]:
#only monthly sales data for years 2019 and 2020
url = 'https://www.goodcarbadcar.net/2019-us-vehicle-sales-figures-by-model/' 
df_sales = get_monthly_sales_df(url, '2019', 2)
url = 'https://www.goodcarbadcar.net/2020-us-vehicle-sales-figures-by-model/'
df = get_monthly_sales_df(url, '2020', 1)
monthly_sales_df = pd.concat((df_sales, df))
#monthly_sales_df.columns

pickle df of monthly sales for 2019 and 2020 to data folder

In [4]:
monthly_sales_df.to_pickle('../data/monthly_sales_df.pkl')

#### Get All Model Yearly Sales for 2005-2020

In [5]:
def get_model_sales_df(url, year):
    '''
    A function that gets all year end model sales for a specified year

    Parameters
    ----------
    url : url containg year end sales data for every make and model sold in the US, in string format
    year : year in string format

    Returns
    -------
    Return a df of yearly US car sales for every make and model 
    columns = Model, Year, and Total_Sales
    '''
    #read url page into list of pandas dfs 
    response = requests.get(url, headers=headers)
    page = response.text
    df_list = pd.read_html(page)
    
    #find correct df based on number of data frames on url page and 
    if len(df_list) == 1 or year == '2019':
        df = pd.DataFrame(df_list[0])
    elif len(df_list) == 2:
        df = pd.DataFrame(df_list[1])
    else:
        df = pd.DataFrame(df_list[len(df_list)-1])
    
    #special case for 2020 data because sales data is in monthly sales format for each model
    if year == '2020':  
        soup = BeautifulSoup(page, "lxml")
    
        #find car sales data and turn it into a list
        tbl_str = soup.find_all('tbody')[1].text
        tbl_list = tbl_str.split('\n')
        tbl_list = [i for i in tbl_list if i] 
        
        columns = ('Model', 'Year', 'Total_Sales')
        df = pd.DataFrame(columns=columns)

        #fill empty df with monthly sales for all makes and models
        model_name = ''
        idx = 0
        index = 0
        for x in range(0,len(tbl_list)//13):
            model_name = tbl_list[x+idx]
            year_sum = 0
            for val in range(x+1,x+13):
                month_total = tbl_list[idx+val].replace(',','')
                month_total = int(month_total)
                year_sum = year_sum + month_total
            #df1 = pd.DataFrame([model_name, year_sum], columns = ['Model', year])
            df = df.append(pd.DataFrame({'Model': model_name, 'Year': year, 'Total_Sales': year_sum}, index=[index]), 
                           ignore_index=True)
            index = index+1
            #df.append(df1)
            idx = idx+12
        return df
    
    #special cases for finding the location of the total sales based on years
    if year == '2012':
        df = df.iloc[:, [2,3]]
    elif year == '2005':
        df = df.iloc[:, [1,3]]
    elif year in ['2017','2018', '2019', '2020']:
        df = df.iloc[:, [0,4]]
    else:
        df = df.iloc[:, [1,2]]
    
    #add columns to df and year column
    df.columns = ['Model', 'Total_Sales']
    df['Year'] = year
    
    #* indicate further breakdown of sum totals, overall totals be removed further down when duplicates are removed
    #other symbols refer to subnotes in the tables and are not apart of model names
    symbols = ['*', '²', '¹', '^', '†', '‡']
    for s in symbols:
        df['Model'] = df['Model'].str.replace(s,'')

    #clean model and Total_Sales column
    df.dropna(subset=['Model'], inplace=True)
    df['Model'] = df['Model'].str.rstrip()
    df['Model'] = df['Model'].str.lstrip()
    df['Total_Sales'] = df['Total_Sales'].apply(pd.to_numeric, errors='coerce') #to numeric
    
    #remove first in set of duplicates b/c first is a sum of a car and the hybrid model
    df.drop_duplicates(subset='Model', keep='last', inplace=True)
    
    return df

In [6]:
#dictionary of all urls containing all year end sales data with year as the key
url_dict = {'2006': 'https://www.goodcarbadcar.net/2006-usa-auto-sales-rankings-by-mode/',
            '2007': 'https://www.goodcarbadcar.net/usa-2007-vehicle-sales-rankings-by-mode/',
            '2008': 'https://www.goodcarbadcar.net/2008-america-auto-sales-rankings-by-mode/',
            '2009': 'https://www.goodcarbadcar.net/usa-auto-sales-rankings-by-model-2009/',
            '2010': 'https://www.goodcarbadcar.net/2010-america-auto-sales-rankings-by-mode/',
            '2011': 'https://www.goodcarbadcar.net/top-268-best-selling-vehicles-2011-year/',
            '2012': 'https://www.goodcarbadcar.net/2012-usa-auto-sales-rankings-by-model7/',
            '2013': 'https://www.goodcarbadcar.net/usa-vehicle-sales-rankings-by-model-december-2013-year-end/',
            '2014': 'https://www.goodcarbadcar.net/usa-all-cars-sales-figures-2014-december-year-end/',
            '2015': 'https://www.goodcarbadcar.net/usa-car-sales-by-model-2015-year-end-december/',
            '2016': 'https://www.goodcarbadcar.net/usa-2016-vehicle-sales-by-model-manufacturer-brand/',
            '2017': 'https://www.goodcarbadcar.net/december-2017-year-end-u-s-passenger-car-sales-rankings-top-171-best-selling-cars-america-every-car-ranked/',
            '2018': 'https://www.goodcarbadcar.net/december-2018-the-best-selling-vehicles-in-america-every-vehicle-ranked/',
            '2019': 'https://www.goodcarbadcar.net/2019-us-vehicle-sales-figures-by-model/',
            '2020': 'https://www.goodcarbadcar.net/2020-us-vehicle-sales-figures-by-model/'
           }

url = 'https://www.goodcarbadcar.net/2006-usa-auto-sales-rankings-by-mode/'
yearly_sales_df= get_model_sales_df(url = url, year = '2005') #new yearl_sales_df to store all yearly sales data

years = ['2005']
#loop through dictionary with urls and stack data frames 
for key, value in url_dict.items():
    years.append(key)
    df = get_model_sales_df(url = value, year = key) #get df of total sales
    yearly_sales_df = pd.concat([yearly_sales_df, df], axis=0)

print(yearly_sales_df)

  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')
  df['Model'] = df['Model'].str.replace(s,'')


                     Model Total_Sales  Year
0            Ford F-Series    901463.0  2005
1      Chevrolet Silverado    705980.0  2005
2             Toyota Camry    431703.0  2005
3    Toyota Corolla/Matrix    341290.0  2005
4                Dodge Ram    400543.0  2005
..                     ...         ...   ...
298        Volvo 60-Series       15729  2020
299        Volvo 90-Series        3195  2020
300             Volvo XC40       23778  2020
301             Volvo XC60       32078  2020
302             Volvo XC90       34251  2020

[4518 rows x 3 columns]


In [7]:
#remove rows with no sales
yearly_sales_df = yearly_sales_df[yearly_sales_df.Total_Sales != 0]
yearly_sales_df = yearly_sales_df.reset_index(drop=True) #reset index after stacking dfs

#Clean total sales dataframe by removing rows containing certain strings
remove_strings = ["Market",'Total','Family','Brand','Passenger Cars, SUVs, Crossovers','Minivans','Pickup Trucks',
                 'Commercial Vans', 'COMPANY', 'MOTOR', 'GROUP', 'AMERICAN', 'AUTOMOBILES', 'JAGUAR', 'DAIMLER']
for string in remove_strings:
    yearly_sales_df = yearly_sales_df[~yearly_sales_df.Model.str.contains(string)]

Unnamed: 0,Model,Total_Sales,Year
0,Ford F-Series,901463.0,2005
1,Chevrolet Silverado,705980.0,2005
2,Toyota Camry,431703.0,2005
3,Toyota Corolla/Matrix,341290.0,2005
4,Dodge Ram,400543.0,2005
...,...,...,...
4388,Volvo 60-Series,15729,2020
4389,Volvo 90-Series,3195,2020
4390,Volvo XC40,23778,2020
4391,Volvo XC60,32078,2020


In [8]:
def get_brand_links(org_url):
    '''
    A function that gets all links to makes and models for every car brand

    Parameters
    ----------
    url : url containg links to all brands

    Returns
    -------
    Return a dictionary of links to every make and model for every brand 
    Key = brand, value = brand url
    '''
    response = requests.get(org_url, headers=headers)
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    
    brand_link_dict = dict() #empty dict for all urls
    link_list = soup.find_all('li') #get all list objects on the url page
    #loop through all list objects, starting at index 7 where the brand links start
    for x in link_list[7:]:
        brand = x.text
        link = 'https://www.carspecs.us' + x.find('a')['href']
        brand_link_dict[brand] = link
        time.sleep(0.5) #pause
    return brand_link_dict

In [9]:
def get_model_links(brand, url):
    '''
    A function that gets all links to every make, model, and year greater than 2005 ever sold by a specified car brand

    Parameters
    ----------
    url : url containg links to every make and model ever sold by a specified car brand, string format
    brand : brand name in string format

    Returns
    -------
    Return a dataframe of links to every make, model, and year for brand 
    columns = Model, Year, and Model url 
    '''
    #load url page with all car models for specified brand url
    response = requests.get(url, headers=headers) #random user agent
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    
    #find first div with all model links
    div_list = soup.find_all('div', class_='pure-u-1 pure-u-md-1-2')
    
    #new df to store all makes and models for all years past 2004
    columns = ('Model', 'Year', 'Model_url')
    df = pd.DataFrame(columns=columns)
    
    #loop through all model links for specified brand 
    for x in div_list[1].find_all('li'):
        model = brand + ' ' + x.text #brand and model name
        link = org_url + x.find('a')['href'] #model url
        
        #load model url to get list of model year urls
        response = requests.get(link)
        page = response.text
        soup = BeautifulSoup(page, "lxml")
        year_links = soup.find_all('li') #list of all year urls for specified model
        
        #loop through all year links starting at index 7
        idx = 7
        index = 0
        for y in year_links[7:]:
            year = year_links[idx].text
            if year in years: #only add year links that are 2005 to 2020
                model_link = 'https://www.carspecs.us' + year_links[idx].find('a')['href'] #add orginal url to string
                df = df.append(pd.DataFrame({'Model': model, 'Year': year, 'Model_url': model_link}, index=[index]), 
                                ignore_index=True)
                index = index+1
                idx = idx+1
        time.sleep(0.5) #pause 
            
    return df

In [10]:
def get_model_specs(url,head):
    '''
    A function that gets all specs for a specified make, model, and year of a car

    Parameters
    ----------
    url : url containg links to every year make and model ever sold by a specified car brand, string format
    hear : header for a page request, in dictioanry format

    Returns
    -------
    Return a list of model specs 
    '''
    response = requests.get(url, headers=head)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    #set all values to nan
    doors =passengers =speed =horsepower =drive =engine =tank =volume =length =width =height = wheelbase =float('NaN')
    mpg = ''
    
    #try to find price
    try:
        price = soup.find(text=re.compile('starting from'))
        price = price.findNext().text
    
    except:
        price = float('NaN')
    
    #list of all divs containg car specs
    div_list = soup.find('div', class_='car-details').find_all('div')

    #loop through all divs and search for strings to find certain car specs and assign to varibales if found
    for div in div_list[1:]:
        #create list, index 0 containg the spec, and index 1 containg the spec value
        spec_list = div.text.split('\n')
        spec_list = [i for i in spec_list if i] #remove empty values in list
        if spec_list:
            if 'RPM' not in spec_list[0]:
                if 'Passenger Doors' in spec_list[0]:
                    doors = int(spec_list[-1])
                if 'Passenger Capacity' in spec_list[0]:
                    passengers = int(spec_list[-1])
                if 'mph' in spec_list[0]:
                    speed = spec_list[-1]
                if 'Horsepower' in spec_list[0]:
                    horsepower = spec_list[-1]
                if 'Drive type' in spec_list[0]:
                    drive = spec_list[-1]
                if 'combined' in spec_list[0]:
                    mpg = spec_list[-1]
                if 'Combined' in spec_list[0]:
                    mpg = spec_list[-1]
                if 'Engine type' in spec_list[0]:
                    engine = spec_list[-1]
                    engine = engine.replace('\t', '')
                if 'tank capacity' in spec_list[0]:
                    tank = spec_list[-1]
                if 'EPA interior' in spec_list[0]:
                    volume = spec_list[-1]
                if 'Length' in spec_list[0]:
                    length = spec_list[-1]
                if 'Width' in spec_list[0]:
                    width = spec_list[-1]
                if 'Height' in spec_list[0]:
                    height = spec_list[-1]
                if 'Wheelbase' in spec_list[0]:
                    wheelbase = spec_list[-1]
        
        #if mpg not found in spec lists, find combined mpg by averaged highway and city mpg by searching for strings
        if mpg == '':
            mpg = soup.find(text=re.compile('highway mpg'))
            try:
                mpg_list = mpg.split('/ ')
                city = mpg_list[0].lstrip('\r\n ')
                highway = mpg_list[1]
                mpg = (int(city[0:2])+int(highway[0:2]))/2
            except:
                mpg = float('NaN') #return nan if not found
                
    #time.sleep(.1+.5*random.random()) #random pause 
    
    return [price, doors, passengers, speed, horsepower, drive, mpg, engine, tank, volume, length, width, height,wheelbase]

#### Get a dictionary of links to all models of a car brand 

In [11]:
org_url = 'https://www.carspecs.us/'
brand_links = get_brand_links(org_url)

#### Get a dictionary of links to all years of a specific make and model 

In [12]:
columns = ('Model', 'Year', 'Model_url')
model_links_df = pd.DataFrame(columns=columns)
for key, value in brand_links.items():
    df = get_model_links(key, brand_links[key])
    model_links_df = model_links_df.append(df, ignore_index=True)
print(model_links_df)

#### Create new df containging specs of every make, model, and year of a car

In [20]:
columns = ('Model', 'url',  'drive','engine','price','doors', 'passengers', 'speed_sec', 'horsepower_hp', 'mpg',  
           'tank_gal', 'volume_cuft', 'length_in', 'width_in', 'height_in','wheelbase_in')
model_spec_df = pd.DataFrame(columns=columns) 

#loop through all rows to get all links for every make, model, and year and send to function to get specs
#append specs to model spec dataframe
idx = 0
for index, row in model_links_df.iterrows():
    specs = get_model_specs(row[2],headers)
    print(index)
    model_spec_df = model_spec_df.append(pd.DataFrame({'Model': row[0],'Year':row[1], 'url': row[2], 'drive': specs[5], 'engine': specs[7],
                                                       'price': specs[0],'doors': specs[1],'passengers': specs[2],'speed_sec': specs[3], 
                                                       'horsepower_hp': specs[4],'mpg': specs[6],
                                                       'tank_gal': specs[8], 'volume_cuft': specs[9], 'length_in': specs[10],
                                                      'width_in': specs[11],'height_in': specs[12],'wheelbase_in':specs[13]}, index=[idx]), 
                                                         ignore_index=True)
    idx = idx +1 #update index of spec df
    
    #pause after every 10 loops
    if idx%10 == 0:
        time.sleep(1+1*random.random())
        
        #change username, chosen randomly every 50 loops
        if idx%500 == 0:
            user_agent = random.choice(user_agent_list)
            headers = {'user-agent': ua.random}

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

ConnectionError: HTTPSConnectionPool(host='www.carspecs.us', port=443): Max retries exceeded with url: /cars/2008/bmw/1-series (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7faed9ca7ac0>: Failed to establish a new connection: [Errno 60] Operation timed out'))

## Clean Data

Remove symbols and units from data in df so data can be changed to numeric datatype

In [None]:
model_spec_df2 = model_spec_df.copy()
replace_list = ['hp', 'mpg', 'gal.', 'cu.ft.', 'in.', ',', 'sec', '$', '$']

#loop through strings to remove in df
for s in replace_list:
    model_spec_df2 = model_spec_df2.replace(s,'', regex=True) 
model_spec_df2['price'] = model_spec_df2['price'].str.replace('$','') #endure dollar sign is removed

Change all columns to numeric except for drive, engine, doors, and passengers

In [None]:
num_cols = ['price',  'speed_sec', 'horsepower_hp', 'mpg', 'tank_gal', 'volume_cuft', 
            'width_in', 'length_in','height_in','wheelbase_in']
for col in num_cols:
    model_spec_df2[col] = model_spec_df2[col].apply(pd.to_numeric, errors='coerce')

#doors and passengers as strings so that it can be treated as categorical data
model_spec_df2['doors'] = model_spec_df2['doors'].astype(str)
model_spec_df2['passengers'] = model_spec_df2['passengers'].astype(str)
model_spec_df2.info()

#### Left merge Sales df to specs df and find info

perform intial merge on Model and Year columns

In [None]:
model_spec_sales_df = yearly_sales_df.merge(model_spec_df2, how = 'left',on=["Model",'Year'])
model_spec_sales_df.info()
model_spec_sales_df.head()

We see that some of the cars with the top sales did not get any spec data. This needs to be further investigated to see if further merging could be done

Find columns total sales df and in specs df that did not merge

In [None]:
unique_sales_df = (yearly_sales_df.merge(model_spec_df2, on='Model', how='outer', indicator=True)
            .query('_merge != "both"').drop(columns='_merge'))
unique_sales_df['Model'].value_counts().head(20) #find top rows that did not merge

Identify mismatches with largest value counts and replace strings

In [None]:
#investigate why chevy silverado is not merging
chevy_models = model_spec_df2[model_spec_df2['Model'].str.contains('Cheverolet S')].dropna() 
#remove strings from specs df so that Chevy Silverado has total sales data
model_spec_df2['Model'] = model_spec_df2['Model'].str.replace(' 1500','')
model_spec_df2['Model'] = model_spec_df2['Model'].str.replace(' 2500HD','')

In [None]:
#investigate why ford f-series is not merging
ford_f_models = model_spec_df2[model_spec_df2['Model'].str.contains('Ford F')].dropna()
#replace strings from specs df so that ford f-series has total sales data
model_spec_df2['Model'] = model_spec_df2['Model'].str.replace('150','Series') #150 model is most popular

In [None]:
#investigate why Mini Coopers are not merging
mini_models = model_spec_df2[model_spec_df2['Model'].str.contains('MINI')].dropna()
#replace strings from specs df so that ford f-series has total sales data
model_spec_df2['Model'] = model_spec_df2['Model'].str.replace('MINI','Mini')

Run a second merge with changes

In [None]:
model_spec_sales_df2 = yearly_sales_df.merge(model_spec_df2, how = 'left',on=["Model",'Year'])
model_spec_sales_df2.info()
model_spec_sales_df2.head()

In [None]:
#drop rows with null Total Sales values
model_spec_sales_df2 = model_spec_sales_df2[model_spec_sales_df2['Total_Sales'].notna()]

fill in na values with existing model data from another year

In [None]:
all_model_specs = model_spec_df2.copy() 
all_model_specs['passengers'] = all_model_specs['passengers'].apply (pd.to_numeric, errors='coerce')
all_model_specs['doors'] = all_model_specs['doors'].apply (pd.to_numeric, errors='coerce')

In [None]:
models_grouped = all_model_specs.groupby('Model').mean().reset_index()
models_grouped

In [None]:
for index, row in model_spec_sales_df2.iterrows():
    i = models_grouped[models_grouped['Model']==row['Model']].index.values
    if i:
        specs = list(models_grouped.iloc[i[0],:])
        specs2 = specs +[float('Nan'),float('Nan'),float('Nan'),float('Nan'),float('Nan')]
        r = list(row) 
        #print(specs2)
        #print(r)
        idx = 6
        ix = 1
        try:
            for x in r[4:]:
                #print(specs[idx])
                    if math.isnan(x): 
                        #print(specs[idx])
                        if pd.notna(specs2[ix]):
                            model_spec_sales_df2.iloc[index,idx] = math.floor(specs2[ix])
                            #print(model_spec_sales_df2.iloc[index,:])
                            
        except:
            try:
                for x in r[6:]:
                    #print(specs[idx])
                        if math.isnan(int(x)): 
                            #print(specs[idx])
                            if pd.notna(specs2[ix]):
                                model_spec_sales_df2.iloc[index,idx] = math.floor(specs2[ix])
                                #print(model_spec_sales_df2.iloc[index,:])
            except:
                continue
                
            idx = idx+1
            ix = ix+1


In [None]:
model_spec_sales_df2.info()

drop volume_cuft from model_specs_df because not enough data points and cant be easily estimated

In [None]:
model_spec_sales_df2['Total_Sales'] = model_spec_sales_df2['Total_Sales'].apply(pd.to_numeric, errors='coerce')

In [None]:
# model_spec_sales_df2.dropna(inplace=True)
model_spec_sales_df2.describe()

In [None]:
model_spec_sales_dfo = model_spec_sales_df2[model_spec_sales_df2['Total_Sales'] > 25]
model_spec_sales_dfo.describe()

Remove outliers, more than 3 std dev from mean

In [None]:
model_spec_sales_dfo2 = model_spec_sales_dfo[(np.abs(stats.zscore(model_spec_sales_dfo[['Total_Sales','price', 'speed_sec', 
                                                                                      'horsepower_hp', 'mpg', 
                                                                                      'tank_gal', 'length_in', 
                                                                                      'width_in', 'height_in','wheelbase_in']])) < 2.5)] #filtering

In [None]:
model_spec_sales_dfo2.describe()

Remove volume_cuft because not enough data points and url because not needed

In [None]:
model_spec_sales_dfo2.drop(labels=['volume_cuft', 'url'], axis=1, inplace=True)

Clean drive feature

In [None]:
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.lower()
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.lstrip()
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.rstrip()
#model_spec_sales_dfo2 = model_spec_sales_dfo2.dropna()
model_spec_sales_dfo2['drive'].value_counts()

In [None]:
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('rear-wheel','rear wheel drive')
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('front-wheel','front wheel drive')
#replace = ['full-time all wheel', 'attesa e-ts full-time all wheel', 'automatic full-time all wheel']

model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('automatic full-time all wheel',
                                                                        'all wheel drive')
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('versatrak ','')
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('full-time all wheel','all wheel drive')
model_spec_sales_dfo2['drive'] = model_spec_sales_dfo2['drive'].str.replace('real time automatic full-time four-wheel',
                                                                        'all wheel drive')
# for s in replace:
#     model_spec_sales_df['drive'] = model_spec_sales_df['drive'].str.replace(s,'all wheel drive')
model_spec_sales_dfo2['drive'].value_counts()

In [None]:
#model_spec_sales_dfo2.where(model_spec_sales_dfo2.apply(lambda x: x['drive'].map(x['drive'].value_counts()))>=10, "other")
c = model_spec_sales_dfo2['drive'].value_counts()
model_spec_sales_dfo2['drive'] = np.where(model_spec_sales_dfo2['drive'].isin(c.index[c<10]), 'other',
                                          model_spec_sales_dfo2['drive'])

model_spec_sales_dfo2['drive'].value_counts()

In [None]:
model_spec_sales_dfo2.info()
model_spec_sales_dfo2.describe()

investigate engine column

In [None]:
model_spec_sales_dfo2['engine'].value_counts()

investigate passenger column

In [None]:
model_spec_sales_dfo2['passengers'].value_counts()

investigate doors column

In [None]:
model_spec_sales_dfo2['doors'].value_counts()

Investigate duplicates

In [None]:
#model_spec_sales_dfo2d = model_spec_sales_dfo2.dropna()
model_spec_sales_dfo2d = model_spec_sales_dfo2d[model_spec_sales_dfo2d.duplicated(subset=['Model','Year'], keep=False)]
model_spec_sales_dfo2d

In [None]:
#model_spec_sales_dfo3 = model_spec_sales_dfo2.dropna()
model_spec_sales_dfo3 = model_spec_sales_dfo2.copy()
model_spec_sales_dfo3 = model_spec_sales_dfo3[model_spec_sales_dfo3.doors != 'nan']
model_spec_sales_dfo3 = model_spec_sales_dfo3[model_spec_sales_dfo3.passengers != 'nan']

In [None]:
model_spec_sales_dfo3.info()
model_spec_sales_dfo3.sample(10)

Add car classifications

In [None]:
def get_classification(row):
    l = row[12]
    wb = row[15]
    if math.isnan(l) or math.isnan(wb):
        return float('NaN')
    clas = ''
    if l > 195 and wb > 110:
        clas = 'large'
    elif l >= 180 and wb>= 105:
        clas = 'midsize'
    else:
        clas = 'small'
        
    return clas

In [None]:
model_spec_sales_df4 = model_spec_sales_dfo3.copy()
model_spec_sales_df4['Class'] = model_spec_sales_df4.apply(get_classification, axis=1)

In [None]:
# model_spec_sales_df4['passengers'] = model_spec_sales_df4['passengers'].apply (pd.to_numeric, errors='coerce')
# model_spec_sales_df4['doors'] = model_spec_sales_df4['doors'].apply (pd.to_numeric, errors='coerce')

model_spec_sales_df4.info()

pickle model sales and specs df to data folder

In [None]:
model_spec_sales_df4.to_pickle('../data/model_spec_sales_df.pkl')