In [254]:
import pandas as pd
import numpy as np

# text cleaning
import re

### Load data

In [288]:
full_model_spec_sales_df = pd.read_csv('../Data/partial_model_spec_sales_df.csv')

full_model_spec_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4352 entries, 0 to 4351
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           4352 non-null   int64  
 1   Total_Sales          4352 non-null   float64
 2   Year                 4352 non-null   int64  
 3   cleaned_name         4352 non-null   object 
 4   curb_weight_lbs      2127 non-null   float64
 5   ground_clearance_in  1694 non-null   float64
 6   class                0 non-null      float64
 7   body_style           2377 non-null   object 
 8   engine_type          2370 non-null   object 
 9   trans_descr          0 non-null      float64
 10  speed_sec            3301 non-null   float64
 11  horsepower_hp        3575 non-null   float64
 12  width_in             3559 non-null   float64
 13  msrp                 3690 non-null   float64
 14  pass_capacity        3401 non-null   float64
 15  doors                3401 non-null   f

### Clean data

In [289]:
# drop columns with small numbers 
drop_cols = ['ground_clearance_in', 'cylinders', 'lug_vol_cuft', 'trans_descr', 'class']
full_model_spec_sales_df2 = full_model_spec_sales_df.copy()
full_model_spec_sales_df2.drop(drop_cols, axis=1, inplace=True)

#drop rows with less than 100 sales
full_model_spec_sales_df2 = full_model_spec_sales_df2[full_model_spec_sales_df2['Total_Sales'] > 100]

full_model_spec_sales_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4092 entries, 0 to 4351
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       4092 non-null   int64  
 1   Total_Sales      4092 non-null   float64
 2   Year             4092 non-null   int64  
 3   cleaned_name     4092 non-null   object 
 4   curb_weight_lbs  2116 non-null   float64
 5   body_style       2366 non-null   object 
 6   engine_type      2359 non-null   object 
 7   speed_sec        3142 non-null   float64
 8   horsepower_hp    3408 non-null   float64
 9   width_in         3396 non-null   float64
 10  msrp             3526 non-null   float64
 11  pass_capacity    3253 non-null   float64
 12  doors            3253 non-null   float64
 13  wheelbase_in     3539 non-null   float64
 14  height_in        3537 non-null   float64
 15  tank_cap_gal     3525 non-null   float64
 16  length_in        3354 non-null   float64
 17  vol_cubft     

In [290]:
def clean_categorical_column(value, remove_words, replace_dict):
    try:
        value = value.lower()
        value2 = re.sub(r'[^\w\s]', '', value)
        
        #loop through strings to remove in df
        for key, value in replace_dict.items():
            value2 = value2.replace(key, value) 
        
        for word in remove_words:
            value2 = value2.replace(word, '')

        return value2.strip()
    
    except:
        return value

### Clean drive column

In [291]:
# replace strings
replace_dict = {'4': 'four ', '2': 'two', 'drive': '', 'awd': 'all wheel', '4 ': 'four ', '  ': ' ', '4 ': 'four ',
               'frontwheel': 'front wheel', 'allwheel': 'all wheel', "rearwheel": 'rear wheel', 'frontwheel': 'front wheel',
               'fourwheel': 'four wheel', 'two wheel': 'front wheel'}

# #loop through strings to remove in df
# for key, value in replace_dict.items():
#     full_model_spec_sales_df2 = full_model_spec_sales_df2.replace(key, value, regex=True) 

remove_words = ['fulltime', 'automatic', 'quattro', 'instant', 'tractiontm', 'autotrac', 'allfour', 'versatrak',
               'multimode', 'parttime']
full_model_spec_sales_df2['drivetrain'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_categorical_column(x['drivetrain'], remove_words, replace_dict), axis=1)

full_model_spec_sales_df2['drivetrain'].value_counts()

front wheel    1625
rear wheel      925
all wheel       684
four wheel      364
Name: drivetrain, dtype: int64

### Clean fuel type

In [292]:
def clean_fuel_type(value):
    try:
        value = value.lower()
        
        if 'premium' in value or 'gas v6' in value or 'regular unleaded' in value or 'gas v8' in value:
            return 'gas'
        elif 'midgrade' in value or 'turbocharged' in value or 'regular' in value or 'flat' in value:
            return 'gas'
        elif 'gas/electric' in value or 'electric/gas' in value or 'hybrid' in value:
            return 'hybrid'
        elif 'e85' in value or 'flex' in value or 'gasethanol' in value:
            return 'flex fuel'
        elif 'electricity' in value or 'electric' in value:
            return 'electric'
        elif 'diesel' in value:
            return 'diesel'
        elif 'four' in value or 'turbo' in value or 'gas' in value:
            return 'gas'
        else:
            return 'gas'
    except:
        return value

remove_words = ['ity']
replace_dict = {}
full_model_spec_sales_df2['fuel'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_categorical_column(x['fuel_typ'], remove_words, replace_dict), axis=1)

full_model_spec_sales_df2['fuel'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_fuel_type(x['fuel']), axis=1)


full_model_spec_sales_df2['fuel'].value_counts()[0:20]

gas          3305
electric      131
flex fuel     103
diesel         22
hybrid         14
Name: fuel, dtype: int64

### Clean class EPA

In [299]:
def clean_epa_class(value):
    try:
        value = value.lower()
        
        if 'special purpose' in value:
            return 'special purpose'
        elif '' == value:
            return 'midsize'
        elif 'wagon' == value:
            return 'small station wagon'
        elif 'pickup' in value:
            return 'pickup truck'
        elif 'full size' in value:
            return 'large'
        else:
            return value
    except:
        return value

# replace strings
replace_dict2 = {'sport utility vehicles': 'suv','sport utility vehicle': 'suv', 'twowd': '2wd', 'four wd': '4wd',
                'twoseaters': 'two seaters', '  ': ' ', 'minivans':'minivan', 'wagons':'wagon',
                 'seaters':'seater', '2wd minivan': 'minivan 2wd', 'sport utililty': 'suv', 
                'twoseater': 'two seater', 'wgn':'wagon', 'sport utility': 'suv', '4wd suv': 'suv 4wd', 
                 '2wd suv': 'suv 2wd', 'awd suv': 'suv 4wd',
                'trucks': 'truck', '4wd special purpose' : 'special purpose 4wd', '4wd  pickup truck': 'pickup truck 4wd',
                '  ':' ', 'fwd suv': 'suv 4wd', '2wd van': 'minivan 2wd', 'awd': '4wd', '2':'two'}

remove_words = ['cars', 'standard', 'car', 'sedan', 'tbd', 'vehicle', '4wd', '2wd', 'twowd']
full_model_spec_sales_df2['class'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_categorical_column(x['class_EPA'], remove_words, replace_dict2), axis=1)

full_model_spec_sales_df2['class'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_epa_class(x['class']), axis=1)


full_model_spec_sales_df2['class'].value_counts()

suv                      564
midsize                  537
compact                  423
large                    300
subcompact               254
small suv                213
two seater               207
small station wagon      121
minivan                   82
minicompact               76
midsize station wagon     23
pickup truck              16
special purpose           15
Name: class, dtype: int64

### Clean Transmission 

In [300]:
def clean_transmission(value):
    try:
        value = value.lower()
        
        if 'semi-automatic' in value or 'dual clutch' in value or 'dualclutch' in value or 'double clutch' in value:
            return 'DCT'
        elif 'continuously variable transmission' in value or 'cvt' in value:
            return 'CVT'
        elif 'dsg' in value or 'tronic' in value:
            return 'DCT'
        elif 'fully automatic' in value or 'single speed' in value or 'doppelkupplung' in value:
            return 'AT'
        elif 'manual' in value:
            return 'MT'
        elif 'automatic' in value or 'auto' in value or 'mct' in value or 'amg' in value:
            return 'AT'
        else:
            return 'AT'
    except:
        return value

# replace strings
replace_dict = {}

remove_words = []
full_model_spec_sales_df2['transmission'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_categorical_column(x['trans'], remove_words, replace_dict2), axis=1)

full_model_spec_sales_df2['transmission'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_transmission(x['transmission']), axis=1)


full_model_spec_sales_df2['transmission'].value_counts()[0:20]

AT     1547
DCT     757
MT      703
CVT      18
Name: transmission, dtype: int64

### drop uneeded columns

In [301]:
drop_cols = ['trans', 'class_EPA', 'engine', 'engine_type', 'fuel_typ', 'Unnamed: 0',  'body_style']
full_model_spec_sales_df3 = full_model_spec_sales_df2.copy()
full_model_spec_sales_df3.drop(drop_cols, axis = 1, inplace=True)

full_model_spec_sales_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4092 entries, 0 to 4351
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Total_Sales      4092 non-null   float64
 1   Year             4092 non-null   int64  
 2   cleaned_name     4092 non-null   object 
 3   curb_weight_lbs  2116 non-null   float64
 4   speed_sec        3142 non-null   float64
 5   horsepower_hp    3408 non-null   float64
 6   width_in         3396 non-null   float64
 7   msrp             3526 non-null   float64
 8   pass_capacity    3253 non-null   float64
 9   doors            3253 non-null   float64
 10  wheelbase_in     3539 non-null   float64
 11  height_in        3537 non-null   float64
 12  tank_cap_gal     3525 non-null   float64
 13  length_in        3354 non-null   float64
 14  vol_cubft        2907 non-null   float64
 15  mpg_comb         3539 non-null   float64
 16  drivetrain       3598 non-null   object 
 17  fuel          

### Investigate and remove outliers

In [302]:
full_model_spec_sales_df3.length_in=np.where(full_model_spec_sales_df3.length_in > 130 ,full_model_spec_sales_df3.length_in, float('NaN'))
full_model_spec_sales_df3.length_in=np.where(full_model_spec_sales_df3.length_in < 230 ,full_model_spec_sales_df3.length_in, float('NaN'))

full_model_spec_sales_df3.wheelbase_in=np.where(full_model_spec_sales_df3.wheelbase_in > 65 ,full_model_spec_sales_df3.wheelbase_in, float('NaN'))
full_model_spec_sales_df3.wheelbase_in=np.where(full_model_spec_sales_df3.wheelbase_in < 200 ,full_model_spec_sales_df3.wheelbase_in, float('NaN'))

full_model_spec_sales_df3.width_in=np.where(full_model_spec_sales_df3.width_in > 50 ,full_model_spec_sales_df3.width_in, float('NaN'))
full_model_spec_sales_df3.width_in=np.where(full_model_spec_sales_df3.width_in < 90 ,full_model_spec_sales_df3.width_in, float('NaN'))

full_model_spec_sales_df3.height_in=np.where(full_model_spec_sales_df3.height_in > 50 ,full_model_spec_sales_df3.height_in, float('NaN'))
full_model_spec_sales_df3.height_in=np.where(full_model_spec_sales_df3.height_in < 100 ,full_model_spec_sales_df3.height_in, float('NaN'))

full_model_spec_sales_df3.vol_cubft=np.where(full_model_spec_sales_df3.vol_cubft >  70,full_model_spec_sales_df3.vol_cubft, float('NaN'))
# full_model_spec_sales_df3.tank_cap_gal=np.where(full_model_spec_sales_df3.tank_cap_gal < 100 ,full_model_spec_sales_df3.tank_cap_gal, float('NaN'))

full_model_spec_sales_df3.describe()

Unnamed: 0,Total_Sales,Year,curb_weight_lbs,speed_sec,horsepower_hp,width_in,msrp,pass_capacity,doors,wheelbase_in,height_in,tank_cap_gal,length_in,vol_cubft,mpg_comb
count,4092.0,4092.0,2116.0,3142.0,3408.0,3052.0,3526.0,3253.0,3253.0,3539.0,3320.0,3525.0,3017.0,2428.0,3539.0
mean,52807.072825,2012.321848,3768.631242,7.279109,247.858544,72.363886,33328.610909,4.981347,3.592778,110.22829,62.437826,18.277873,185.238508,107.858408,23.823571
std,84332.066439,4.728129,781.532424,1.607286,98.06657,4.279427,30006.989549,1.232859,0.763973,9.763794,7.208488,4.671239,14.484097,26.431945,10.364373
min,102.0,2005.0,1822.666667,2.51,66.0,50.6,1634.0,2.0,2.0,73.5,50.1,1.9,139.6,71.6,11.0
25%,7074.0,2008.0,3271.857143,6.29,174.0,70.375,14401.75,5.0,4.0,104.3,57.1,15.3,175.7,93.25,19.0
50%,22903.0,2012.0,3671.461039,7.22,240.0,72.5,27111.086957,5.0,4.0,109.3,59.2,18.0,186.2,101.0,22.0
75%,61317.25,2016.0,4234.5,8.36,300.0,74.8,40784.955357,5.0,4.0,114.6,68.0,20.74,194.9,108.4,26.0
max,909330.0,2020.0,6090.0,14.03,887.0,87.3,583050.0,9.0,4.0,160.721053,93.665,44.0,228.9,262.0,141.0


In [303]:
full_model_spec_sales_df3[full_model_spec_sales_df3['length_in'] > 230]

Unnamed: 0,Total_Sales,Year,cleaned_name,curb_weight_lbs,speed_sec,horsepower_hp,width_in,msrp,pass_capacity,doors,wheelbase_in,height_in,tank_cap_gal,length_in,vol_cubft,mpg_comb,drivetrain,fuel,class,transmission


### Save as CSV

In [304]:
full_model_spec_sales_df3.to_csv('../Data/full_model_spec_sales_df_cleaned.csv')

null_cols = ['vol_cubft', 'mpg_comb', 'drivetrain', 'fuel_typ','trans', 'class_EPA', 'engine']

null_df = full_model_spec_sales_df3[full_model_spec_sales_df2[null_cols].isna().all(1)] #.groupby('cleaned_name')
null_df['cleaned_name'].value_counts()

lincoln mks              10
chrysler town country    10
infiniti fx               8
infiniti qx56             8
mercedes benz slk         7
                         ..
bmw 1 2 ³                 1
ford bronco sport         1
scion xd                  1
toyota yaris              1
dodge nitro               1
Name: cleaned_name, Length: 289, dtype: int64