In [254]:
import pandas as pd
import numpy as np

# text cleaning
import re

### Load data

In [122]:
# Load the combined sales / vehicle-specification table (one row per entry;
# see the column listing printed below).
full_model_spec_sales_df = pd.read_csv('../Data/full_model_spec_sales_df.csv')

# Print dtypes and non-null counts to see how sparsely each column is populated.
full_model_spec_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4092 entries, 0 to 4091
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       4092 non-null   int64  
 1   Unnamed: 0.1     4092 non-null   int64  
 2   Total_Sales      4092 non-null   float64
 3   Year             4092 non-null   int64  
 4   cleaned_name     4092 non-null   object 
 5   curb_weight_lbs  2125 non-null   float64
 6   body_style       2375 non-null   object 
 7   engine_type      2368 non-null   object 
 8   speed_sec        3151 non-null   float64
 9   horsepower_hp    3417 non-null   float64
 10  length_in        3349 non-null   float64
 11  width_in         3405 non-null   float64
 12  msrp             3535 non-null   float64
 13  pass_capacity    3262 non-null   float64
 14  doors            3262 non-null   float64
 15  wheelbase_in     3548 non-null   float64
 16  height_in        3546 non-null   float64
 17  tank_cap_gal  

### Clean data

In [124]:
# NOTE: dropping the sparsely populated columns
# (['ground_clearance_in', 'cylinders', 'lug_vol_cuft']) is currently disabled,
# so every column of the raw frame is carried forward.

# Keep only rows with MORE than 100 total sales. The copy is taken after the
# filter so that `full_model_spec_sales_df2` is an independent frame: later
# cells add columns to it, and assigning into a filtered slice is what
# triggers pandas' SettingWithCopyWarning.
full_model_spec_sales_df2 = full_model_spec_sales_df.loc[
    full_model_spec_sales_df['Total_Sales'] > 100
].copy()

full_model_spec_sales_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4092 entries, 0 to 4091
Data columns (total 26 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       4092 non-null   int64  
 1   Unnamed: 0.1     4092 non-null   int64  
 2   Total_Sales      4092 non-null   float64
 3   Year             4092 non-null   int64  
 4   cleaned_name     4092 non-null   object 
 5   curb_weight_lbs  2125 non-null   float64
 6   body_style       2375 non-null   object 
 7   engine_type      2368 non-null   object 
 8   speed_sec        3151 non-null   float64
 9   horsepower_hp    3417 non-null   float64
 10  length_in        3349 non-null   float64
 11  width_in         3405 non-null   float64
 12  msrp             3535 non-null   float64
 13  pass_capacity    3262 non-null   float64
 14  doors            3262 non-null   float64
 15  wheelbase_in     3548 non-null   float64
 16  height_in        3546 non-null   float64
 17  tank_cap_gal  

In [132]:
def clean_categorical_column(value, remove_words, replace_dict):
    """Normalise one free-text categorical value.

    Processing order: lower-case the text, delete every character that is
    neither a word character nor whitespace, apply the ``replace_dict``
    substitutions in dict insertion order, delete each entry of
    ``remove_words``, and finally trim surrounding whitespace.

    Parameters
    ----------
    value : object
        Raw cell value. Anything that is not a ``str`` (e.g. NaN / None for
        missing data) is returned unchanged.
    remove_words : iterable of str
        Substrings to delete after the substitutions have been applied.
    replace_dict : dict of str -> str
        Substring -> replacement pairs. They are applied sequentially, so an
        earlier substitution can affect what a later key matches.

    Returns
    -------
    object
        The cleaned string, or ``value`` itself when it is not a string.
    """
    # Explicit type guard instead of a bare ``except``: missing values pass
    # through untouched, while genuine errors are no longer silently hidden.
    if not isinstance(value, str):
        return value

    cleaned = re.sub(r'[^\w\s]', '', value.lower())

    # Distinct loop names: the original reused ``value`` here, clobbering the
    # function argument that the fallback path was supposed to return.
    for old, new in replace_dict.items():
        cleaned = cleaned.replace(old, new)

    for word in remove_words:
        cleaned = cleaned.replace(word, '')

    return cleaned.strip()

### Clean drive column

In [207]:
# Substring substitutions for the drivetrain text. clean_categorical_column
# applies them top to bottom, so the order is significant: digits are spelled
# out first, then the fused "<axle>wheel" tokens are split.
# The original literal listed '4 ' and 'frontwheel' more than once; Python keeps
# a single entry per key, so this is the mapping that was actually in effect.
replace_dict = {
    '4': 'four ',
    '2': 'two',
    'drive': '',
    'awd': 'all wheel',
    '4 ': 'four ',   # cannot match once '4' has been rewritten above; kept so the mapping is unchanged
    '  ': ' ',
    'frontwheel': 'front wheel',
    'allwheel': 'all wheel',
    'rearwheel': 'rear wheel',
    'fourwheel': 'four wheel',
    'two wheel': 'front wheel',
}

# Marketing / mode words that carry no drivetrain information.
remove_words = ['fulltime', 'automatic', 'quattro', 'instant', 'tractiontm', 'autotrac', 'allfour', 'versatrak',
                'multimode', 'parttime']

# Clean the column value by value; only the 'drivetrain' column is needed, so
# there is no reason to iterate over whole rows.
full_model_spec_sales_df2['drivetrain'] = full_model_spec_sales_df2['drivetrain'].apply(
    clean_categorical_column, args=(remove_words, replace_dict)
)

full_model_spec_sales_df2['drivetrain'].value_counts()

front wheel    1632
rear wheel      919
all wheel       684
four wheel      364
Name: drivetrain, dtype: int64

### Clean fuel type

In [225]:
# Ordered (label, keywords) rules used by clean_fuel_type. The first rule with
# any keyword contained in the lower-cased text wins, so the order matters:
# e.g. text mentioning both 'premium' and 'hybrid' is labelled 'gas'.
_FUEL_TYPE_RULES = (
    ('gas', ('premium', 'gas v6', 'regular unleaded', 'gas v8',
             'midgrade', 'turbocharged', 'regular', 'flat')),
    ('hybrid', ('gas/electric', 'electric/gas', 'hybrid')),
    ('flex fuel', ('e85', 'flex', 'gasethanol')),
    ('electric', ('electricity', 'electric')),
    ('diesel', ('diesel',)),
    ('gas', ('four', 'turbo', 'gas')),
)


def clean_fuel_type(value):
    """Collapse a free-text fuel description into a coarse fuel label.

    Parameters
    ----------
    value : object
        Fuel description. Anything that is not a ``str`` (e.g. NaN for a
        missing value) is returned unchanged.

    Returns
    -------
    object
        One of 'gas', 'hybrid', 'flex fuel', 'electric' or 'diesel' when a
        keyword matches; otherwise the lower-cased input text.

    Notes
    -----
    NOTE(review): when the input has already been through
    clean_categorical_column, punctuation such as '/' has been stripped, so the
    'gas/electric' and 'electric/gas' keywords can no longer match there.
    """
    # Explicit guard replaces the former bare ``except`` used to skip NaN.
    if not isinstance(value, str):
        return value

    text = value.lower()
    for label, keywords in _FUEL_TYPE_RULES:
        if any(keyword in text for keyword in keywords):
            return label
    return text

# Stripping 'ity' turns 'electricity' into 'electric'; no substitutions are needed.
remove_words = ['ity']
replace_dict = {}

# Two passes over the raw 'fuel_typ' text: generic normalisation first, then
# mapping onto the coarse fuel labels.
full_model_spec_sales_df2['fuel'] = (
    full_model_spec_sales_df2['fuel_typ']
    .apply(clean_categorical_column, args=(remove_words, replace_dict))
    .apply(clean_fuel_type)
)

# Show (at most) the 20 most frequent labels.
full_model_spec_sales_df2['fuel'].value_counts().head(20)

gas          3324
flex fuel     115
electric       76
diesel         47
hybrid         11
Name: fuel, dtype: int64

### Clean EPA class

In [226]:
def clean_epa_class(value):
    """Map a vehicle-class string onto a smaller set of class labels.

    Parameters
    ----------
    value : object
        Class text. Anything that is not a ``str`` (e.g. NaN for a missing
        value) is returned unchanged.

    Returns
    -------
    object
        'special purpose', 'pickup truck' or 'large' when the corresponding
        phrase occurs in the text; 'midsize' for an empty string;
        'small station wagon' when the text is exactly 'wagon'; otherwise the
        lower-cased input.
    """
    # Explicit guard replaces the former bare ``except`` used to skip NaN.
    if not isinstance(value, str):
        return value

    label = value.lower()

    if 'special purpose' in label:
        return 'special purpose'
    if label == '':
        # Text that is empty at this point is filed under 'midsize'.
        return 'midsize'
    if label == 'wagon':
        return 'small station wagon'
    if 'pickup' in label:
        return 'pickup truck'
    if 'full size' in label:
        return 'large'
    return label

# Substring substitutions for the EPA class text, applied top to bottom by
# clean_categorical_column (order matters).
# The original literal listed '  ' twice; a dict keeps one entry per key at its
# first position, so double spaces are collapsed only once, early in the
# sequence. The mapping below is the one that was actually in effect.
replace_dict2 = {
    'sport utility vehicles': 'suv',
    'sport utility vehicle': 'suv',
    'twowd': '2wd',
    'four wd': '4wd',
    'twoseaters': 'two seaters',
    '  ': ' ',
    'minivans': 'minivan',
    'wagons': 'wagon',
    'seaters': 'seater',
    '2wd minivan': 'minivan 2wd',
    'sport utililty': 'suv',   # (sic) key kept exactly as originally written
    'twoseater': 'two seater',
    'wgn': 'wagon',
    'sport utility': 'suv',
    '4wd suv': 'suv 4wd',
    '2wd suv': 'suv 2wd',
    'awd suv': 'suv 4wd',
    'trucks': 'truck',
    '4wd special purpose': 'special purpose 4wd',
    '4wd  pickup truck': 'pickup truck 4wd',
    'fwd suv': 'suv 4wd',
    '2wd van': 'minivan 2wd',
    'awd': '4wd',
}

# Words removed after the substitutions; the drive-type tokens ('4wd', '2wd')
# are dropped as well, so the final class label carries no drive information.
remove_words = ['cars', 'standard', 'car', 'sedan', 'tbd', 'vehicle', '4wd', '2wd']

# Two passes over the raw 'class_EPA' text: generic normalisation first, then
# mapping onto the final class labels. Only this one column is needed, so the
# functions are applied to the column rather than to whole rows.
full_model_spec_sales_df2['class'] = (
    full_model_spec_sales_df2['class_EPA']
    .apply(clean_categorical_column, args=(remove_words, replace_dict2))
    .apply(clean_epa_class)
)

full_model_spec_sales_df2['class'].value_counts()

suv                      559
midsize                  504
compact                  405
large                    282
subcompact               246
small suv                211
two seater               195
small station wagon      116
minivan                   78
minicompact               76
midsize station wagon     23
pickup truck              15
special purpose           15
Name: class, dtype: int64

### Clean Transmission 

In [227]:
def clean_transmission(value):
    """Collapse a free-text transmission description into a short code.

    Parameters
    ----------
    value : object
        Transmission description. Anything that is not a ``str`` (e.g. NaN for
        a missing value) is returned unchanged.

    Returns
    -------
    object
        'DCT', 'CVT', 'MT' or 'AT'. Checks run in the order written below and
        the first match wins; any string that matches nothing is labelled 'AT'.
    """
    # Explicit guard replaces the former bare ``except`` used to skip NaN.
    if not isinstance(value, str):
        return value

    text = value.lower()

    def mentions(*keywords):
        """Return True when any keyword occurs in the lower-cased text."""
        return any(keyword in text for keyword in keywords)

    if mentions('semi-automatic', 'dual clutch', 'dualclutch', 'double clutch'):
        return 'DCT'
    if mentions('continuously variable transmission', 'cvt'):
        return 'CVT'
    if mentions('dsg', 'tronic'):
        return 'DCT'
    # NOTE(review): 'doppelkupplung' is labelled 'AT' here although the
    # English 'dual clutch' spellings above map to 'DCT' - confirm intent.
    if mentions('fully automatic', 'single speed', 'doppelkupplung'):
        return 'AT'
    if mentions('manual'):
        return 'MT'
    # 'automatic', 'auto', 'mct', 'amg' and every unrecognised description
    # all end up as 'AT', so no separate keyword check is needed for them.
    return 'AT'

# The transmission text needs no substitutions or word removal; the generic
# cleaner is used only to lower-case it and strip punctuation.
replace_dict = {}
remove_words = []

# Pass the (empty) `replace_dict` defined above. The original call passed
# `replace_dict2`, i.e. the EPA-class substitutions, to the transmission text.
full_model_spec_sales_df2['transmission'] = (
    full_model_spec_sales_df2['trans']
    .apply(clean_categorical_column, args=(remove_words, replace_dict))
    .apply(clean_transmission)
)

# Show (at most) the 20 most frequent codes.
full_model_spec_sales_df2['transmission'].value_counts().head(20)

AT     1459
DCT     757
MT      683
CVT      18
Name: transmission, dtype: int64

### Drop unneeded columns

In [237]:
# Raw text columns that now have cleaned counterparts, other unused text
# columns, and the two leftover index columns from earlier CSV round-trips.
drop_cols = ['trans', 'class_EPA', 'engine', 'engine_type', 'fuel_typ', 'Unnamed: 0', 'Unnamed: 0.1', 'body_style']

# `drop` returns a new frame, so `full_model_spec_sales_df2` stays untouched.
full_model_spec_sales_df3 = full_model_spec_sales_df2.drop(columns=drop_cols)

full_model_spec_sales_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4092 entries, 0 to 4091
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Total_Sales      4092 non-null   float64
 1   Year             4092 non-null   int64  
 2   cleaned_name     4092 non-null   object 
 3   curb_weight_lbs  2125 non-null   float64
 4   speed_sec        3151 non-null   float64
 5   horsepower_hp    3417 non-null   float64
 6   length_in        3349 non-null   float64
 7   width_in         3405 non-null   float64
 8   msrp             3535 non-null   float64
 9   pass_capacity    3262 non-null   float64
 10  doors            3262 non-null   float64
 11  wheelbase_in     3548 non-null   float64
 12  height_in        3546 non-null   float64
 13  tank_cap_gal     3534 non-null   float64
 14  vol_cubft        2855 non-null   float64
 15  mpg_comb         3540 non-null   float64
 16  drivetrain       3599 non-null   object 
 17  fuel          

### Investigate and remove outliers

In [262]:
# Exclusive (lower, upper) bounds per column; values on or outside a bound are
# treated as outliers and blanked to NaN. `None` means no bound on that side.
# 'tank_cap_gal' is deliberately left unfiltered (an upper bound of 100 was
# tried and disabled).
outlier_bounds = {
    'length_in': (130, 230),
    'wheelbase_in': (65, 200),
    'width_in': (50, 90),
    'height_in': (50, 100),
    'vol_cubft': (70, None),
}

for column, (lower, upper) in outlier_bounds.items():
    column_values = full_model_spec_sales_df3[column]
    # Comparisons with NaN are False, so already-missing values stay NaN.
    in_range = column_values > lower
    if upper is not None:
        in_range &= column_values < upper
    # Bracket assignment (not attribute assignment) so the target is
    # unambiguously a column of the frame.
    full_model_spec_sales_df3[column] = column_values.where(in_range)

full_model_spec_sales_df3.describe()

Unnamed: 0,Total_Sales,Year,curb_weight_lbs,speed_sec,horsepower_hp,length_in,width_in,msrp,pass_capacity,doors,wheelbase_in,height_in,tank_cap_gal,vol_cubft,mpg_comb
count,4092.0,4092.0,2125.0,3151.0,3417.0,3015.0,3061.0,3535.0,3262.0,3262.0,3548.0,3329.0,3534.0,2394.0,3540.0
mean,52807.072825,2012.321848,3770.228568,7.274884,247.961638,185.206395,72.374283,33345.830853,4.981399,3.593901,110.235067,62.435561,18.281127,107.853097,23.829553
std,84332.066439,4.728129,780.260996,1.606929,97.961146,14.440425,4.277418,29970.800684,1.231157,0.763217,9.752325,7.198866,4.665798,26.372732,10.359974
min,102.0,2005.0,1822.666667,2.51,66.0,139.6,50.6,1634.0,2.0,2.0,73.5,50.1,1.9,71.6,11.0
25%,7074.0,2008.0,3277.52381,6.275,174.0,175.7,70.4,14478.0,5.0,4.0,104.3,57.1,15.3,93.3,19.0
50%,22903.0,2012.0,3676.857143,7.215,240.0,186.2,72.5,27246.0,5.0,4.0,109.3,59.2,18.0,101.0,22.0
75%,61317.25,2016.0,4233.0,8.36,300.0,194.9,74.8,40831.25,5.0,4.0,114.6,68.0,20.722321,108.4,26.0
max,909330.0,2020.0,6090.0,14.03,887.0,228.9,87.3,583050.0,9.0,4.0,160.721053,93.665,44.0,262.0,141.0


In [260]:
# Sanity check: list rows whose length exceeds 230 in, the upper bound used by
# the outlier filter.
full_model_spec_sales_df3.loc[full_model_spec_sales_df3['length_in'].gt(230)]

Unnamed: 0,Total_Sales,Year,cleaned_name,curb_weight_lbs,speed_sec,horsepower_hp,length_in,width_in,msrp,pass_capacity,doors,wheelbase_in,height_in,tank_cap_gal,vol_cubft,mpg_comb,drivetrain,fuel,class,transmission


### Save as CSV

In [102]:
# Persist the cleaned table.
# NOTE(review): the index is written too (pandas default), which is presumably
# how the 'Unnamed: 0' columns seen in the input file arose. Consider
# `index=False` once downstream readers have been checked.
full_model_spec_sales_df3.to_csv('../Data/full_model_spec_sales_df_cleaned.csv')

# Columns that are inspected together for missing values.
null_cols = ['vol_cubft', 'mpg_comb', 'drivetrain', 'fuel_typ', 'trans', 'class_EPA', 'engine']

# Rows where every one of those columns is missing. The mask is built from
# `full_model_spec_sales_df2` (which still has the raw text columns) and
# applied to `full_model_spec_sales_df3`; both frames share the same index.
null_df = full_model_spec_sales_df3.loc[
    full_model_spec_sales_df2[null_cols].isna().all(axis=1)
]

# Which models are affected, and how many rows each contributes.
null_df['cleaned_name'].value_counts()

chrysler town country    10
infiniti fx               8
infiniti qx56             8
mercedes benz slk         7
infiniti m                7
                         ..
saturn l                  1
buick terrazda            1
ford bronco sport         1
scion xd                  1
dodge nitro               1
Name: cleaned_name, Length: 291, dtype: int64