In [30]:
import pandas as pd

# text cleaning
import re

### Load data

In [71]:
full_model_spec_sales_df = pd.read_csv('../Data/full_model_spec_sales_df.csv')

full_model_spec_sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4352 entries, 0 to 4351
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           4352 non-null   int64  
 1   Total_Sales          4352 non-null   float64
 2   Year                 4352 non-null   int64  
 3   cleaned_name         4352 non-null   object 
 4   curb_weight_lbs      2136 non-null   float64
 5   ground_clearance_in  1694 non-null   float64
 6   body_style           2386 non-null   object 
 7   engine_type          2379 non-null   object 
 8   speed_sec            3310 non-null   float64
 9   horsepower_hp        3584 non-null   float64
 10  length_in            3511 non-null   float64
 11  width_in             3568 non-null   float64
 12  msrp                 3699 non-null   float64
 13  pass_capacity        3410 non-null   float64
 14  doors                3410 non-null   float64
 15  wheelbase_in         3711 non-null   f

### Clean data

In [72]:
# drop columns with small numbers 
drop_cols = ['ground_clearance_in', 'cylinders', 'lug_vol_cuft']
full_model_spec_sales_df2 = full_model_spec_sales_df.copy()
full_model_spec_sales_df2.drop(drop_cols, axis=1, inplace=True)

#drop rows with less than 100 sales
full_model_spec_sales_df2 = full_model_spec_sales_df2[full_model_spec_sales_df2['Total_Sales'] > 100]

full_model_spec_sales_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4092 entries, 0 to 4351
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       4092 non-null   int64  
 1   Total_Sales      4092 non-null   float64
 2   Year             4092 non-null   int64  
 3   cleaned_name     4092 non-null   object 
 4   curb_weight_lbs  2125 non-null   float64
 5   body_style       2375 non-null   object 
 6   engine_type      2368 non-null   object 
 7   speed_sec        3151 non-null   float64
 8   horsepower_hp    3417 non-null   float64
 9   length_in        3349 non-null   float64
 10  width_in         3405 non-null   float64
 11  msrp             3535 non-null   float64
 12  pass_capacity    3262 non-null   float64
 13  doors            3262 non-null   float64
 14  wheelbase_in     3548 non-null   float64
 15  height_in        3546 non-null   float64
 16  tank_cap_gal     3534 non-null   float64
 17  vol_cubft     

In [77]:
def clean_categorical_column(value, remove_words):
    try:
        value = value.lower()
        value2 = re.sub(r'[^\w\s]', '', value)
        
        for word in remove_words:
            value2 = value2.replace(word, '')

        return value2.strip()
    
    except:
        return value

### Clean drive column

In [78]:
# replace strings
replace_dict = {'4': 'four ', '2': 'two', 'drive': '', 'awd': 'all wheel', '4 ': 'four ', '  ': ' ', '4 ': 'four ',
               'frontwheel': 'front wheel', 'allwheel': 'all wheel', "rearwheel": 'rear wheel', 'frontwheel': 'front wheel',
               'fourwheel': 'four wheel'}

#loop through strings to remove in df
for key, value in replace_dict.items():
    full_model_spec_sales_df2 = full_model_spec_sales_df2.replace(key, value, regex=True) 

remove_words = ['fulltime', 'automatic', 'quattro', 'instant', 'tractiontm', 'autotrac', 'allfour', 'versatrak',
               'multimode', 'parttime']
full_model_spec_sales_df2['drivetrain'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_categorical_column(x['drivetrain'], remove_words), axis=1)

full_model_spec_sales_df2['drivetrain'].value_counts()

front wheel    1626
rear wheel      919
all wheel       684
four wheel      364
two wheel         6
Name: drivetrain, dtype: int64

### Clean fuel type

In [98]:
def clean_fuel_type(value):
    try:
        value = value.lower()
        
        if 'premium' in value:
            return 'premium'
        elif 'gas/electric' in value or 'electric/gas' in value or 'hybrid' in value:
            return 'hybrid'
        elif 'gas/ethanol' in value:
            return 'gas/ethanol'
        elif 'electricity' in value or 'electric' in value:
            return 'electric'
        elif 'diesel' in value:
            return 'diesel'
        elif 'regular unleaded' in value or 'gasoline' in value or 'gas' in value:
            return 'regular'
        else:
            return 'regular'
    except:
        return value

# replace strings
# replace_dict = {'4': 'four ', '2': 'two', 'drive': '', 'awd': 'all wheel', '4 ': 'four ', '  ': ' ', '4 ': 'four ',
#                'frontwheel': 'front wheel', 'allwheel': 'all wheel', "rearwheel": 'rear wheel', 'frontwheel': 'front wheel',
#                'fourwheel': 'four wheel'}

# #loop through strings to remove in df
# for key, value in replace_dict.items():
#     full_model_spec_sales_df2 = full_model_spec_sales_df2.replace(key, value, regex=True) 

# remove_words = ['fulltime', 'automatic', 'quattro', 'instant', 'tractiontm', 'autotrac', 'allfour', 'versatrak',
#                'multimode', 'parttime']
remove_words = ['ity']
full_model_spec_sales_df2['fuel'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_categorical_column(x['fuel'], remove_words), axis=1)

full_model_spec_sales_df2['fuel'] = full_model_spec_sales_df2.apply(lambda x: 
                                                                          clean_fuel_type(x['fuel_typ']), axis=1)


full_model_spec_sales_df2['fuel'].value_counts()

regular        2476
premium         878
gas/ethanol      77
electric         51
diesel           48
hybrid           43
Name: fuel, dtype: int64

### Clean class EPA

In [100]:
full_model_spec_sales_df2['class_EPA'].value_counts()

Compact Cars                   260
Large Cars                     199
Subcompact Cars                194
Mid-Size Cars                  187
Midsize Cars                   163
                              ... 
twoWD sport Utility Vehicle      1
Mini-Compact Car                 1
AWD Sport Utility Vehicle        1
Full Size                        1
Compact Sedan                    1
Name: class_EPA, Length: 80, dtype: int64

In [None]:
full_model_spec_sales_df.to_csv('../Data/partial_model_spec_sales_df.csv')

null_cols = ['vol_cubft', 'mpg_comb', 'drivetrain', 'fuel_typ','trans', 'class_EPA', 'engine']

null_df = full_model_spec_sales_df[full_model_spec_sales_df[null_cols].isna().all(1)] #.groupby('cleaned_name')
null_df['cleaned_name'].value_counts()