Dataset: https://www.epa.gov/compliance-and-fuel-economy-data/data-cars-used-testing-fuel-economy

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# Load the two CSV exports analysed in this notebook (presumably downloaded
# from the EPA dataset link at the top). read_csv splits on commas by default.
alpha08 = pd.read_csv('all_alpha_08.csv')
alpha18 = pd.read_csv('all_alpha_18.csv')

In [2]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of alpha08.
alpha08.describe()

Unnamed: 0,Displ,Unadj Cmb MPG
count,2404.0,2205.0
mean,3.748918,23.916104
std,1.335785,6.36617
min,1.3,10.0184
25%,2.5,19.1139
50%,3.5,23.9213
75%,4.8,27.8693
max,8.4,65.7778


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of alpha18.
alpha18.describe()

Unnamed: 0,Displ,Cyl,Air Pollution Score,Greenhouse Gas Score
count,1609.0,1609.0,1611.0,1611.0
mean,3.055687,5.47918,3.958411,4.711359
std,1.344574,1.749121,1.824303,1.657429
min,1.2,3.0,1.0,1.0
25%,2.0,4.0,3.0,4.0
50%,3.0,6.0,3.0,5.0
75%,3.6,6.0,5.0,6.0
max,8.0,16.0,10.0,10.0


In [4]:
# Dimensions of alpha08 as (rows, columns): rows are samples, columns are
# features.
alpha08.shape

(2404, 18)

In [5]:
# Dimensions of alpha18 as (rows, columns).
alpha18.shape

(1611, 18)

In [6]:
# Count duplicated rows in alpha18. duplicated() flags every row that repeats
# an earlier one; the vectorized .sum() counts those flags without iterating
# over the Series element by element as the builtin sum() would.
alpha18.duplicated().sum()

0

In [7]:
# Count duplicated rows in alpha08. duplicated() flags every row that repeats
# an earlier one; the vectorized .sum() counts those flags without iterating
# over the Series element by element as the builtin sum() would.
alpha08.duplicated().sum()

25

In [8]:
# Column names, non-null counts and dtypes for alpha08.
alpha08.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2404 entries, 0 to 2403
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 2404 non-null   object 
 1   Displ                 2404 non-null   float64
 2   Cyl                   2205 non-null   object 
 3   Trans                 2205 non-null   object 
 4   Drive                 2311 non-null   object 
 5   Fuel                  2404 non-null   object 
 6   Sales Area            2404 non-null   object 
 7   Stnd                  2404 non-null   object 
 8   Underhood ID          2404 non-null   object 
 9   Veh Class             2404 non-null   object 
 10  Air Pollution Score   2404 non-null   object 
 11  FE Calc Appr          2205 non-null   object 
 12  City MPG              2205 non-null   object 
 13  Hwy MPG               2205 non-null   object 
 14  Cmb MPG               2205 non-null   object 
 15  Unadj Cmb MPG        

In [9]:
# Column names, non-null counts and dtypes for alpha18.
alpha18.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1611 entries, 0 to 1610
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 1611 non-null   object 
 1   Displ                 1609 non-null   float64
 2   Cyl                   1609 non-null   float64
 3   Trans                 1611 non-null   object 
 4   Drive                 1611 non-null   object 
 5   Fuel                  1611 non-null   object 
 6   Cert Region           1611 non-null   object 
 7   Stnd                  1611 non-null   object 
 8   Stnd Description      1611 non-null   object 
 9   Underhood ID          1611 non-null   object 
 10  Veh Class             1611 non-null   object 
 11  Air Pollution Score   1611 non-null   int64  
 12  City MPG              1611 non-null   object 
 13  Hwy MPG               1611 non-null   object 
 14  Cmb MPG               1611 non-null   object 
 15  Greenhouse Gas Score 

In [33]:
# Non-null count per column in alpha18; any column whose count is below the
# number of rows has missing values.
alpha18.notnull().sum()

Model                   1611
Displ                   1609
Cyl                     1609
Trans                   1611
Drive                   1611
Fuel                    1611
Cert Region             1611
Stnd                    1611
Stnd Description        1611
Underhood ID            1611
Veh Class               1611
Air Pollution Score     1611
City MPG                1611
Hwy MPG                 1611
Cmb MPG                 1611
Greenhouse Gas Score    1611
SmartWay                1611
Comb CO2                1611
dtype: int64

In [34]:
# Non-null count per column in alpha08; any column whose count is below the
# number of rows has missing values.
alpha08.notnull().sum()

Model                   2404
Displ                   2404
Cyl                     2205
Trans                   2205
Drive                   2311
Fuel                    2404
Sales Area              2404
Stnd                    2404
Underhood ID            2404
Veh Class               2404
Air Pollution Score     2404
FE Calc Appr            2205
City MPG                2205
Hwy MPG                 2205
Cmb MPG                 2205
Unadj Cmb MPG           2205
Greenhouse Gas Score    2205
SmartWay                2404
dtype: int64

In [12]:
# Number of missing (null) values per column in alpha08.
alpha08.isnull().sum()

Model                     0
Displ                     0
Cyl                     199
Trans                   199
Drive                    93
Fuel                      0
Sales Area                0
Stnd                      0
Underhood ID              0
Veh Class                 0
Air Pollution Score       0
FE Calc Appr            199
City MPG                199
Hwy MPG                 199
Cmb MPG                 199
Unadj Cmb MPG           199
Greenhouse Gas Score    199
SmartWay                  0
dtype: int64

In [13]:
# Number of missing (null) values per column in alpha18.
alpha18.isnull().sum()

Model                   0
Displ                   2
Cyl                     2
Trans                   0
Drive                   0
Fuel                    0
Cert Region             0
Stnd                    0
Stnd Description        0
Underhood ID            0
Veh Class               0
Air Pollution Score     0
City MPG                0
Hwy MPG                 0
Cmb MPG                 0
Greenhouse Gas Score    0
SmartWay                0
Comb CO2                0
dtype: int64

In [14]:
# Number of rows in alpha08 that repeat an earlier row (all columns compared).
alpha08.duplicated().sum()

25

In [15]:
# Number of rows in alpha18 that repeat an earlier row (all columns compared).
alpha18.duplicated().sum()

0

In [16]:
# dtype of every column in alpha08.
alpha08.dtypes

Model                    object
Displ                   float64
Cyl                      object
Trans                    object
Drive                    object
Fuel                     object
Sales Area               object
Stnd                     object
Underhood ID             object
Veh Class                object
Air Pollution Score      object
FE Calc Appr             object
City MPG                 object
Hwy MPG                  object
Cmb MPG                  object
Unadj Cmb MPG           float64
Greenhouse Gas Score     object
SmartWay                 object
dtype: object

In [17]:
# dtype of every column in alpha18.
alpha18.dtypes

Model                    object
Displ                   float64
Cyl                     float64
Trans                    object
Drive                    object
Fuel                     object
Cert Region              object
Stnd                     object
Stnd Description         object
Underhood ID             object
Veh Class                object
Air Pollution Score       int64
City MPG                 object
Hwy MPG                  object
Cmb MPG                  object
Greenhouse Gas Score      int64
SmartWay                 object
Comb CO2                 object
dtype: object

In [18]:
# Number of distinct non-null values in alpha08's SmartWay column.
alpha08["SmartWay"].nunique()

2

In [19]:
# Number of distinct non-null values in alpha18's SmartWay column.
alpha18["SmartWay"].nunique()

3

In [22]:
# Number of distinct non-null values in alpha08's "Sales Area" column.
alpha08["Sales Area"].nunique()

3

In [23]:
# Number of distinct non-null values in alpha18's "Cert Region" column.
alpha18["Cert Region"].nunique()

2

In [24]:
# Number of distinct non-null values in alpha08's Trans column.
alpha08["Trans"].nunique()

14

In [25]:
# Number of distinct non-null values in alpha18's Trans column.
alpha18["Trans"].nunique()

26

In [26]:
# Distinct values in alpha08's Trans column; unlike nunique(), unique() also
# lists NaN when the column has missing entries.
alpha08["Trans"].unique()

array(['Auto-S5', 'Man-6', 'Auto-S6', 'Auto-AV', 'Auto-S7', 'Auto-L4',
       'Auto-L6', 'Auto-4', 'Man-5', 'Auto-L5', nan, 'Auto-6', 'S8',
       'Auto-S4', 'Auto-L7'], dtype=object)

In [27]:
# Distinct values in alpha18's Trans column.
alpha18["Trans"].unique()

array(['SemiAuto-6', 'AMS-8', 'SemiAuto-9', 'AutoMan-6', 'Auto-8',
       'AMS-6', 'AMS-7', 'Man-6', 'SemiAuto-8', 'SemiAuto-7', 'Auto-6',
       'Auto-10', 'SemiAuto-10', 'Man-7', 'Auto-9', 'CVT', 'Auto-4',
       'AutoMan-7', 'SCV-7', 'Auto-1', 'SCV-10', 'Auto-7', 'Man-5',
       'SCV-8', 'SCV-6', 'SemiAuto-5'], dtype=object)

In [28]:
# Distinct values in alpha08's Fuel column.
alpha08["Fuel"].unique()

array(['Gasoline', 'ethanol/gas', 'ethanol', 'diesel', 'CNG'],
      dtype=object)

In [29]:
# Distinct values in alpha18's Fuel column.
alpha18["Fuel"].unique()

array(['Gasoline', 'Gasoline/Electricity', 'Diesel', 'Ethanol/Gas',
       'Electricity'], dtype=object)