# Fixing Null Values and Duplicates
Use `data_08.csv` and `data_18.csv`

In [1]:
# Load both datasets into DataFrames; the later cells operate on df_08 and df_18.
import pandas as pd

df_08 = pd.read_csv('data_08.csv')
df_18 = pd.read_csv('data_18.csv')

### Null Values

In [2]:
# Per-column tally of missing entries in df_08 (summing the null flags down each column).
df_08.isnull().sum(axis='index')

model                   0
displ                   0
cyl                     0
trans                   0
drive                   0
fuel                    0
veh_class               0
air_pollution_score     0
city_mpg                0
hwy_mpg                 0
cmb_mpg                 0
greenhouse_gas_score    0
smartway                0
dtype: int64

In [3]:
# Get the number of rows in df_08 with at least one null value.
# .any(axis=1) flags each row that contains a null; Series.sum() then counts the
# True flags in a vectorized way instead of iterating with Python's builtin sum().
df_08.isnull().any(axis=1).sum()

0

If any rows contain missing values, we'll probably just drop them. But let's take a look at those rows first, just in case. (In the output shown here the count above is 0, so the table below is empty.)

In [4]:
# Boolean-mask selection: show only the rows of df_08 that have a null in any column.
df_08[df_08.isnull().any(axis='columns')]

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway


In [5]:
# Per-column tally of missing entries in df_18 (summing the null flags down each column).
df_18.isnull().sum(axis='index')

model                   0
displ                   0
cyl                     0
trans                   0
drive                   0
fuel                    0
veh_class               0
air_pollution_score     0
city_mpg                0
hwy_mpg                 0
cmb_mpg                 0
greenhouse_gas_score    0
smartway                0
dtype: int64

In [6]:
# Get the number of rows in df_18 with at least one null value.
# .any(axis=1) flags each row that contains a null; Series.sum() then counts the
# True flags in a vectorized way instead of iterating with Python's builtin sum().
df_18.isnull().any(axis=1).sum()

0

In [7]:
# Boolean-mask selection: show only the rows of df_18 that have a null in any column.
df_18[df_18.isnull().any(axis='columns')]

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway


In [8]:
# Remove every row that has a null in any column, rebinding each name to the
# filtered frame.
df_08 = df_08.dropna(how='any')
df_18 = df_18.dropna(how='any')

In [9]:
# Sanity check on df_08: reduce the null flags per column, then across columns.
# Expected result after the drop above: False.
df_08.isnull().any().any()

False

In [10]:
# Sanity check on df_18: reduce the null flags per column, then across columns.
# Expected result after the drop above: False.
df_18.isnull().any().any()

False

In [11]:
# (rows, columns) of df_08 after the null-dropping step
df_08.shape

(986, 13)

In [12]:
# (rows, columns) of df_18 after the null-dropping step
df_18.shape

(794, 13)

### Duplicates

In [13]:
# Number of rows in df_08 that repeat an earlier row; the first occurrence of
# each repeated row is not counted.
df_08.duplicated(keep='first').sum()

0

In [14]:
# Number of rows in df_18 that repeat an earlier row; the first occurrence of
# each repeated row is not counted.
df_18.duplicated(keep='first').sum()

0

In [15]:
# Display the duplicated rows of df_08 via a boolean mask.
# keep='first' is the default: the first occurrence of each repeated row is
# left out of the view.
df_08[df_08.duplicated(keep='first')]

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway


In [16]:
# Display the duplicated rows of df_18 via a boolean mask; the first occurrence
# of each repeated row is left out of the view.
df_18[df_18.duplicated(keep='first')]

Unnamed: 0,model,displ,cyl,trans,drive,fuel,veh_class,air_pollution_score,city_mpg,hwy_mpg,cmb_mpg,greenhouse_gas_score,smartway


In [17]:
# Discard repeated rows, keeping the first occurrence of each, and rebind each
# name to the de-duplicated frame.
df_08 = df_08.drop_duplicates(keep='first')
df_18 = df_18.drop_duplicates(keep='first')

In [18]:
# Re-count duplicated rows in df_08; expected to be 0 after the drop above.
df_08.duplicated(keep='first').sum()

0

In [19]:
# Re-count duplicated rows in df_18; expected to be 0 after the drop above.
df_18.duplicated(keep='first').sum()

0

In [20]:
# Persist both cleaned frames as CSV, without writing the index column.
# NOTE(review): these are the same paths the load cell reads from, so the
# original input files are overwritten — confirm this is intended.
for frame, out_path in ((df_08, 'data_08.csv'), (df_18, 'data_18.csv')):
    frame.to_csv(out_path, index=False)