# Handling missing values

In [1]:
import pandas as pd

In [17]:
# Read the landslide records from the CSV under ./data and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="./data/landslides.csv")
df.head(n=5)

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0


## Check and handle missing values

In [18]:
# Summarise each column: non-null count and dtype (a quick way to spot columns with gaps).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1693 entries, 0 to 1692
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1693 non-null   int64  
 1   date            1690 non-null   object 
 2   time            629 non-null    object 
 3   country_name    1693 non-null   object 
 4   state/province  1692 non-null   object 
 5   population      1693 non-null   int64  
 6   landslide_type  1692 non-null   object 
 7   trigger         1691 non-null   object 
 8   fatalities      1446 non-null   float64
dtypes: float64(1), int64(2), object(6)
memory usage: 119.2+ KB


In [19]:
# Count the missing entries in every column.
df.isnull().sum()

id                   0
date                 3
time              1064
country_name         0
state/province       1
population           0
landslide_type       1
trigger              2
fatalities         247
dtype: int64

### Drop rows with missing date values

In [20]:
# Keep only the rows that have a date. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells modify `df`
# itself and do not trigger pandas' SettingWithCopyWarning. The original row
# labels are kept (the index is not reset).
df = df.dropna(subset=['date']).copy()

In [21]:
# Display the frame after the date filter (pandas truncates long frames in the rendered output).
df

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0
...,...,...,...,...,...,...,...,...,...
1688,7535,12/7/15,,United States,North Carolina,1646,Rockfall,,0.0
1689,7537,2/22/16,0:00,United States,West Virginia,51400,Mudslide,Unknown,0.0
1690,7539,2/23/16,,United States,West Virginia,2406,Landslide,Rain,0.0
1691,7540,2/26/16,21:06,United States,West Virginia,1048,Rockfall,Unknown,0.0


In [22]:
# Tally how often each distinct value of the time column occurs (most frequent first).
df.loc[:, 'time'].value_counts()

Night            97
Morning          87
Afternoon        58
Early morning    36
3:00:00          12
                 ..
1:13              1
9:40:00           1
11:50:00          1
                  1
21:06             1
Name: time, Length: 159, dtype: int64

In [28]:
# Count the missing values left in the time column.
# NOTE(review): this cell's execution count (28) is higher than that of the
# fillna cell below (26), so the displayed 0 looks like it was produced after
# the fill — re-run the notebook top to bottom to see the pre-fill count.
df['time'].isna().sum()

0

In [26]:
# Replace missing times with the placeholder 'Not Known'. Besides real NaN
# values, the column also holds at least one blank string (visible as the
# unlabeled row in the value_counts output), so whitespace-only entries are
# treated as missing as well. NaN compares unequal to '', so genuine NaN rows
# are left for fillna to handle.
blank_time = df['time'].str.strip().eq('')
df['time'] = df['time'].mask(blank_time).fillna('Not Known')

In [27]:
# Inspect the time column after filling the gaps.
df['time']

0           Night
1       Not Known
2       Not Known
3       Not Known
4       Not Known
          ...    
1688    Not Known
1689         0:00
1690    Not Known
1691        21:06
1692         8:00
Name: time, Length: 1690, dtype: object

In [29]:
# Average of the recorded fatalities; pandas skips NaN entries by default when averaging.
mean = df['fatalities'].mean(skipna=True)

In [30]:
# Show the computed average.
mean

1.4622314622314623

In [31]:
# Fill the missing fatality counts with the column average computed above.
# NOTE(review): the average is fractional, which is unusual for a count —
# confirm mean imputation is what is wanted here (median is a common alternative).
df['fatalities'] = df['fatalities'].fillna(value=mean)

In [32]:
# Verify that no missing fatality values remain (expected result: 0).
df['fatalities'].isnull().sum()

0