# Intro to Pandas

## Missing Data

In [1]:
# Basic Imports
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

### Missing Data on Series

In [2]:
# Missing data is a very common problem when analyzing datasets.
# Start with a small Series in which two of the five entries are missing (np.nan).
data = Series(
    data=['A', 'B', np.nan, 'D', np.nan],
)

In [3]:
# Display the Series; the np.nan entries are rendered as NaN
data

0      A
1      B
2    NaN
3      D
4    NaN
dtype: object

In [4]:
# Find the missing values: the result is a boolean Series that is True wherever an entry is NaN
data.isnull()

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [5]:
# Drop the NaN entries; the surviving values keep their original index labels (0, 1, 3)
data.dropna()

0    A
1    B
3    D
dtype: object

### Missing Data on DF

In [6]:
# In a DataFrame we need to be a little more careful!
# Every row is written out to the full five columns so that each missing cell
# is an explicit np.nan rather than an implicitly padded value.
dframe = DataFrame(
    data=[
        [1,      2,      3,      4,      5],
        [np.nan, 7,      8,      9,      np.nan],
        [np.nan, np.nan, 12,     13,     np.nan],
        [14,     np.nan, np.nan, np.nan, np.nan],
        [np.nan, np.nan, np.nan, np.nan, np.nan],
    ],
    columns=list('ABCDE'),
    dtype='float',
)

In [7]:
# Display the frame; missing entries show up as NaN
dframe

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,,7.0,8.0,9.0,
2,,,12.0,13.0,
3,14.0,,,,
4,,,,,


In [8]:
# dropna() with its defaults spelled out: work on rows (axis=0) and
# remove a row as soon as it contains any NaN (how='any').
clean_dframe = dframe.dropna(axis=0, how='any')

In [9]:
# Only row 0 survives: it is the only row without any NaN
clean_dframe

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0


**Note:** Every row that contained at least one NA value was dropped entirely!

### Drop rows missing all data

In [10]:
# Show the original df again: dropna() returned a new frame, so dframe still has all five rows
dframe

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,,7.0,8.0,9.0,
2,,,12.0,13.0,
3,14.0,,,,
4,,,,,


In [11]:
# Remove only those rows in which every single value is NaN;
# rows that still hold at least one data point are kept.
dframe.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,,7.0,8.0,9.0,
2,,,12.0,13.0,
3,14.0,,,,


### Drop columns with missing data

In [14]:
# Switch dropna from rows to columns by specifying the axis.
# Every column contains at least one NaN, so all columns are dropped
# and only the (empty) index remains.
dframe.dropna(axis='columns')

0
1
2
3
4


#### We can also use a threshold parameter on data points

In [17]:
# Show the original df again as the baseline for the threshold examples
dframe

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,,7.0,8.0,9.0,
2,,,12.0,13.0,
3,14.0,,,,
4,,,,,


#### Dropping based on data points

On rows

In [20]:
# Drop any rows that don't have at least 2 non-NaN data points
dframe.dropna(thresh=2)

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,,7.0,8.0,9.0,
2,,,12.0,13.0,


On columns

In [21]:
# Drop any column that doesn't have at least 3 non-NaN data points
dframe.dropna(axis='columns', thresh=3)

Unnamed: 0,C,D
0,3.0,4.0
1,8.0,9.0
2,12.0,13.0
3,,
4,,


### Filling NaN values

Single value

In [23]:
# Show the df; it still contains all of its NaN values
dframe

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,,7.0,8.0,9.0,
2,,,12.0,13.0,
3,14.0,,,,
4,,,,,


In [26]:
# Replace every NaN in the frame with the same scalar value
dframe.fillna(value=1)

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,1.0,7.0,8.0,9.0,1.0
2,1.0,1.0,12.0,13.0,1.0
3,14.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0


Different values

In [28]:
# A dict maps each column name to the value used to fill that column's NaNs
dframe.fillna(
    value={
        'A': 0,
        'B': 1,
        'C': 2,
        'D': 3,
        'E': 4,
    }
)

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,0.0,7.0,8.0,9.0,4.0
2,0.0,1.0,12.0,13.0,4.0
3,14.0,1.0,2.0,3.0,4.0
4,0.0,1.0,2.0,3.0,4.0


In [29]:
# Note that the original dframe still holds its NaNs: fillna() returned a new frame
dframe

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,,7.0,8.0,9.0,
2,,,12.0,13.0,
3,14.0,,,,
4,,,,,


In [30]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# ffill() is used instead of fillna(method='ffill') because the `method`
# argument of fillna is deprecated in recent pandas releases.
# The result is a new frame; dframe itself is not modified.
dframe.ffill()

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,1.0,7.0,8.0,9.0,5.0
2,1.0,7.0,12.0,13.0,5.0
3,14.0,7.0,12.0,13.0,5.0
4,14.0,7.0,12.0,13.0,5.0


In [31]:
# dframe is still unchanged after the forward fill above
dframe

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,,7.0,8.0,9.0,
2,,,12.0,13.0,
3,14.0,,,,
4,,,,,


We can fill the df with the mean of each column

In [33]:
# Per-column means (NaN entries are skipped), rounded to one decimal place
df_mean = dframe.mean().round(decimals=1)
df_mean

A    7.5
B    4.5
C    7.7
D    8.7
E    5.0
dtype: float64

In [36]:
# Fill each column's NaNs with that column's mean from df_mean.
# inplace=True modifies dframe itself, so unlike the earlier fillna calls
# the original NaN values are no longer available afterwards.
dframe.fillna(df_mean, inplace=True)

In [37]:
# Now let's see the dframe: every NaN has been replaced by its column's mean
dframe

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,5.0
1,7.5,7.0,8.0,9.0,5.0
2,7.5,4.5,12.0,13.0,5.0
3,14.0,4.5,7.7,8.7,5.0
4,7.5,4.5,7.7,8.7,5.0


# Exercises!