In [1]:
# This notebook covers data cleaning: casting dtypes and handling missing values.



In [47]:
import pandas as pd
import numpy as np

In [48]:
# Sample records for the missing-value walkthrough. Missing entries appear
# in three forms: np.nan, None, and the placeholder strings "NA" / "Missing".
# Ages are stored as strings, not numbers.
data = dict(
    first=["Corey", "Jane", "John", "Chris", np.nan, None, "NA"],
    last=["Schafer", "Doe", "Doe", "Schafer", np.nan, np.nan, "Missing"],
    email=[
        "corey@gmail.com",
        "jane@gmail.com",
        "John@gmail.com",
        None,
        np.nan,
        "Anonymous@gmail.com",
        "NA",
    ],
    age=["33", "55", "63", "36", None, None, "Missing"],
)

In [49]:
# Build the working DataFrame; each key of `data` becomes a column.
df = pd.DataFrame(data=data)


In [50]:
# Show the frame as built, before any cleaning.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,corey@gmail.com,33
1,Jane,Doe,jane@gmail.com,55
2,John,Doe,John@gmail.com,63
3,Chris,Schafer,,36
4,,,,
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


In [51]:
# pandas provides methods to check whether a DataFrame contains NA (missing) values.

# It also lets you:
# - count how many NA values there are
# - drop rows with NA values
# - fill in the NA values



In [52]:
# Step 1: flag the missing cells. The result has the same shape as df, with
# True wherever the value is np.nan or None. The placeholder strings
# "NA" and "Missing" are ordinary strings and are therefore not flagged.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,False,False,False,False


In [53]:
# Number of null values in each column (each True counts as 1 when summed).
df.isnull().sum(axis=0)

first    2
last     2
email    2
age      2
dtype: int64

In [54]:
# Total number of null values in the whole frame:
# count them per column first, then add the per-column counts together.
nulls_per_column = df.isnull().sum()
nulls_per_column.sum()

8

In [55]:
# Drop every row that contains at least one missing value (None or np.nan).
# axis=0 and how="any" are the defaults; they are spelled out for clarity.
# The call returns a new frame; df itself is left unchanged.
df.dropna(axis=0, how="any")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,corey@gmail.com,33
1,Jane,Doe,jane@gmail.com,55
2,John,Doe,John@gmail.com,63
6,,Missing,,Missing


In [56]:
# With how="any", dropna removes a whole row as soon as a single value in
# that row is missing.
#
# In the frame above, however, a person can have partial data, for example
# a missing email or a missing first name.
#
# In that case only the rows in which every value is missing should be
# deleted, which is what how="all" does.
df.dropna(axis=0, how="all")

Unnamed: 0,first,last,email,age
0,Corey,Schafer,corey@gmail.com,33
1,Jane,Doe,jane@gmail.com,55
2,John,Doe,John@gmail.com,63
3,Chris,Schafer,,36
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


In [57]:
# So far axis has been 0 (rows). With axis=1 (columns), dropna removes
# columns instead. Every column here has at least one NaN value, so with
# how="any" the result keeps the index but has no columns left.
df.dropna(axis="columns", how="any")

0
1
2
3
4
5
6


In [58]:
# dropna can be restricted to specific columns through `subset`: only the
# listed columns are checked for missing values.
#
# Here, every row with a missing email is dropped.
df.dropna(axis="index", how="any", subset=["email"])

Unnamed: 0,first,last,email,age
0,Corey,Schafer,corey@gmail.com,33
1,Jane,Doe,jane@gmail.com,55
2,John,Doe,John@gmail.com,63
5,,,Anonymous@gmail.com,
6,,Missing,,Missing


In [59]:
# Turn the placeholder strings into real missing values so that isna,
# dropna and fillna treat them as missing. Modifies df in place.
placeholder_to_nan = dict.fromkeys(["NA", "Missing"], np.nan)
df.replace(placeholder_to_nan, inplace=True)

In [60]:
# Re-display df to check the result of the in-place replace.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,corey@gmail.com,33.0
1,Jane,Doe,jane@gmail.com,55.0
2,John,Doe,John@gmail.com,63.0
3,Chris,Schafer,,36.0
4,,,,
5,,,Anonymous@gmail.com,
6,,,,


In [61]:
# Remove the rows in which every column is missing. Modifies df in place.
df.dropna(axis=0, how="all", inplace=True)

In [62]:
# Re-display df after dropping the all-missing rows.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,corey@gmail.com,33.0
1,Jane,Doe,jane@gmail.com,55.0
2,John,Doe,John@gmail.com,63.0
3,Chris,Schafer,,36.0
5,,,Anonymous@gmail.com,


In [63]:
# Element-wise missing-value mask for the cleaned frame.
pd.isna(df)

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
5,True,True,False,True


In [64]:
# isnull is an alias of isna, so this produces the same mask.
pd.isnull(df)

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
5,True,True,False,True


In [65]:
 df.fillna(0, inplace = True)

In [66]:
# Re-display df after the fill.
df

Unnamed: 0,first,last,email,age
0,Corey,Schafer,corey@gmail.com,33
1,Jane,Doe,jane@gmail.com,55
2,John,Doe,John@gmail.com,63
3,Chris,Schafer,0,36
5,0,0,Anonymous@gmail.com,0


In [68]:
# Inspect the column dtypes. All four columns, including age, are still
# object at this point.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [76]:
# Cast age from object to a floating-point dtype so that numeric
# operations can be applied to it.
df["age"] = df["age"].astype(float)

In [77]:
# Confirm the cast by looking up age in the per-column dtypes.
df.dtypes["age"]

dtype('float64')

In [78]:
# Average age (skipna=True is the default, spelled out here).
# NOTE(review): an age that was filled with 0 by the earlier fillna(0) is
# not NaN any more, so it is counted as a real value and lowers the mean.
df["age"].mean(skipna=True)

37.4