In [1]:
# We've seen a preview of how Pandas handles missing values using the None type and NumPy NaN values. Missing values are pretty
# common in data cleaning activities. And, missing values can be there for any number of reasons, and I just want to touch on a
# few here.

# For instance, if you are running a survey and a respondant didn't answer a question the missing value is actually an omission.
# This kind of missing data is called Missing at Random if there are other variables that might be used to predict the variable
# which is missing. In my work when I delivery surveys I often find that missing data, say the interest in being involved in a 
# follow up study, often has some correlation with another data field, like gender or ethnicity. If there is no relationship to
# other variables, then we call this data Missing Completely at Random (MCAR).

# These are just two examples of missing data, and there are many more. For instance, data might be missing because it wasn't
# collected, either by the process responsible for collecting that data, such as a researcher, or because it wouldn't make sense
# if it were collected. This last example is extremely common when you start joining DataFrames together from multiple sources,
# such as joining a list of people at a university with a list of offices in the university (students generally don't have
# offices).

# Let's look at some ways of handling missing data in pandas.

In [1]:
import pandas as pd
import numpy as np
list= [{'name':'Tuba','age': 23, 'CGPA': 3.8,'final mark':np.nan,'address':'Dhaka','study':'DIU'},
        {'name':'Plabon','age': 25, 'CGPA': 3.4,'final mark':28,'address':'Dhaka','study':'AUST'},
        {'name':'Plabi','age': 25, 'CGPA': 3.4,'final mark':29,'address':'khulna','study':'AUST'},
        {'name':'Maliha','age': 22, 'CGPA': 3.75,'final mark': np.nan, 'address':'Dhaka','study':'NSU'},  
        {'name':'Raima','age': 22, 'CGPA': 3.65,'final mark':28,'address':'Dhaka','study':'EWU'},
        {'name':'Ripa','age': 22, 'CGPA': 3.45,'final mark':np.nan,'address':'Dhaka','study':'DIU'}
        ]

df=pd.DataFrame(list)
df

Unnamed: 0,name,age,CGPA,final mark,address,study
0,Tuba,23,3.8,,Dhaka,DIU
1,Plabon,25,3.4,28.0,Dhaka,AUST
2,Plabi,25,3.4,29.0,khulna,AUST
3,Maliha,22,3.75,,Dhaka,NSU
4,Raima,22,3.65,28.0,Dhaka,EWU
5,Ripa,22,3.45,,Dhaka,DIU


In [2]:
# We can actually use the function .isnull() to create a boolean mask of the whole dataframe. This effectively
# broadcasts the isnull() function to every cell of data.

mask = df.isnull()
mask

Unnamed: 0,name,age,CGPA,final mark,address,study
0,False,False,False,True,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,True,False,False
4,False,False,False,False,False,False
5,False,False,False,True,False,False


In [3]:
# This can be useful for processing rows based on certain columns of data. Another useful operation is to be
# able to drop all of those rows which have any missing data, which can be done with the dropna() function.

df.dropna()

Unnamed: 0,name,age,CGPA,final mark,address,study
1,Plabon,25,3.4,28.0,Dhaka,AUST
2,Plabi,25,3.4,29.0,khulna,AUST
4,Raima,22,3.65,28.0,Dhaka,EWU


In [9]:
# Note how the rows indexed with 2, 3, 7, and 11 are now gone. One of the handy functions that Pandas has for
# working with missing values is the filling function, fillna(). This function takes a number or parameters.
# You could pass in a single value which is called a scalar value to change all of the missing data to one
# value. This isn't really applicable in this case, but it's a pretty common use case.

# So, if we wanted to fill all missing values with 0, we would use fillna

df.fillna(0,inplace = True)
df

Unnamed: 0,name,age,CGPA,final mark,address,study
0,Tuba,23,3.8,0.0,Dhaka,DIU
1,Plabon,25,3.4,28.0,Dhaka,AUST
2,Plabi,25,3.4,29.0,khulna,AUST
3,Maliha,22,3.75,0.0,Dhaka,NSU
4,Raima,22,3.65,28.0,Dhaka,EWU
5,Ripa,22,3.45,0.0,Dhaka,DIU


In [10]:
# Note that the inplace attribute causes pandas to fill the values inline and does not return a copy of the
# dataframe, but instead modifies the dataframe you have.

In [13]:
df.set_index("CGPA")
df.sort_index()
df

Unnamed: 0,name,age,CGPA,final mark,address,study
0,Tuba,23,3.8,0.0,Dhaka,DIU
1,Plabon,25,3.4,28.0,Dhaka,AUST
2,Plabi,25,3.4,29.0,khulna,AUST
3,Maliha,22,3.75,0.0,Dhaka,NSU
4,Raima,22,3.65,28.0,Dhaka,EWU
5,Ripa,22,3.45,0.0,Dhaka,DIU


In [22]:
list2= [{'name':'Tuba','age': 23, 'CGPA': 3.8,'final mark':np.nan,'address':'Dhaka','study':'DIU'},
        {'name':'Plabon','age': 25, 'CGPA': 3.4,'final mark':np.nan,'address':'Dhaka','study':'AUST'},
        {'name':'Plabi','age': 25, 'CGPA': 3.4,'final mark':np.nan,'address':'khulna','study':'AUST'}]

df2 = pd.DataFrame(list2)
df2.fillna(method = 'ffill')
df2

Unnamed: 0,name,age,CGPA,final mark,address,study
0,Tuba,23,3.8,,Dhaka,DIU
1,Plabon,25,3.4,,Dhaka,AUST
2,Plabi,25,3.4,,khulna,AUST


In [6]:
import pandas as pd
df_list = {'a':[1,2,1,4,5],
          'b':[1,5,3,7,3],
          'c':['a','b','c','d','e'],
          'd':['hello','hi','hi','hello','hello']}
new_df = pd.DataFrame(df_list)
new_df

Unnamed: 0,a,b,c,d
0,1,1,a,hello
1,2,5,b,hi
2,1,3,c,hi
3,4,7,d,hello
4,5,3,e,hello


In [3]:
# We can replace 1's with 100, let's try the value-to-value approach
new_df.replace(1, 100)

Unnamed: 0,a,b,c
0,100,100,a
1,2,5,b
2,100,3,c
3,4,7,d
4,5,3,e


In [5]:
# How about changing two values? Let's try the list approach For example, we want to change 1's to 100 and 3's
# to 300

new_df.replace([1, 3], [100, 300])

Unnamed: 0,a,b,c
0,100,100,a
1,2,5,b
2,100,300,c
3,4,7,d
4,5,300,e


In [7]:
# To replace using a regex we make the first parameter to replace the regex pattern we want to match, the
# second parameter the value we want to emit upon match, and then we pass in a third parameter "regex=True".

# Here's my solution, first matching any number of characters then ending in hello,then change the value into hi.

new_df.replace(to_replace = '.*hello$', value = 'hi', regex = True)

Unnamed: 0,a,b,c,d
0,1,1,a,hi
1,2,5,b,hi
2,1,3,c,hi
3,4,7,d,hi
4,5,3,e,hi
