## Identifying Missing Data

In [3]:

import pandas as pd
import numpy as np
technologies = {
    'Courses':["Spark","PySpark","Hadoop","Python","pandas",np.nan],
    'Fee' :[20000,25000,26000,23093,24000,np.nan],
    'Duration':['30day','40days','35days','45days',np.nan,np.nan],
    'Discount':[1000,np.nan,1200,2500,np.NaN,np.nan],
    '':[np.nan,np.nan,np.nan,np.nan,np.nan,np.nan]
              }
index_labels=['r1','r2','r3','r4','r5','']
df = pd.DataFrame(technologies,index=index_labels)
print(df)

    Courses      Fee Duration  Discount    
r1    Spark  20000.0    30day    1000.0 NaN
r2  PySpark  25000.0   40days       NaN NaN
r3   Hadoop  26000.0   35days    1200.0 NaN
r4   Python  23093.0   45days    2500.0 NaN
r5   pandas  24000.0      NaN       NaN NaN
        NaN      NaN      NaN       NaN NaN


In [8]:
# Identifying missing data
# Calculating missing values in every column
print("Column wise missing values: ",df.isna().sum())

# Calculating total missing values in data
total = df.isna().sum().sum()
print("Total number of missings in complete data: ",total)

Column wise missing values:  Courses     1
Fee         1
Duration    2
Discount    3
            6
dtype: int64
Total number of missings in complete data:  13


## pandas dropna() Syntax


#### Syntax:

> DataFrame.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
    
- **value**: Takes either scalar, dict, Series, or DataFrame but not list.
method – Takes one of these values {‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}. Default None.
- **axis**: 0 or ‘index’, 1 or ‘columns’. Used to specifiy axis to fill the values.
- **inplace**: Default False. When used True, it updates existing DataFrame object.
- **limit**: Specify how many fills should happen. This is the maximum number of consecutive NaN values replaced with specified value.
- **downcast**: It takes a dict of key-value pair that specifies data type to downcast . Like Float64 to int64, date to string e.t.c    

## Drop Rows with All NaN Values

In [20]:

# Drop rows that has all Nan Values
df1 = df.dropna(how='all')
print(df1)

    Courses      Fee Duration  Discount    
r1    Spark  20000.0    30day    1000.0 NaN
r2  PySpark  25000.0   40days       NaN NaN
r3   Hadoop  26000.0   35days    1200.0 NaN
r4   Python  23093.0   45days    2500.0 NaN
r5   pandas  24000.0      NaN       NaN NaN


## Drop Columns with All NaN Values

In [21]:

# Drop columns that has all Nan Values
df = df.dropna(how='all',axis=1)
print(df)


    Courses      Fee Duration  Discount
r1    Spark  20000.0    30day    1000.0
r2  PySpark  25000.0   40days       NaN
r3   Hadoop  26000.0   35days    1200.0
r4   Python  23093.0   45days    2500.0
r5   pandas  24000.0      NaN       NaN
        NaN      NaN      NaN       NaN


## Drop Rows & Columns that Contains NaN

In [22]:
# Drop rows that contains nan values
df2=df.dropna()
print(df2)


   Courses      Fee Duration  Discount
r1   Spark  20000.0    30day    1000.0
r3  Hadoop  26000.0   35days    1200.0
r4  Python  23093.0   45days    2500.0


In [5]:
# Drop columns that contains nan values
df2=df.dropna()
print(df2)


   Courses      Fee Duration Discount
r1   Spark  20000.0    30day     1000
r3  Hadoop  26000.0   35days     1200
r4  Python  23093.0   45days     2500


## Execute pandas dropna() on Specific Selected Columns

In [6]:

# Drop rows that has NaN values on selected columns
df2=df.dropna(subset=['Courses','Duration'])
print(df2)


    Courses      Fee Duration Discount
r1    Spark  20000.0    30day     1000
r2  PySpark  25000.0   40days      NaN
r3   Hadoop  26000.0   35days     1200
r4   Python  23093.0   45days     2500


## Drop NaN Values with Threshold

In [23]:
import pandas as pd
import numpy as np
technologies = {
    'Courses':["Spark","PySpark","Hadoop","Python","pandas",np.nan],
    'Fee' :[20000,25000,26000,23093,24000,np.nan],
    'Duration':['30day','40days','35days','45days',np.nan,np.nan],
    'Discount':[1000,np.nan,1200,2500,np.NaN,np.nan],
    '':[np.nan,np.nan,np.nan,np.nan,np.nan,np.nan]
              }
index_labels=['r1','r2','r3','r4','r5','']
df = pd.DataFrame(technologies,index=index_labels)
print(df)


    Courses      Fee Duration  Discount    
r1    Spark  20000.0    30day    1000.0 NaN
r2  PySpark  25000.0   40days       NaN NaN
r3   Hadoop  26000.0   35days    1200.0 NaN
r4   Python  23093.0   45days    2500.0 NaN
r5   pandas  24000.0      NaN       NaN NaN
        NaN      NaN      NaN       NaN NaN


In [24]:
# With threshold, 
# Keep only the rows with at least 2 non-NA values.
df2=df.dropna(thresh=2)
df2

Unnamed: 0,Courses,Fee,Duration,Discount,Unnamed: 5
r1,Spark,20000.0,30day,1000.0,
r2,PySpark,25000.0,40days,,
r3,Hadoop,26000.0,35days,1200.0,
r4,Python,23093.0,45days,2500.0,
r5,pandas,24000.0,,,


## Filling Missing Data

### pandas.DataFrame.fillna() Syntax
    
#### Syntax:
> DataFrame.fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)
    
- **value**: Takes either scalar, dict, Series, or DataFrame but not list.
- **method**: Takes one of these values {‘backfill’, ‘bfill’, ‘pad’, ‘ffill’, None}. Default None.
- **axis**: 0 or ‘index’, 1 or ‘columns’. Used to specifiy axis to fill the values.
- **inplace**: Default False. When used True, it updates existing DataFrame object.
- **limit**: Specify how many fills should happen. This is the maximum number of consecutive NaN values replaced with specified value.
- **downcast**: It takes a dict of key-value pair that specifies data type to downcast . Like Float64 to int64, date to string e.t.c    

In [31]:

# Create DataFrame
import pandas as pd
import numpy as np
df = pd.DataFrame(({
     'Courses':["Spark",'Java',"Scala",'Python'],
     'Fee' :[20000,np.nan,26000,24000],
     'Duration':['30days','40days','NA','40days'],
     'Discount':[1000,np.nan,2500,None]
               }))
print(df)


  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java      NaN   40days       NaN
2   Scala  26000.0       NA    2500.0
3  Python  24000.0   40days       NaN


## pandas fillna NaN with None Value

In [19]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0],
                    [3, 4, np.nan, 1],
                    [np.nan, np.nan, np.nan, np.nan],
                    [np.nan, 3, np.nan, 4]],
                   columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,
3,,3.0,,4.0


In [20]:
# fillna to replace all NaN
df1=df.fillna('None')
print(df1)


      A     B     C     D
0  None   2.0  None   0.0
1   3.0   4.0  None   1.0
2  None  None  None  None
3  None   3.0  None   4.0


In [21]:
print(df1.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       4 non-null      object
 1   B       4 non-null      object
 2   C       4 non-null      object
 3   D       4 non-null      object
dtypes: object(4)
memory usage: 256.0+ bytes
None


In [22]:
# fillna to replace all NaN
df2=df.fillna(0)
print(df2)
print()
print(df2.info())

     A    B    C    D
0  0.0  2.0  0.0  0.0
1  3.0  4.0  0.0  1.0
2  0.0  0.0  0.0  0.0
3  0.0  3.0  0.0  4.0

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      float64
 1   B       4 non-null      float64
 2   C       4 non-null      float64
 3   D       4 non-null      float64
dtypes: float64(4)
memory usage: 256.0 bytes
None


## pandas fillna on One Column

In [24]:
# Fill missing in one column only

df['B'] = df['B'].fillna(0)
print(df)

     A    B   C    D
0  NaN  2.0 NaN  0.0
1  3.0  4.0 NaN  1.0
2  NaN  0.0 NaN  NaN
3  NaN  3.0 NaN  4.0


## fillna on Multiple Columns

In [30]:

# fillna() on multiple columns
df[['C','D']] = df[['C','D']].fillna(0)
df

Unnamed: 0,A,B,C,D
0,,2.0,0.0,0.0
1,3.0,4.0,0.0,1.0
2,,,0.0,0.0
3,,3.0,0.0,4.0


In [33]:
# fillna() on multiple columns with multiple values
df2 =  df.fillna(value={'Discount':'0','Fee':10000})
print(df2)


  Courses      Fee Duration Discount
0   Spark  20000.0   30days   1000.0
1    Java  10000.0   40days        0
2   Scala  26000.0       NA   2500.0
3  Python  24000.0   40days        0


## Fill with limit param

In [34]:
# Create DataFrame
import pandas as pd
import numpy as np
df = pd.DataFrame(({
     'Courses':["Spark",'Java',"Scala",'Python'],
     'Fee' :[20000,np.nan,26000,24000],
     'Duration':['30days','40days','NA','40days'],
     'Discount':[1000,np.nan,2500,None]
               }))
print(df)


  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java      NaN   40days       NaN
2   Scala  26000.0       NA    2500.0
3  Python  24000.0   40days       NaN


In [37]:

# fill with limit
df2=df.fillna(value={'Discount':0,'Fee':0},limit=len(df2))
print(df2)


  Courses      Fee Duration  Discount
0   Spark  20000.0   30days    1000.0
1    Java      0.0   40days       0.0
2   Scala  26000.0       NA    2500.0
3  Python  24000.0   40days       0.0


In [38]:
# Create DataFrame
import pandas as pd
import numpy as np
df = pd.DataFrame(({
     'Courses':["Spark",'Java',"Scala",'Python'],
     'Fee' :[20000,np.nan,26000,24000]
     
               }))
print(df)


  Courses      Fee
0   Spark  20000.0
1    Java      NaN
2   Scala  26000.0
3  Python  24000.0


In [39]:
len(df)

4