# D06 
Missing Value, Train & Test, 
In Pandas, missing data is represented by two values:
None: None is a Python singleton object that is often used for missing data in Python code.
NaN : NaN (an acronym for Not a Number), is a special floating-point value recognized by all systems that use the standard IEEE floating-point representation
Pandas treats None and NaN as essentially interchangeable for indicating missing or null values. To facilitate this convention, there are several useful functions for detecting, removing, and replacing null values in a Pandas DataFrame:
isnull(), notnull(), dropna(), fillna(), replace(), interpolate()

In [None]:
# Libraries
import pandas as pd
import numpy as np
from pydataset import data

In [None]:
# Sample marks for five students in three subjects; np.nan marks a missing score.
dict1 = {
    'English': [100, 90, np.nan, 95, 100],
    'Maths': [30, 45, 56, np.nan, np.nan],
    'Science': [np.nan, 40, 80, 98, np.nan],
}

# Build the DataFrame from the dict of columns, labelling each row with a student ID.
student_ids = ['S01', 'S02', 'S03', 'S04', 'S05']
df = pd.DataFrame(dict1, index=student_ids)
df

In [None]:
df.describe()

In [None]:
df.info()

## check missing values

In [None]:
df.isnull().any() # is there any missing value in Df

In [None]:
# using isnull() function  - T & F at each data point
df.isnull()

In [None]:
# Build the boolean missing-value mask once, then report the counts
# for the whole frame, per column and per row.
null_mask = df.isnull()
missing_per_column = null_mask.sum(axis=0)
missing_per_row = null_mask.sum(axis=1)
print("\n Missing in Full Data Frame :", missing_per_column.sum())
print("Missing in each column : ", missing_per_column)
print("\n Missing in each row : ", missing_per_row)

In [None]:
# Missing counts for a single column (Maths) and a single row (student S02).
maths_missing = df['Maths'].isnull().sum()
s02_missing = df.loc['S02'].isnull().sum()
print('Missing values in Maths column ', maths_missing)
print('Missing values in Student S02 ', s02_missing)

In [None]:
df

## check non-missing values

In [None]:
df.notnull().sum().sum()

## Fill Missing Values

In [None]:
df

In [None]:
# Forward fill: replace each missing value with the last valid value above it
# in the same column. DataFrame.ffill() is used because the `method` argument
# of fillna() is deprecated in pandas 2.1+.
df.ffill()
# No previous row for S01, hence its Science mark is still NaN

In [None]:
# Backward fill: replace each missing value with the next valid value below it
# in the same column. DataFrame.bfill() is used because the `method` argument
# of fillna() is deprecated in pandas 2.1+.
df.bfill()
# No next row for S05, hence its Maths & Science marks are still NaN

In [None]:
df['Science']

In [None]:
# filling a column null values using fillna() 
df["Science"].fillna(99, inplace = False) 
#inplace True will replace values permanently

In [None]:
# replace function
df["Science"].replace(to_replace = np.nan, value = -99) 

In [None]:
# replace all null values in DF with -100
df.replace(to_replace = np.nan, value = -100) 

## Interpolate
Interpolate the missing values using the linear method. Note that the linear method ignores the index and treats the values as equally spaced.

In [None]:
# to interpolate the missing values 
df.interpolate(method ='linear', limit_direction ='forward')

In [None]:
df
# How the forward linear interpolation above fills the gaps:
# English : S02(90) -> S04(95) spans 2 steps, so each step = 5/2 = 2.5
# S03 = 90 + 2.5 = 92.5
# other cases (trailing NaNs) = previous value; a leading NaN (Science, S01)
# has nothing before it and is left unfilled in the forward direction

In [None]:
# to interpolate the missing values 
df.interpolate(method ='linear', limit_direction ='backward')

## Drop Missing values

In [None]:
df.dropna()
# drop rows with at least one NaN value in their row

In [None]:
df.dropna(how = 'all')   # how = all, any
#drop only those rows which have all row data missing, nil here

In [None]:
df.isna().sum(axis=0)
# 5 rows, Maths & Science have 2 each missing out of 5: hence 3 data points non-missing 

In [None]:
df.dropna(axis=1, thresh = 4)
# thresh=4 keeps only the columns that have at least 4 non-missing values;
# columns with fewer are dropped - here Maths and Science (3 non-missing each)

In [None]:
df.notnull().sum(axis=1)

In [None]:
# NA on selected columns
df.dropna(subset =['Maths','Science'], thresh=1)
# keep rows with at least 1 non-missing value among Maths and Science;
# S05 is missing both, so it is the only row dropped

In [None]:
# Summarise how much of the frame is missing. Everything is derived from df
# itself, so the numbers stay correct if the data changes.
total_points = df.shape[0] * df.shape[1]
missing_points = int(df.isna().sum().sum())
# Guard against an empty frame, where the fraction is undefined.
missing_fraction = missing_points / total_points if total_points else float("nan")

# Reported as an actual percentage (0-100) to match the "%" label.
print("% of missing values", round(100 * missing_fraction, 2))
print("No of Data Points ", total_points)
print("No of Missing Values ", missing_points)
# Cross-check: the same quantity as a plain fraction (missing / total).
print(missing_fraction)

### Can we say, drop that column or row which have more than 40% values missing

In [None]:
df.shape
# 5 rows, 3 columns

In [None]:
print('Non Missing in Column ', np.ceil(.7 * 5), ' Non Missing in Row ', np.ceil(.6 *3))

In [None]:
df.dropna(axis=1, thresh = np.ceil(.7 * 5))
# keep only those columns which have at least ceil(0.7 * 5) = 4 non-missing
# values, i.e. at least 70% of the 5 rows are present

In [None]:
#keep only those columns at most 20% missing value or at least 80% non missing values
df.dropna(axis=1, thresh = (1.0 -.2)* df.shape[0])

In [None]:
df.dropna(axis=0, thresh = np.round(.7 *3,0))
# np.round(.7 * 3) = 2, so this keeps rows which have at least 2 non-missing values
# (only S05, with a single non-missing value, is dropped); all columns are shown
# NOTE(review): rounding 2.1 down to 2 admits rows that are only 2/3 complete,
# i.e. below 70% - use np.ceil if a strict 70% is intended

## additional library
msno

In [None]:
#pip install missingno

In [None]:
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df.info()

In [None]:
# Gives a bar chart of the missing values
msno.bar(df)

In [None]:
# Gives positional information of the missing values
msno.matrix(df)
#From the matrix plot, you can see where the missing values are located

In [None]:
# Gives a heatmap of how missing values are related
msno.heatmap(df)
#it can sometimes reveal interesting connections between missing values of different features.

In [None]:

plt.figure(figsize=(10,6))
sns.heatmap(df.isna().transpose(), cmap="YlGnBu", cbar_kws={'label': 'Missing Data'})
plt.show();
#plt.savefig("visualizing_missing_data_with_heatmap_Seaborn_Python.png", dpi=100)

In [None]:
# Stacked proportion of missing vs. present values for each column.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure() call would only leave an empty extra figure behind;
# the plot size is controlled through displot's height/aspect arguments.
sns.displot(
    data=df.isna().melt(value_name="missing"),
    y="variable",
    hue="missing",
    multiple="fill",
    aspect=1.25,
)
plt.show()

In [None]:
msno.dendrogram(df)