# Data Extraction


In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the recipe dataset (the CSV is read with Latin-1 encoding)
csv_path = "../input/recipedata/recipeData1.csv"
df = pd.read_csv(csv_path, encoding="latin-1")

# Basic Checks

In [None]:
# Banner texts that introduce each part of the summary printed below
shape_banner = "No.of Row's and Columns of data \n *********************** \n  "
head_banner = "\n\n Top 5 records of the data \n ************************"
tail_banner = "\n\n ************************** \n \n Bottom 5 records of the data \n\n ******************"

# One print call: dimensions, then the first five rows, then the last five rows
print(shape_banner, df.shape, head_banner, df.head(), tail_banner, df.tail())

# Identifying Missing values

#isnull(),isna,notna()

#syntax:

#pd.isnull(obj)

#df.isnull()


There are several ways to identify missing values.
1. If you already know which column has missing values: sort the data by that column, in either ascending or descending order. In both cases the NaN values end up at the bottom of the dataframe.
In the given dataframe, the "Style" column has missing values.

In [None]:
# Ascending sort on "Style"; the rows with a missing Style land at the bottom,
# so the last five rows expose them
df.sort_values("Style").tail()

In [None]:
# Descending sort on "Style"; the last five rows are inspected again to
# locate the missing values
df.sort_values("Style", ascending=False).tail()

2. If you don't know which columns have missing values:

    a> isnull(): Returns True for every cell that holds a missing value, otherwise False.
    
    b> isna(): Returns True for every cell that holds a missing value, otherwise False.
    
    c> notna(): Returns False for every cell that holds a missing value, otherwise True.

In [None]:
# Look at the first five records to spot missing values by eye
df.head()

In [None]:
# isnull() mask for the first five records: True marks a missing cell
print("************checking missing values with isnull()*************** \n \n")
df.head(5).isnull()

In [None]:

# isna() mask for the first five records: True marks a missing cell
print("************ checking missing values with isna() ******************* \n \n")
df.head(5).isna()


In [None]:
# notna() mask for the first five records: False marks a missing cell
print("************ checking missing values with notna() ******************* \n \n")
df.head(5).notna()


If you want to check:
1. How many columns contain missing values
2. The count of missing values per column
3. The percentage of missing values per column

In [None]:
# Summing the boolean mask down each column gives the number of missing
# values per column
df.isnull().sum(axis=0)

In [None]:
# Build a per-column summary: missing count, total row count, and the share
# of missing values as a percentage rounded to one decimal place
df_missing = (
    df.isnull()
    .sum()
    .to_frame("Missing_Data")
    .assign(Total_Records=len(df))
    .assign(
        Percentage=lambda tbl: (tbl["Missing_Data"] / tbl["Total_Records"] * 100).round(1)
    )
)
df_missing

# Missing Values Imputation/Handling/Replacing

There are various methods to replace/handle/impute missing values.

Only a few of the most common methods are explained here.

1.fillna(): 

    a> Replace missing values with a constant value
    b> Replace missing values with user-defined values
    c> Replace missing values only in the required columns or rows
    
fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None)

For further reference/exploration: interpolate() & asfreq()

In [None]:
# Scenario-1: Replace missing values with 0 (constant value)
# The scenarios work on a subset holding only the columns with missing values.
cols_with_missing = [
    'BoilGravity',
    'MashThickness',
    'PitchRate',
    'PrimaryTemp',
    'PrimingMethod',
    'PrimingAmount',
    'UserId',
    'Style',
]
df_miss = df[cols_with_missing]

# Confirm that the requested columns are present in the subset
df_miss.columns

In [None]:
# Preview the subset before any replacement so the NaN cells are visible
df_miss.head(5)

In [None]:
# Every missing cell becomes 0; df_miss itself is left untouched because the
# result is only displayed, not assigned
df_miss.fillna(value=0).head(n=10)

In [None]:
# Replacing missing values with 0, capped with `limit`.
# With a fill value (no fill method), `limit=5` means: in each column, fill at
# most the first 5 missing entries; any further NaN in that column is kept.
# The original call left `limit=` without a value, which is a syntax error.
df_miss.fillna(0, limit=5).head(10)

In [None]:
# Scenario-2: Replace missing values with preceding value (forward fill)
# Missing value columns: ['BoilGravity','MashThickness','PitchRate','PrimaryTemp','PrimingMethod','PrimingAmount','UserId','Style']
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in current pandas.
# A NaN that has no earlier non-missing value in its column stays NaN.
df_miss.ffill().head(10)

In [None]:
# Scenario-3: Replace missing values with succeeding value (backward fill)
# Missing value columns: ['BoilGravity','MashThickness','PitchRate','PrimaryTemp','PrimingMethod','PrimingAmount','UserId','Style']
# DataFrame.bfill() is used instead of fillna(method='bfill') because the
# `method` argument of fillna is deprecated in current pandas.
# A NaN that has no later non-missing value in its column stays NaN.
df_miss.bfill().head(10)

In [None]:
# Scenario-4: Replace missing value columns with BoilGravity=1.9,MashThickness=4,Style=other
# Missing value columns: ['BoilGravity','MashThickness','PitchRate','PrimaryTemp','PrimingMethod','PrimingAmount','UserId','Style']
# Per-column replacement values: columns not listed here keep their NaNs.
# The numeric replacements are numbers, not the strings "1.9"/"4", so that
# string values are not mixed into those columns.
# NOTE(review): assumes BoilGravity and MashThickness are numeric columns — confirm against df.dtypes.
missing_replacements = {"BoilGravity": 1.9, "MashThickness": 4, "Style": "other"}
df_miss.fillna(value=missing_replacements).head(10)

scikit-learn provides "SimpleImputer" to replace missing values with basic statistics (mean, median, most frequent value), user-defined values, or constant values.

In [None]:
#Scenario-5: Replace missing values with Mean value
# Note: the mean only applies to numeric columns, so the object-typed columns
# are excluded before imputing.
# SimpleImputer strategies used in this notebook: 'mean', 'median',
# 'most_frequent' and 'constant' (the last one together with fill_value).
from sklearn.impute import SimpleImputer

# Keep only the non-object (numeric) columns of the subset
df_num = df_miss.select_dtypes(exclude='object')

# Fit on the numeric frame and fill each NaN with its column mean;
# fit_transform returns a plain array, so the column labels are re-attached
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean_trans = pd.DataFrame(imp_mean.fit_transform(df_num), columns=df_num.columns)
imp_mean_trans.head()

In [None]:
#Scenario-6: Replace missing values with Median value
# Note: the median only applies to numeric columns, hence df_num is reused.
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

# fit_transform returns a plain array, so the column labels are re-attached
imp_median_trans = pd.DataFrame(imp_median.fit_transform(df_num), columns=df_num.columns)
imp_median_trans.head()

In [None]:
#Scenario-7: Replace missing values with Mode/Most Frequent value
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# fit_transform returns a plain array, so the column labels are re-attached
imp_mode_trans = pd.DataFrame(imp_mode.fit_transform(df_num), columns=df_num.columns)
imp_mode_trans.head()

In [None]:
#Scenario-8: Replace missing values with constant value
# Keep only the object-typed (character) columns of the subset
df_char = df_miss.select_dtypes(include=['object'])
df_char.head(5)

In [None]:
# Fill every missing cell of the character columns with the constant 'other'
imp_const = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='other')

# fit_transform returns a plain array, so the column labels are re-attached
imp_const_trans = pd.DataFrame(imp_const.fit_transform(df_char), columns=df_char.columns)
imp_const_trans.head()

If you like it, please upvote and leave feedback/comments.

Thank you.