In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List every file found under the Kaggle input directory, one full path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


## Table of content
- A. [Prepare data with Null values for the example](#section-a)
- B. [Methods to handle the missing values](#section-b)
    - 1. [Drop the data](#section-drop)
    - 2. [Fill the places with any of 5 Ms (Mean/Median/Mode/Max/Min)](#section-5m)
    - 3. [Fill the place with some constant value](#section-constant)
    - 4. [Predict the missing values](#section-predict)
        - 4.1.[Model-based strategy](#section-model-based)
        - 4.2.[Progressive model-based strategy](#section-progressive-model-based)
    - 5. [Use models which support missing values](#section-model)

<a id="section-a"></a>
# A. Prepare data with Null values for the example


In [None]:
# Load the training CSV; its first column is used as the row index.
TRAIN_CSV = "../input/tabular-playground-series-sep-2021/train.csv"
data_ = pd.read_csv(TRAIN_CSV, index_col=0)

In [None]:
# keeping just 50 rows (and 20 columns) for my example.
def create_data(source=None, n_rows=50, col_start=30, col_stop=50):
    """Return a small, independent sample of the data for the examples.

    Args:
        source: DataFrame to sample from. Defaults to the notebook-level
            ``data_`` frame.
        n_rows: Number of leading rows to keep (default 50).
        col_start: Position of the first column to keep (default 30).
        col_stop: Position one past the last column to keep (default 50).

    Returns:
        A copy of ``source.iloc[0:n_rows, col_start:col_stop]``. Because it is
        a copy, adding or changing columns on the result neither raises
        SettingWithCopyWarning nor touches ``source``.
    """
    if source is None:
        source = data_
    return source.iloc[0:n_rows, col_start:col_stop].copy()
# Build the working sample, report its (rows, columns) shape and preview it.
data = create_data()
print(f"Shape of the data------>{data.shape}")
data.head()

In [None]:
# Count the missing values of the sample, per column and per row.
# The sample is a slice of feature columns (see create_data), so there is no
# trailing target column to skip: every column is counted. The helper column
# added at the end of this cell is excluded so that re-running the cell gives
# the same result.
features = [col for col in data.columns if col != 'no_of_missing_data']

# missing values per column
colum_missing = data[features].isnull().sum()
# missing values per row (how many features are empty in that particular row)
row_missing = data[features].isnull().sum(axis=1)

# store the per-row count in the dataframe as a new column
data['no_of_missing_data'] = row_missing

In [None]:
# Show the number of missing values found in each column.
colum_missing

In [None]:
# Total number of empty cells = sum of the per-column missing counts
# (the original printed the number of rows here by mistake).
total_missing = int(colum_missing.sum())
print(f"Total number of missing values in training dataset---->{total_missing}")

# compare the rows that have at least one missing value to the whole data
total_rows = data.shape[0]
no_of_missing_rows = (data['no_of_missing_data'] != 0).sum()
print("\n{0:{fill}{align}80}\n".format(" Data Summary " , fill = "=", align = "^"))
print(
    f"Total rows -----------------------> {total_rows}\n"
    f"Number of rows has missing data---> {no_of_missing_rows}\n"
    f"{'-'*50}\n"
    f"Number of rows has full data--------> {total_rows - no_of_missing_rows}"
)


<a id="section-b"></a>
# B. Methods to handle the missing values


<a id="section-drop"></a>
## 1. Drop the data

In this method, we simply delete the rows or features/columns that contain null values. We delete a row when most of its values are missing (say 70-75%); the same goes for columns. This approach is only preferable when we have enough samples in the dataset. We can also delete a feature/column when it has low importance for the prediction. One has to make sure that no bias is introduced once the data has been removed.

Built-in functions:
- dropna() --> https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html

Pros: 
1. Removing this unwanted data sometimes makes our model more accurate.
2. Deleting a column with low importance is better, since there is no use in keeping a column full of null values and no use in spending time handling it.

Cons:
1. Loss of information.
2. Reduced prediction accuracy --> when we have a large number of missing values.

In [None]:
# Example: drop every row that contains at least one null value.
# dropna() returns a new frame; using its result avoids calling an in-place
# method on a slice of the original data (which raises SettingWithCopyWarning).
# Use how='all' instead to drop only the rows whose values are all null.
row_drop = create_data().dropna(how='any')
row_drop.isnull().sum()

In [None]:
# Drop a column --> here f37 has 3 missing values, so we can delete it if we want to.
data = create_data()
# Drop from the 50-row sample (`data`), not from the full training table (`data_`);
# otherwise `data` would silently become the whole dataset for the later cells.
data = data.drop('f37', axis = 1)
data.head()

<a id="section-5m"></a>
## 2. Fill the places with any of 5 Ms (Mean/Median/Mode/Max/Min)

In this method, we replace the null values with an approximation (average (mean), median, mode, min, or max). This method can be used with numerical columns. Even though this is only an approximation of the missing value, it is better than deleting the rows and columns. The mean, median, and mode are statistical approaches to handling missing values.

Built-in functions:
- fillna() --> https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html ( with different methods)
- SimpleImputer--> https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

Pros:
1. There is no loss of information.
2. Better approach with small dataset.

Cons:
1. Imputing approximations adds variance and bias.

### Example with SimpleImputer() ---> for mean / median / mode
Change the strategy to ----> mean / median / most_frequent (mode) ---------> median is the most frequently used strategy

In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean")

data_before = data.copy()
#print(data_before.isnull().sum())
# fit_transform returns a bare NumPy array; re-attach the original row index and
# column names so the imputed frame can be compared with `data_before`.
# NOTE: assumes no column is entirely NaN -- SimpleImputer drops such columns
# for the mean/median/most_frequent strategies.
data_after = pd.DataFrame(
    imputer.fit_transform(data_before),
    index = data_before.index,
    columns = data_before.columns,
)
#print(data_after.isnull().sum())

### Example with replace() ---> for max / min

In [None]:
# Preview the first five rows before any replacement.
data.head(5)

In [None]:
# Example: replace each NaN with the maximum of its own column.
# data.max() gives one maximum per column and fillna() applies it column by column.
# (The np.NaN alias was removed in NumPy 2.0, so it is avoided here.)
data_max = data.fillna(data.max())
data_max.head(5)

Above we can see that (4, f44) has a NaN value in the original data. After the replacement, that value is filled with the max() of that particular column.<br>
Replace max() with min() to use the min function instead.

<a id="section-constant"></a>
## 3. Fill the place with some constant value

In this method, we put a fixed constant value in place of the null values. For example, a gender column may have only two categories, a flower classification task may have 4-5 classes, etc. Here we can add another class/category named 'Unknown'; this adds more information to the dataset and at the same time prevents information loss. In numerical columns, we can pick a constant value to mark these places as null values. We can use -9999 or any number that seems right to you. <br>
This can be a problem for categorical values, since they need to be converted into numerical form. We use an encoding technique such as "one-hot encoding" for this, and adding one more class increases the number of columns.

Built-in functions:
- SimpleImputer--> https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html

Pros:
1. no loss of information here.
2. We have more control over the new class/feature --> we can delete the data belonging to this class or do whatever we want.

Cons:
1. Problems in encoding, which may reduce our accuracy.

In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the sentinel constant -9999.
imputer = SimpleImputer(missing_values = np.nan, strategy = "constant", fill_value = -9999)

data_before = data.copy()
#print(data_before.isnull().sum())
# fit_transform returns a bare NumPy array; re-attach the original row index and
# column names so the imputed frame can be compared with `data_before`.
data_after = pd.DataFrame(
    imputer.fit_transform(data_before),
    index = data_before.index,
    columns = data_before.columns,
)
#print(data_after.isnull().sum())

In [None]:
# Example: replace every NaN with the constant -9999 using plain pandas.
# (The np.NaN alias was removed in NumPy 2.0, so fillna() is used instead.)
# Note: the name `data_max` is kept from the max example; here it holds the
# constant-filled data.
data_max = data.fillna(-9999)
data_max.head(5)


<a id="section-predict"></a>
## 4. Predict the missing values

In this method, we predict/guess the value of a missing/null feature with the help of the other non-null features. It is a more effective method than the other handling methods and may result in better accuracy. The difficulty lies in finding the right prediction model: we need to try several algorithms and pick the one with the best accuracy. It is a good idea to try different algorithms instead of relying on a single one.

### Two ways:
- 1. Model-based strategy 
- 2. Progressive model-based strategy

#### Model-based strategy:
Let us fix one feature that has null values as the target feature train_y. Split the rows into those where this feature is null and those where it is not. The rows with a non-null value make up the training data and the rows with a null value make up the testing data. After this split, separate the training data into train_x and train_y and train the model with them. Using this model we can predict the missing values of the target feature. See the example to get a clear understanding.
#### Progressive model-based strategy:
Same as above, but after we find the missing values in a given feature, we consider the feature as a predictor for predicting the missing values of the next feature.

Models:
1. linear regression.
2. KNN
3. Logistic regression.

<a id="section-model-based"></a>
### Model-based Imputation strategy

In [None]:
from sklearn.impute import SimpleImputer

# Constant-fill imputer: every NaN it sees is replaced by the sentinel -9999.
imputer = SimpleImputer(missing_values = np.nan, strategy = "constant", fill_value = -9999)

# need some features without null values, so build such a dataset from a slice of the data
def generate_data_predict_section(data, imputer, n_imputed=14):
    """Build the demo frame for the model-based imputation example.

    Takes rows 11000-11999 and columns 30-44 (by position) of ``data`` and
    fills the missing values of the first ``n_imputed`` of those columns with
    ``imputer``; the remaining columns keep their NaNs.

    Args:
        data: Source DataFrame; it is not modified.
        imputer: Object exposing ``fit_transform`` (e.g. a SimpleImputer). It
            must return one output column per input column.
        n_imputed: How many leading columns of the slice to impute (default 14).

    Returns:
        A new DataFrame holding the slice, with its first ``n_imputed`` columns
        imputed.
    """
    # Copy so the assignment below never writes into (a view of) the caller's frame.
    section = data.iloc[11000:12000, 30:45].copy()
    # Assign the raw array: wrapping it in a DataFrame would give it a fresh
    # 0..n index and integer column labels unrelated to the slice's own labels.
    section.iloc[:, 0:n_imputed] = imputer.fit_transform(section.iloc[:, 0:n_imputed])
    return section
# Build the example frame from the full training data using the constant-fill imputer.
data_predict_section  = generate_data_predict_section(data_,imputer)

In [None]:
# Count the missing values left in each column of the prepared frame.
data_predict_section.isnull().sum()
# This is the data used below: 14 features without nulls and 1 feature that still has nulls.

In [None]:
from sklearn.linear_model import LinearRegression

# Model-based strategy: every feature that has nulls is predicted, on its own,
# from the features that have no nulls (linear regression unless another
# estimator is supplied).
def model_based_imputation(data, estimator=None):
    """Fill each column that contains nulls with predictions from the complete columns.

    For every column with missing values, a regressor is trained on the rows
    where that column is observed, using only the columns that have no nulls
    at all as predictors. It is then used to predict the rows where the column
    is missing.

    Args:
        data: DataFrame to impute. It is modified in place.
        estimator: Optional regressor exposing ``fit(X, y)`` (returning the
            fitted model) and ``predict(X)``. Defaults to a fresh
            ``LinearRegression()``.

    Returns:
        The same ``data`` object, with the missing values filled in.
    """
    if estimator is None:
        estimator = LinearRegression()

    # columns that contain at least one null / columns that are complete
    feature_with_null = data.columns[data.isnull().any()].tolist()
    feature_without_null = [col for col in data.columns if col not in feature_with_null]

    # handle each null column independently
    for col in feature_with_null:
        missing = data[col].isnull()
        # Train on every row where *this* column is observed. The predictors
        # are complete by construction, so nulls in other columns do not matter.
        train_x = data.loc[~missing, feature_without_null]
        train_y = data.loc[~missing, col]
        test_x = data.loc[missing, feature_without_null]

        model = estimator.fit(train_x, train_y)
        # Write the predictions into this column only, so observed values of
        # the other incomplete columns are never overwritten.
        data.loc[missing, col] = model.predict(test_x)

    return data

# Run the model-based imputation (it fills data_predict_section in place and
# returns it), then show how many missing values remain per column.
data_after = model_based_imputation(data_predict_section)
data_after.isnull().sum()

<a id = "section-progressive-model-based"></a>
### Progressive model-based imputation strategy

In [None]:
from sklearn.impute import SimpleImputer

# Constant-fill imputer: every NaN it sees is replaced by the sentinel -9999.
imputer = SimpleImputer(missing_values = np.nan, strategy = "constant", fill_value = -9999)

# need some features without null values, so build such a dataset from a slice of the data
def generate_data_predict_section(data, imputer, n_imputed=10):
    """Build the demo frame for the progressive model-based imputation example.

    Takes rows 11000-11999 and columns 30-44 (by position) of ``data`` and
    fills the missing values of the first ``n_imputed`` of those columns with
    ``imputer``; the remaining columns keep their NaNs.

    Args:
        data: Source DataFrame; it is not modified.
        imputer: Object exposing ``fit_transform`` (e.g. a SimpleImputer). It
            must return one output column per input column.
        n_imputed: How many leading columns of the slice to impute (default 10).

    Returns:
        A new DataFrame holding the slice, with its first ``n_imputed`` columns
        imputed.
    """
    # Copy so the assignment below never writes into (a view of) the caller's frame.
    section = data.iloc[11000:12000, 30:45].copy()
    # Assign the raw array: wrapping it in a DataFrame would give it a fresh
    # 0..n index and integer column labels unrelated to the slice's own labels.
    section.iloc[:, 0:n_imputed] = imputer.fit_transform(section.iloc[:, 0:n_imputed])
    return section
# Build the example frame from the full training data using the constant-fill imputer.
data_predict_section  = generate_data_predict_section(data_,imputer)

In [None]:
# Count the missing values left in each column of the prepared frame.
data_predict_section.isnull().sum()
# This is the data used below: 10 features without nulls and 5 features that still have nulls.

In [None]:
from sklearn.linear_model import LinearRegression

# Progressive model-based strategy: the null features are imputed one after
# another, and each imputed feature becomes a predictor for the next one
# (linear regression unless another estimator is supplied).
def progressive_model_based_imputation(data, estimator=None):
    """Impute the null columns one by one, reusing each imputed column as a predictor.

    The columns without nulls form the initial predictor set. Each column with
    nulls is then, in column order, fitted on the rows where it is observed,
    predicted on the rows where it is missing, and added to the predictor set
    used for the following columns.

    Args:
        data: DataFrame to impute; it is not modified.
        estimator: Optional regressor exposing ``fit(X, y)`` (returning the
            fitted model) and ``predict(X)``. It is refitted for every column.
            Defaults to a fresh ``LinearRegression()``.

    Returns:
        A new DataFrame without missing values. Its columns are ordered with
        the originally complete columns first, followed by the imputed ones.
    """
    if estimator is None:
        estimator = LinearRegression()

    # columns that contain at least one null / columns that are complete
    feature_with_null = data.columns[data.isnull().any()].tolist()
    predictors = [col for col in data.columns if col not in feature_with_null]

    # working copy that starts with the complete columns only and grows by one
    # imputed column per iteration
    imputed = data[predictors].copy()

    for col in feature_with_null:
        print(col)
        # bring in the next incomplete column; it is the only one with nulls now
        imputed[col] = data[col]
        # A boolean mask (rather than index labels) keeps the write-back
        # correct even when the index contains duplicate labels.
        missing = imputed[col].isnull()

        train_x = imputed.loc[~missing, predictors]
        train_y = imputed.loc[~missing, col]
        test_x = imputed.loc[missing, predictors]

        model = estimator.fit(train_x, train_y)
        imputed.loc[missing, col] = model.predict(test_x)

        # the column is complete now, so it can help to predict the next ones
        predictors.append(col)

    return imputed

# Run the progressive imputation (it returns a new frame) and show how many
# missing values remain per column.
data_after = progressive_model_based_imputation(data_predict_section)
print("The dataset after the progressive based imputation")
data_after.isnull().sum()

<a id="section-model"></a>
## 5. Use models which support missing values

We can use models that support null values natively. 
Example - KNN

### Completing your reading... 

### hope this will help some people like me :) please upvote if you find this helpful.