In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import numpy as np

## Content:
> ### <a href='#1'>1. Load Data, Check Data, Null Values</a>

 
> ### <a href='#2'>2. Filter Null Values</a> 
 
> ### <a href='#3'>3. Check Null Values</a> 
 
> ### <a href='#4'>4. Drop Null Values</a>

> ### <a href="#5">5. Replace with a Constant Value [median, average, mode, next value, previous value, constant value]</a>
> ### <a href="#6">6. Used Other Columns' Info</a>

>### <a href="#7">7. Predict Missing Values Using ML Prediction</a>


## <a id='1'></a>1. Load Data, Check Data, Null Values, etc.

In [None]:
# Read the training CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="../input/loan-predication/train_u6lujuX_CVtuZ9i (1).csv"
)

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# (number of rows, number of columns)
(len(data.index), len(data.columns))

First, we need to check how many null values we have in each column:

In [None]:
# Count the missing entries in each column.
data.isna().sum(axis=0)

We can divide each column's null count by the total number of rows to get the percentage of null values in each column:

In [None]:
# Percentage of missing entries per column: null count divided by the row count.
null_counts = data.isnull().sum()
row_count = data.shape[0]
Missing_percentage = (null_counts / row_count) * 100
print(Missing_percentage.round(decimals=2))

We can also visualize the null values. Black lines show the null values in each column.

In [None]:
import seaborn as sns
# Heatmap of the boolean null mask (rows x columns).
# `center` is a numeric option; the original passed True, which is numerically 1.
null_mask = data.isnull()
sns.heatmap(null_mask, center=1)

# <a id='2'></a> 2. Filter Null Values

We want to filter the rows and columns that contain more than a given number of null values:

In [None]:
# Names of the columns that contain more than 10 null values.
nulls_per_column = data.isna().sum()
more_10nulls_columns = data.columns[nulls_per_column > 10]
# Show only those columns.
data[more_10nulls_columns]

In [None]:
# rows with more than 2 null values - method 1
# Scan the rows by position and keep the positions whose row holds more than 2 nulls.
indexx = [
    position
    for position in range(data.shape[0])
    if data.iloc[position].isna().sum() > 2
]
data.iloc[indexx]

In [None]:
# rows with more than 2 null values - method 2
# Count the nulls across each row (axis=1) and filter with the resulting boolean mask.
nulls_per_row = data.isnull().sum(axis=1)
data[nulls_per_row > 2]

#### Columns with the most null values

In [None]:
# The 3 columns with the highest null counts.
column_null_counts = data.isnull().sum()
column_null_counts.nlargest(n=3)

# <a id='3'></a> 3. Check Null Values:

Purpose: We want to make sure that the values are really missing. Sometimes null values are not truly "missing" data. For example, imagine a "balcony" column in a house dataset. If a house has no balcony, the corresponding value may be left blank or recorded as "NaN". In that case we know the value is effectively "zero", not a real null value.
In other cases the data simply does not exist at all. For example, the size of the balcony in a house dataset where the house does not have any balcony!

What do we need to do? CHECK THE NULL VALUES :)

In [None]:
# Every row that has at least one null value.
has_null = data.isnull().any(axis=1)
data[has_null]

For example, if someone is not married, he/she may not have any dependents. So 'NaN' in Dependents may simply mean "no dependents".

In [None]:
# Rows where Dependents is null and Married is 'No'.
dependents_missing = data.Dependents.isna()
not_married = data.Married == 'No'
data[dependents_missing & not_married]

Let's replace NaN with 'NA' (not applicable):

In [None]:
# Mark Dependents as "NA" (not applicable) where it is null and Married is 'No'.
# A single .loc assignment writes into `data` itself; the chained form
# `data.Dependents[mask] = ...` may write to a temporary copy and raises
# SettingWithCopyWarning.
unmarried_missing_dependents = data.Dependents.isna() & (data.Married == 'No')
data.loc[unmarried_missing_dependents, 'Dependents'] = "NA"

In [None]:
# Distinct values currently present in the Dependents column.
data['Dependents'].unique()

# <a id='4'></a> 4. Drop Null Values

#### Drop Null Values

In [None]:
# Drop every row that contains at least one null value.
# pros: easy, fast
# cons: losing some data
# The call returns a new frame and leaves `data` untouched.
# If you want to change the dataset permanently:
# data.dropna(inplace=True)
data.dropna(axis=0, how="any")

# <a id='5'></a> 5. Replace with a Constant Value

#### Fill based on previous or next values

In [None]:
# Backward fill: each null is replaced by the NEXT valid value in the same column.
# `DataFrame.bfill()` is the supported spelling of the deprecated
# `fillna(method='backfill')`.
# The dataset is not changed permanently; assign the result back to keep it.
# pros: fast, easy, no previous knowledge about data
# cons: altering data
data.bfill()

In [None]:
# Forward fill: each null is replaced by the PREVIOUS valid value in the same column.
# `DataFrame.ffill()` is the supported spelling of the deprecated
# `fillna(method='ffill')`. The result is a new frame; `data` is unchanged.
data.ffill()

#### Replace with specific value

In [None]:
# Collect the Loan_Amount_Term entries that are null and remember their index
# labels so the following cells can show how each strategy filled them.
missing_loan_terms = data.loc[data.Loan_Amount_Term.isna(), 'Loan_Amount_Term']
index_Loan_Null = missing_loan_terms.index
missing_loan_terms

In [None]:
# Replace missing loan terms with the constant 360.
newdata = data.Loan_Amount_Term.fillna(value=360)
# index_Loan_Null holds index labels, so select with label-based .loc
# (.iloc is positional and only matches when labels equal positions).
newdata.loc[index_Loan_Null]

In [None]:
# Replace missing loan terms with the column average.
newdata = data.Loan_Amount_Term.fillna(value=data.Loan_Amount_Term.mean())
# index_Loan_Null holds index labels, so select with label-based .loc
# (.iloc is positional and only matches when labels equal positions).
newdata.loc[index_Loan_Null]

In [None]:
# Replace missing loan terms with the mode.
# mode() can return several values on a tie; .max() picks the largest of them.
newdata = data.Loan_Amount_Term.fillna(value=data.Loan_Amount_Term.mode().max())
# index_Loan_Null holds index labels, so select with label-based .loc
# (.iloc is positional and only matches when labels equal positions).
newdata.loc[index_Loan_Null]

In [None]:
# Replace missing loan terms with the column minimum
# (swap .min() for .max() to use the maximum instead).
newdata = data.Loan_Amount_Term.fillna(value=data.Loan_Amount_Term.min())
# index_Loan_Null holds index labels, so select with label-based .loc
# (.iloc is positional and only matches when labels equal positions).
newdata.loc[index_Loan_Null]

In [None]:
# Replace missing loan terms with the column median.
newdata = data.Loan_Amount_Term.fillna(data.Loan_Amount_Term.median())
# index_Loan_Null holds index labels, so select with label-based .loc
# (.iloc is positional and only matches when labels equal positions).
newdata.loc[index_Loan_Null]

# <a id='6'></a> 6. Use Other Columns' Information

Let's play with the "Married" column and try to replace its null values.

In [None]:
# Distinct values present in the Married column (including NaN, if any).
data['Married'].unique()

In [None]:
import seaborn as sns
# Count of applicants per Married value, split by number of dependents.
# Columns are passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(data=data, x="Married", hue="Dependents")

As can be seen from the graph above, someone who is not married is less likely to have dependents. Hence, we can say: if the Married value is null and the Dependents value is zero, the Married column is most probably 'No'.

In [None]:
# Preview the Married entries that the next cell overwrites.
# NOTE(review): the text above states the rule as "Dependents is zero", but this
# mask tests `!= 0` against the integer 0. If Dependents holds strings (an earlier
# cell assigns "NA"), that comparison is True on every row. Confirm the intent.
married_missing = data.Married.isna()
dependents_not_zero = data.Dependents != 0
data.loc[married_missing & dependents_not_zero, 'Married']

In [None]:
# Fill the selected null Married entries with 'No'.
# A single .loc assignment writes into `data` itself; the chained form
# `data.Married[mask] = ...` may write to a temporary copy and raises
# SettingWithCopyWarning.
# NOTE(review): the mask is kept exactly as written (`Dependents != 0`, compared
# with the integer 0). The text above describes the rule as "Dependents is zero";
# confirm which is intended before changing it, because later cells encode
# Married and expect it to contain no nulls.
married_to_fill = data.Married.isna() & (data.Dependents != 0)
data.loc[married_to_fill, 'Married'] = 'No'

# <a id='7'></a> 7. Prediction of Null Values Using ML Prediction

The idea is to build an ML classifier/regressor to predict the missing values. To implement this, we train a model on the rows with non-null values and then use it to predict the null values:

Let's predict the missing LoanAmount values.

In [None]:
# Share of rows whose LoanAmount is null, as a percentage rounded to 2 decimals.
loan_amount_null_pct = round(100 * data.LoanAmount.isna().sum() / data.shape[0], 2)
print("we have {} percentage of null values for LoanAmount".format(loan_amount_null_pct))

To keep the original data set unchanged, I am copying it into another variable called "data_ML".

In [None]:
# Work on an independent (deep) copy so the original `data` frame is not modified.
data_ML = data.copy(deep=True)

Separate the rows with null values in LoanAmount and name them test_x and test_y:
I am using columns without null values to predict LoanAmount

In [None]:
# Rows whose LoanAmount is missing: these are the rows the model will fill in.
data_with_missed_loan = data_ML[data_ML.LoanAmount.isna()]
# Target for those rows (all null by construction).
test_y = data_with_missed_loan.LoanAmount
# Predictor columns. The explicit .copy() makes test_x an independent frame, so
# adding columns to it later does not raise SettingWithCopyWarning.
feature_columns = ['Married', 'Education', 'ApplicantIncome', 'CoapplicantIncome', 'Loan_Status']
test_x = data_with_missed_loan[feature_columns].copy()
test_y.shape, test_x.shape

In [None]:
# Preview the first five rows of the predictors for the missing-LoanAmount rows.
test_x.head(n=5)

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
# Binary-encode the categorical predictors of test_x.
# The original loop iterated over `categorical_binary`, a name that is never
# defined (NameError); the list is called `categorical_columns`.
categorical_columns = ['Married', 'Education', 'Loan_Status']
encoded_columns = {}
for column in categorical_columns:
    encoder = OneHotEncoder(drop="first")
    # Fit on the full working copy rather than on the few rows being predicted,
    # so the category -> 0/1 mapping does not depend on which categories happen
    # to appear among the missing-LoanAmount rows.
    encoder.fit(data_ML[[column]])
    # drop="first" on a two-category column yields one indicator column;
    # flatten it to 1-D for assignment.
    encoded_columns[column + '_binary'] = encoder.transform(test_x[[column]]).toarray().ravel()
# assign() returns a new frame, so the sliced test_x is never mutated in place.
test_x = test_x.assign(**encoded_columns)
test_x = test_x[['ApplicantIncome', 'CoapplicantIncome', 'Married_binary', 'Education_binary','Loan_Status_binary']]
test_x.head()

Now remove the missing values of "LoanAmount" column from data_ML:

In [None]:
# Keep only the rows whose LoanAmount is present.
# A boolean mask is used instead of `.iloc[<index labels>]`: .iloc is positional,
# so feeding it labels only works while labels equal positions, and it selects
# the wrong rows (or fails) if this cell is run a second time.
data_ML = data_ML[data_ML.LoanAmount.notna()]

In [None]:
# Build the training data from the rows with a known LoanAmount.
# NOTE: naming is inverted relative to the usual convention and is kept because
# later cells depend on it: `y` holds the predictor columns, `X` holds the
# LoanAmount target (the model is fit as model.fit(y_train, X_train)).
# .copy() makes `y` an independent frame so new columns can be added to it
# without SettingWithCopyWarning.
y = data_ML[['Married','Education','ApplicantIncome','CoapplicantIncome','Loan_Status']].copy()
X = data_ML.LoanAmount
# The original loop iterated over `categorical_binary`, a name that is never
# defined (NameError); the list defined earlier is `categorical_columns`.
for column in categorical_columns:
    encoder = OneHotEncoder(drop="first")
    # drop="first" on a two-category column yields one indicator column;
    # flatten it to 1-D for assignment.
    y[column + '_binary'] = encoder.fit_transform(y[[column]]).toarray().ravel()
y = y[['ApplicantIncome', 'CoapplicantIncome', 'Married_binary', 'Education_binary','Loan_Status_binary']]
y.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 35% of the rows for evaluation; the fixed seed makes the split repeatable.
# X_* receive the splits of X (LoanAmount), y_* the splits of y (predictor columns).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.35,
    random_state=42,
)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Fit a random forest on the training split. In this notebook y_* hold the
# predictor columns and X_* hold the LoanAmount target.
# random_state is fixed so repeated runs give the same model and score.
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(y_train, X_train)
predict = model.predict(y_test)

# r2_score expects (y_true, y_pred) and is not symmetric, so the true
# LoanAmount values go first and the predictions second.
r2_score(X_test, predict)

You can optimize the model to get better accuracy, but we are moving on, as this is just an example :)

In [None]:
# Predict LoanAmount for the rows where it was missing, then display the predictions.
missed_values = model.predict(X=test_x)
missed_values

In [None]:
# Write the predicted values into the rows whose LoanAmount is null.
# A single .loc assignment writes into `data` itself; the chained form
# `data.LoanAmount[mask] = ...` may write to a temporary copy and raises
# SettingWithCopyWarning.
loan_amount_missing = data.LoanAmount.isna()
data.loc[loan_amount_missing, 'LoanAmount'] = missed_values

In [None]:
# True if any LoanAmount value is still null after the fill.
data['LoanAmount'].isna().any()

Try KNN and other ML methods as well

### I am working on this notebook ....