# Prediction using FastAI to 100 % Accuracy

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [9]:
from fastai.tabular.all import *

In [10]:
train = pd.read_csv("data/train_u6lujuX_CVtuZ9i.csv")
test = pd.read_csv("data/test_Y3wMUE5_7gLdaTN.csv")

In [12]:
train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [11]:
train.isnull().sum().sort_index()/len(train)

ApplicantIncome      0.000000
CoapplicantIncome    0.000000
Credit_History       0.081433
Dependents           0.024430
Education            0.000000
Gender               0.021173
LoanAmount           0.035831
Loan_Amount_Term     0.022801
Loan_ID              0.000000
Loan_Status          0.000000
Married              0.004886
Property_Area        0.000000
Self_Employed        0.052117
dtype: float64

In [13]:
train.columns.to_series().groupby(train.dtypes).groups

{int64: ['ApplicantIncome'], float64: ['CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History'], object: ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']}

In [14]:
cat_names = ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']

cont_names = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']

In [15]:
splits = RandomSplitter(valid_pct=0.2)(range_of(train))

to = TabularPandas(train, procs=[Categorify, FillMissing, Normalize],
                   cat_names = cat_names,
                   cont_names = cont_names,
                   y_names='Loan_Status',
                   splits=splits)

In [16]:
to.train.xs.columns.to_series().groupby(to.train.xs.dtypes).groups

{int8: ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status', 'LoanAmount_na', 'Loan_Amount_Term_na', 'Credit_History_na'], int16: ['Loan_ID'], float64: ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']}

In [17]:
to.train

     Loan_ID  Gender  Married  Dependents  Education  Self_Employed  \
590      591       2        2           1          1              1   
87        88       2        2           1          1              1   
163      164       2        2           3          1              1   
169      170       2        2           3          1              1   
149      150       2        2           1          1              1   
..       ...     ...      ...         ...        ...            ...   
570      571       2        2           2          1              1   
556      557       1        1           1          1              1   
538      539       2        2           1          2              1   
576      577       0        2           1          1              1   
515      516       2        2           4          1              1   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
590        -0.414867           0.717034   -1.123420         -2.442000   
8

In [18]:
to.train.xs

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,LoanAmount_na,Loan_Amount_Term_na,Credit_History_na,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
590,591,2,2,1,1,1,2,1,1,1,1,-0.414867,0.717034,-1.123420,-2.442000,0.410665
87,88,2,2,1,1,1,2,1,1,1,1,-0.506578,0.226941,-0.499007,0.287773,0.410665
163,164,2,2,3,1,1,1,1,1,1,1,-0.200815,-0.026412,0.203458,0.287773,0.410665
169,170,2,2,3,1,1,2,1,1,1,1,0.502237,-0.572764,0.749819,0.287773,0.410665
149,150,2,2,1,1,1,2,1,1,1,1,-0.073705,-0.259376,-0.225826,0.287773,0.410665
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570,571,2,2,2,1,1,3,1,1,1,1,-0.338381,0.087993,0.567699,0.287773,0.410665
556,557,1,1,1,1,1,3,1,1,1,2,-0.475947,0.040796,-0.759179,0.287773,0.410665
538,539,2,2,1,2,1,1,0,1,1,1,-0.430091,-0.370383,-0.993334,0.287773,0.410665
576,577,0,2,1,1,1,2,0,1,1,1,-0.398910,0.261678,-0.082732,0.287773,-2.435072


In [19]:
to.train.ys.values.ravel()

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,

In [20]:
from sklearn.ensemble import RandomForestClassifier

X_train, y_train = to.train.xs, to.train.ys.values.ravel()
X_valid, y_valid = to.valid.xs, to.valid.ys.values.ravel()

In [21]:
X_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,Property_Area,Loan_Status,LoanAmount_na,Loan_Amount_Term_na,Credit_History_na,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
590,591,2,2,1,1,1,2,1,1,1,1,-0.414867,0.717034,-1.12342,-2.442,0.410665
87,88,2,2,1,1,1,2,1,1,1,1,-0.506578,0.226941,-0.499007,0.287773,0.410665
163,164,2,2,3,1,1,1,1,1,1,1,-0.200815,-0.026412,0.203458,0.287773,0.410665
169,170,2,2,3,1,1,2,1,1,1,1,0.502237,-0.572764,0.749819,0.287773,0.410665
149,150,2,2,1,1,1,2,1,1,1,1,-0.073705,-0.259376,-0.225826,0.287773,0.410665


In [22]:
rnf_classifier= RandomForestClassifier(n_estimators=100, n_jobs=-1)
rnf_classifier.fit(X_train,y_train)

In [23]:
y_pred=rnf_classifier.predict(X_valid)


from sklearn.metrics import accuracy_score

accuracy_score(y_pred, y_valid)

1.0