In [1]:

# Import Python pkgs pandas, numpy, matplotlib.pyplot, & seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.random as nr
import math

%matplotlib inline  
# Start of magic command which configures execution environment, to display graphics w/in notebook

In [2]:
# Load training feature dataset, display shape, & explore first 10 rows of Pandas data frame

LoanTrainV = pd.read_csv('data/train_values.csv', header=0)
print(LoanTrainV.shape)
LoanTrainV.head(10)

(500000, 22)


Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
0,0,3,1,1,1,70.0,3,18,37,246,...,1,24.0,6203.0,44.23,60588.0,50.933,716.0,2642.0,4536,False
1,1,1,1,3,1,178.0,3,369,52,299,...,1,57.0,5774.0,15.905,54821.0,100.0,1622.0,2108.0,2458,False
2,2,2,1,3,1,163.0,3,16,10,306,...,1,67.0,6094.0,61.27,67719.0,100.0,760.0,1048.0,5710,False
3,3,1,1,1,1,155.0,1,305,47,180,...,1,105.0,6667.0,6.246,78439.0,100.0,2025.0,2299.0,5888,True
4,4,1,1,1,1,305.0,3,24,37,20,...,2,71.0,6732.0,100.0,63075.0,82.2,1464.0,1847.0,289,False
5,5,1,1,3,1,133.0,3,221,13,55,...,2,51.0,6078.0,4.821,82745.0,96.55,1827.0,2340.0,964,False
6,6,3,1,1,1,240.0,3,374,28,131,...,2,104.0,6068.0,26.427,65282.0,81.068,1863.0,2560.0,5488,False
7,7,2,1,1,1,210.0,3,322,37,35,...,1,55.0,6030.0,78.153,108353.0,59.439,969.0,1601.0,2442,True
8,8,1,1,3,2,209.0,3,24,37,20,...,1,244.0,5151.0,88.156,63414.0,62.813,411.0,481.0,2118,True
9,9,1,1,3,1,197.0,3,194,9,20,...,1,86.0,7916.0,24.893,65927.0,100.0,1861.0,2123.0,3507,False


In [3]:
#Above- We have a total of 22 columns and 500,000 rows in the training feature dataset.

#Below- Review data types for each column
LoanTrainV.dtypes

row_id                              int64
loan_type                           int64
property_type                       int64
loan_purpose                        int64
occupancy                           int64
loan_amount                       float64
preapproval                         int64
msa_md                              int64
state_code                          int64
county_code                         int64
applicant_ethnicity                 int64
applicant_race                      int64
applicant_sex                       int64
applicant_income                  float64
population                        float64
minority_population_pct           float64
ffiecmedian_family_income         float64
tract_to_msa_md_income_pct        float64
number_of_owner-occupied_units    float64
number_of_1_to_4_family_units     float64
lender                              int64
co_applicant                         bool
dtype: object

In [4]:
# Load label dataset, display shape, & explore first 10 rows of Pandas data frame

LoanTrainL = pd.read_csv('data/train_labels.csv', header=0)
print(LoanTrainL.shape)
LoanTrainL.head(10)

(500000, 2)


Unnamed: 0,row_id,accepted
0,0,1
1,1,0
2,2,1
3,3,1
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,0


In [5]:
#Above- We have a total of 2 columns and 500,000 rows in the Label dataset.


#Below- Review data types for each column

LoanTrainL.dtypes

row_id      int64
accepted    int64
dtype: object

In [6]:
# Find missing values w/in dataset
 # Can be coded as character string, numeric value like -999 or missing value NaN, or a NULL value.
# Then determine how to treat missing values.
 # Remove rows w/ missing values
 # Remove features w/ missing values
 # Impute by replacing values w/ mean or median using simple algorithms, or more complex ones like SMOTE or expectation maximization (EM).
 # Use nearest neighbor, averaging, forward filling or backward filling.

# Shows which features are missing values

#(LoanTrainV.astype(np.object) == '?').any()  # This will only work w/ strings
pd.isna(LoanTrainV).any()

row_id                            False
loan_type                         False
property_type                     False
loan_purpose                      False
occupancy                         False
loan_amount                       False
preapproval                       False
msa_md                            False
state_code                        False
county_code                       False
applicant_ethnicity               False
applicant_race                    False
applicant_sex                     False
applicant_income                   True
population                         True
minority_population_pct            True
ffiecmedian_family_income          True
tract_to_msa_md_income_pct         True
number_of_owner-occupied_units     True
number_of_1_to_4_family_units      True
lender                            False
co_applicant                      False
dtype: bool

In [7]:
#Above- Training feature dataset does seem to contain missing values in 7 columns.

#Below- Find out how many values are missing w/in each object/categorical feature.

for col in LoanTrainV.columns:
    if LoanTrainV[col].dtype == bool:
        count = 0
        count = [count + 1 for x in LoanTrainV[col] if pd.isna(x)]
        print(col + ' ' + str(sum(count)))

co_applicant 0


In [8]:
#Above- Training feature dataset does not seem to contain missing values in boolean columns.

#Below- Find out how many values are missing w/in each float64 feature.

for col in LoanTrainV.columns:
    if LoanTrainV[col].dtype == float :
        count = 0
        count = [count + 1 for x in LoanTrainV[col] if pd.isna(x)]
        print(col + ' ' + str(sum(count)))

loan_amount 0
applicant_income 39948
population 22465
minority_population_pct 22466
ffiecmedian_family_income 22440
tract_to_msa_md_income_pct 22514
number_of_owner-occupied_units 22565
number_of_1_to_4_family_units 22530


In [9]:
#Above- Training feature dataset does seem to contain missing values in 7 float columns.


#Below- Find out how many values are missing w/in each int64 feature.

for col in LoanTrainV.columns:
    if LoanTrainV[col].dtype == int :
        count = 0
        count = [count + 1 for x in LoanTrainV[col] if pd.isna(x)]
        print(col + ' ' + str(sum(count)))

row_id 0
loan_type 0
property_type 0
loan_purpose 0
occupancy 0
preapproval 0
msa_md 0
state_code 0
county_code 0
applicant_ethnicity 0
applicant_race 0
applicant_sex 0
lender 0


In [10]:
#Above- Training feature dataset does not seem to contain missing values in integer columns.

#Below- Find out how many values are missing as '-1' w/in this integer column feature.

print(any(LoanTrainV.msa_md == -1))
print(LoanTrainV.msa_md[LoanTrainV.msa_md == -1].count())

True
76982


In [11]:
#Above- Training feature column msa_md does seem to contain 76,982 missing values.

#Below- Find out how many values are missing as '-1' w/in this integer column feature.

print(any(LoanTrainV.state_code == -1))
print(LoanTrainV.state_code[LoanTrainV.state_code == -1].count())

True
19132


In [12]:
#Above- Training feature column state_code does seem to contain 19,132 missing values.

#Below- Find out how many values are missing as '-1' w/in this integer column feature.

print(any(LoanTrainV.county_code == -1))
print(LoanTrainV.county_code[LoanTrainV.county_code == -1].count())

True
20466


In [13]:
#Above- Training feature column county_code does seem to contain 20,466 missing values.

# Find missing values w/in label dataset
 # Can be coded as character string, numeric value like -999 or missing value NaN, or a NULL value.
# Then determine how to treat missing values.
 # Remove rows w/ missing values
 # Remove features w/ missing values
 # Impute by replacing values w/ mean or median using simple algorithms, or more complex ones like SMOTE or expectation maximization (EM).
 # Use nearest neighbor, averaging, forward filling or backward filling.

# Shows which features are missing values

#(LoanTrainV.astype(np.object) == '?').any()  # This will only work w/ strings
pd.isna(LoanTrainL).any()
#pd.isnull(LoanTrainV)

row_id      False
accepted    False
dtype: bool

In [14]:
#Above- Label dataset does seem to contain missing values in float columns.

#Below- Examine feature training data row_id for duplicates by checking for unique IDs.

print(LoanTrainV.shape)
print(LoanTrainV.row_id.unique().shape)

(500000, 22)
(500000,)


In [15]:
#Above- Training feature dataset does not seem to contain duplicate values.

# Compute & display summary statistics for numeric columns of training data

LoanTrainV.describe()

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_race,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,...,500000.0,500000.0,460052.0,477535.0,477534.0,477560.0,477486.0,477435.0,477470.0,500000.0
mean,249999.5,1.366276,1.04765,2.06681,1.10959,221.753158,2.764722,181.606972,23.726924,144.542062,...,4.786586,1.462374,102.389521,5416.833956,31.61731,69235.603298,91.832624,1427.718282,1886.147065,3720.121344
std,144337.711634,0.690555,0.231404,0.948371,0.326092,590.641648,0.543061,138.464169,15.982768,100.243612,...,1.024927,0.677685,153.534496,2728.144999,26.333938,14810.058791,14.210924,737.559511,914.123744,1838.313175
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,...,1.0,1.0,1.0,14.0,0.534,17858.0,3.981,4.0,1.0,0.0
25%,124999.75,1.0,1.0,1.0,1.0,93.0,3.0,25.0,6.0,57.0,...,5.0,1.0,47.0,3744.0,10.7,59731.0,88.06725,944.0,1301.0,2442.0
50%,249999.5,1.0,1.0,2.0,1.0,162.0,3.0,192.0,26.0,131.0,...,5.0,1.0,74.0,4975.0,22.901,67526.0,100.0,1327.0,1753.0,3731.0
75%,374999.25,2.0,1.0,3.0,1.0,266.0,3.0,314.0,37.0,246.0,...,5.0,2.0,117.0,6467.0,46.02,75351.0,100.0,1780.0,2309.0,5436.0
max,499999.0,4.0,3.0,3.0,3.0,100878.0,3.0,408.0,52.0,324.0,...,7.0,4.0,10139.0,37097.0,100.0,125248.0,100.0,8771.0,13623.0,6508.0


In [16]:
#Below- Examine label data for duplicates by checking for unique IDs (row_id).

print(LoanTrainL.shape)
print(LoanTrainL.row_id.unique().shape)

(500000, 2)
(500000,)


In [17]:
#Above- Label dataset does not seem to contain duplicate values.

#Below- Compute & display summary statistics for numeric columns of label data

LoanTrainL.describe()

Unnamed: 0,row_id,accepted
count,500000.0,500000.0
mean,249999.5,0.500228
std,144337.711634,0.5
min,0.0,0.0
25%,124999.75,0.0
50%,249999.5,1.0
75%,374999.25,1.0
max,499999.0,1.0


In [18]:
# rename training data

LoanTrain = LoanTrainV

print(LoanTrain.shape)
LoanTrain.head(10)

(500000, 22)


Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant
0,0,3,1,1,1,70.0,3,18,37,246,...,1,24.0,6203.0,44.23,60588.0,50.933,716.0,2642.0,4536,False
1,1,1,1,3,1,178.0,3,369,52,299,...,1,57.0,5774.0,15.905,54821.0,100.0,1622.0,2108.0,2458,False
2,2,2,1,3,1,163.0,3,16,10,306,...,1,67.0,6094.0,61.27,67719.0,100.0,760.0,1048.0,5710,False
3,3,1,1,1,1,155.0,1,305,47,180,...,1,105.0,6667.0,6.246,78439.0,100.0,2025.0,2299.0,5888,True
4,4,1,1,1,1,305.0,3,24,37,20,...,2,71.0,6732.0,100.0,63075.0,82.2,1464.0,1847.0,289,False
5,5,1,1,3,1,133.0,3,221,13,55,...,2,51.0,6078.0,4.821,82745.0,96.55,1827.0,2340.0,964,False
6,6,3,1,1,1,240.0,3,374,28,131,...,2,104.0,6068.0,26.427,65282.0,81.068,1863.0,2560.0,5488,False
7,7,2,1,1,1,210.0,3,322,37,35,...,1,55.0,6030.0,78.153,108353.0,59.439,969.0,1601.0,2442,True
8,8,1,1,3,2,209.0,3,24,37,20,...,1,244.0,5151.0,88.156,63414.0,62.813,411.0,481.0,2118,True
9,9,1,1,3,1,197.0,3,194,9,20,...,1,86.0,7916.0,24.893,65927.0,100.0,1861.0,2123.0,3507,False


In [19]:
#Below- Examine training data for county_code duplicates by checking for unique IDs.

print(LoanTrain.shape)
print(LoanTrain.county_code.unique().shape)

(500000, 22)
(318,)


In [20]:
# Below - Handle missing data
 # Set limit that features which are missing 20% or more of their data are removed.
    
for col in LoanTrain.columns:
    if LoanTrain[col].dtype == float :
        count = 0
        count = [count + 1 for x in LoanTrain[col] if pd.isna(x)]
        prct = ((sum(count))/500000)*100
        print(col + ' ' + str(prct))

loan_amount 0.0
applicant_income 7.989599999999999
population 4.492999999999999
minority_population_pct 4.4932
ffiecmedian_family_income 4.488
tract_to_msa_md_income_pct 4.5028
number_of_owner-occupied_units 4.513
number_of_1_to_4_family_units 4.506


In [21]:
#Below- Find out how many values are missing as '-1' w/in this integer column feature.
 # Set limit that features which are missing 20% or more of their data are removed.
    
missing_msa_md = LoanTrain.msa_md[LoanTrain.msa_md == -1].count()
missing_state_code = LoanTrain.state_code[LoanTrain.state_code == -1].count()
missing_county_code = LoanTrain.county_code[LoanTrain.county_code == -1].count()

pctmiss_msa_ma = ((missing_msa_md)/500000)*100
pctmiss_state_code = ((missing_state_code)/500000)*100
pctmiss_county_code = ((missing_county_code)/500000)*100

print(pctmiss_msa_ma)
print(pctmiss_state_code)
print(pctmiss_county_code)

15.3964
3.8264
4.0932


In [22]:
#Above- None of the columns are above the 20% mark in the dataset.

#Below- Save this data to a csv (backup)

LoanTrain.to_csv('LoanTrain_2019-04-21.csv', index = False, header = True)

In [23]:
## Sorted the backup csv by smallest to largest for state_code then county_code. No data deleted.
# Load the grouped feature dataset, display shape, & explore first 10 rows of Pandas data frame.
# Labels (accepted) were concatenated to end of dataset.

LoanTrainG = pd.read_csv('LoanTrain_Groups_2019-04-21.csv', header=0)
print(LoanTrainG.shape)
LoanTrainG.head(10)

FileNotFoundError: [Errno 2] File b'LoanTrain_Groups_2019-04-21.csv' does not exist: b'LoanTrain_Groups_2019-04-21.csv'

In [None]:
#Below- RECHECK-Find out how many values are missing as '-1' w/in this integer column feature.

print(any(LoanTrainG.msa_md == -1))
print(LoanTrainG.msa_md[LoanTrainG.msa_md == -1].count())

In [None]:
#Above- Still have 75,900 missing data for msa_md.

#Below- RECHECK-Find out how many values are missing as '-1' w/in this integer column feature.

print(any(LoanTrainG.state_code == -1))
print(LoanTrainG.state_code[LoanTrainG.state_code == -1].count())

In [None]:
#Above- Still have 19,132 missing data for state_code.

#Below- RECHECK-Find out how many values are missing as '-1' w/in this integer column feature.

print(any(LoanTrainG.county_code == -1))
print(LoanTrainG.county_code[LoanTrainG.county_code == -1].count())

In [None]:
#Above- Still have 20,466 missing data for county_code.

#Below- Since the mean and medians were so different from each other, indicating skewness, we will replace missing values in rows
 # with medians instead of means. 
    
# Calculate median of each column in dataset.

median= LoanTrainG.median()
print(median)

In [None]:
#Below- RECHECK missing values for county_code & msa_md in dataset.

LoanTrainG

In [None]:
#Fill the remaining numerical missing values with median values for each column.

LoanTrainG = LoanTrainG.fillna(median)
LoanTrainG.shape

In [None]:
# RECHECK- Now double check and look for missing values again in training dataset.

for col in LoanTrainG.columns:
    if LoanTrainG[col].dtype == float:
        count = 0
        count = [count + 1 for x in LoanTrainG[col] if pd.isna(x)]
        print(col + ' ' + str(sum(count)))

In [None]:
# RECHECK - Find out how many values are missing as '-1' w/in this integer column feature.
    
missing_msa_md = LoanTrainG.msa_md[LoanTrainG.msa_md == -1].count()
missing_state_code = LoanTrainG.state_code[LoanTrainG.state_code == -1].count()
missing_county_code = LoanTrainG.county_code[LoanTrainG.county_code == -1].count()

pctmiss_msa_ma = ((missing_msa_md)/500000)*100
pctmiss_state_code = ((missing_state_code)/500000)*100
pctmiss_county_code = ((missing_county_code)/500000)*100

print(pctmiss_msa_ma)
print(pctmiss_state_code)
print(pctmiss_county_code)

In [None]:
#Above - Will keep the missing interger values as '-1' within the training dataset.

#Below - Recalculate the summary statistcs for numeric values with columns removed and missing values filled with median.

LoanTrainG.describe()

In [None]:
#Save this cleaned training data to a csv

LoanTrainG.to_csv('LoanTrain_Clean_2019-04-23d.csv', index = False, header = True)