In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Data cleaning

In [2]:
# Load the raw loan file; low_memory=False parses the file in a single pass
# so each column gets one consistent inferred dtype.
df = pd.read_csv(
    'lc_loan.csv/lc_loan.csv',
    low_memory=False,
)

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m
0,1077501,1296599,5000.0,5000.0,4975.0,36 months,10.65,162.87,B,B2,...,,,,,,,,,,
1,1077430,1314167,2500.0,2500.0,2500.0,60 months,15.27,59.83,C,C4,...,,,,,,,,,,
2,1077175,1313524,2400.0,2400.0,2400.0,36 months,15.96,84.33,C,C5,...,,,,,,,,,,
3,1076863,1277178,10000.0,10000.0,10000.0,36 months,13.49,339.31,C,C1,...,,,,,,,,,,
4,1075358,1311748,3000.0,3000.0,3000.0,60 months,12.69,67.79,B,B5,...,,,,,,,,,,


In [4]:
# List every column label in the raw frame.
df.columns

Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

In [5]:
# (rows, columns) of the raw frame.
df.shape

(887379, 74)

In [6]:
# Per-column missing-value summary for the raw data:
# absolute count of nulls and the share of rows they represent (in percent).
missing = pd.DataFrame({
    'count': df.isnull().sum(),
    '%': 100 * df.isnull().mean(),
})
missing.loc[missing['count'] > 0].sort_values(by='count', ascending=False)

Unnamed: 0,count,%
dti_joint,886870,99.94264
annual_inc_joint,886868,99.942415
verification_status_joint,886868,99.942415
il_util,868762,97.902024
mths_since_rcnt_il,866569,97.654892
total_cu_tl,866007,97.591559
inq_fi,866007,97.591559
all_util,866007,97.591559
max_bal_bc,866007,97.591559
open_rv_24m,866007,97.591559


Some of the most commonly missing values are related to joint applications. We can see how many there are.

In [7]:
# Shape of the subset of joint applications (row count = number of joint loans).
df.loc[df['application_type'] == "JOINT"].shape

(511, 74)

There are only 511 joint applications. While we may want to come back to these later, for a first pass we can drop them. Once they are removed the application type field no longer carries useful information, so we can drop it as well. We will also remove the `id` and `member_id` identifier fields, since they should not be used for modeling.

In [8]:
# Keep every non-joint application, then remove the identifier columns and
# the application-type column used for the filter.
X = (
    df.loc[df['application_type'] != "JOINT"]
    .drop(columns=['id', 'member_id', 'application_type'])
)

We can now get rid of any columns involving the joint applicant

In [9]:
# Columns that only describe the joint co-applicant.
X = X.drop(
    columns=[
        'dti_joint',
        'annual_inc_joint',
        'verification_status_joint',
    ]
)

We can get the 2016-2017 data (to be used as testing) and start doing the same thing

In [10]:
# Load the 2016-2017 loans that serve as the held-out test set.
df2 = pd.read_csv(
    "lc_2016_2017.csv/lc_2016_2017.csv",
    low_memory=False,
)

In [11]:
# Apply the same row filter and column removals to the test data as were
# applied to the training data.
X_test = (
    df2.loc[df2['application_type'] != "JOINT"]
    .drop(
        columns=[
            'id',
            'member_id',
            'application_type',
            'dti_joint',
            'annual_inc_joint',
            'verification_status_joint',
        ]
    )
)

In [12]:
# Pull the target (loan_status) out of both frames, then remove it from the
# feature matrices so it cannot leak into the features.
y = X['loan_status']
y_test = X_test['loan_status']
X = X.drop(columns=['loan_status'])
X_test = X_test.drop(columns=['loan_status'])

In [13]:
# Recompute the missing-value summary on the restricted training features:
# null count per column and the percentage of rows that are null.
missing = pd.DataFrame({
    'count': X.isnull().sum(),
    '%': 100 * X.isnull().mean(),
})
missing.loc[missing['count'] > 0].sort_values(by='count', ascending=False)

Unnamed: 0,count,%
il_util,868392,97.916714
mths_since_rcnt_il,866220,97.671807
inq_last_12m,865659,97.608551
open_acc_6m,865659,97.608551
total_cu_tl,865659,97.608551
inq_fi,865659,97.608551
all_util,865659,97.608551
max_bal_bc,865659,97.608551
open_rv_24m,865659,97.608551
open_rv_12m,865659,97.608551


For the features with many missing values we want to see what's actually there

In [14]:
# Frequency of each observed il_util value.
X['il_util'].value_counts()

0.0      138
100.0    132
87.0      68
75.0      66
86.0      64
        ... 
118.8      1
113.0      1
136.3      1
133.2      1
116.9      1
Name: il_util, Length: 1271, dtype: int64

In [15]:
# Distinct il_util values, including NaN.
X['il_util'].unique()

array([  nan,  49.3,  88.8, ..., 133.1,  12.9, 116.9])

In [16]:
# Summary statistics of the non-missing il_util values.
X['il_util'].describe()

count    18476.000000
mean        71.519788
std         23.034225
min          0.000000
25%         58.600000
50%         75.000000
75%         87.600000
max        223.300000
Name: il_util, dtype: float64

We're missing a lot of data here, and the observed values cover quite a wide range (0 to 223.3). While this could potentially cause issues, for now we will impute the median of 75. We will impute the same value in the test data.

In [17]:
# Impute missing il_util with the training median (75).
# Assign the result back to the column instead of calling fillna(inplace=True)
# on the attribute-accessed Series: the in-place form only updates X while the
# Series is a view of the frame, which pandas no longer guarantees.
X['il_util'] = X['il_util'].fillna(75)

In [18]:
# Impute missing il_util in the test data with the training median (75).
# Explicit column assignment is used so the frame is updated regardless of
# whether the selected Series is a view or a copy.
X_test['il_util'] = X_test['il_util'].fillna(75)

In [19]:
# Frequency of each observed mths_since_rcnt_il value.
X['mths_since_rcnt_il'].value_counts()

4.0      1220
3.0      1179
7.0      1073
5.0      1041
6.0       980
         ... 
338.0       1
230.0       1
238.0       1
275.0       1
250.0       1
Name: mths_since_rcnt_il, Length: 200, dtype: int64

In [20]:
# Distinct mths_since_rcnt_il values, including NaN.
X['mths_since_rcnt_il'].unique()

array([ nan,  28.,  11.,  47.,  13.,   8.,  21., 338.,  54.,  73.,  14.,
        18.,  23.,  42.,  19.,  10.,  27.,   2.,  76.,   7.,   6.,   5.,
        15.,   9.,  62.,  35.,   3.,  17.,  45.,  16.,  38.,  24.,  25.,
        12.,   1.,  43.,  91.,  55.,  52.,  37.,  50.,  79.,  64.,  22.,
        36.,  58.,  49.,  26.,   4.,  61.,  59.,  46., 141.,  32.,  53.,
        56., 100., 103.,  33.,  20.,  51.,  63.,  40., 118.,  98.,  31.,
       275., 121.,  29., 124.,  34.,  89.,  41.,   0., 145.,  44.,  77.,
        82.,  80.,  39.,  30.,  71., 111.,  95., 129.,  68., 168., 119.,
       115., 151.,  72.,  67., 104.,  74.,  93.,  57., 238., 148.,  87.,
       170., 230.,  66., 158., 107., 101.,  90., 150.,  78., 114., 120.,
       133., 136.,  60., 137.,  85., 131., 130.,  75.,  48., 109.,  81.,
       117., 102., 105., 113.,  88.,  83.,  84., 147.,  65.,  69.,  99.,
        86., 110., 152., 288., 135., 108., 149., 140., 112.,  94.,  97.,
       123., 134., 116.,  92., 138., 169., 139., 12

In [21]:
# Summary statistics of the non-missing mths_since_rcnt_il values.
X['mths_since_rcnt_il'].describe()

count    20648.000000
mean        20.922801
std         27.233241
min          0.000000
25%          6.000000
50%         12.000000
75%         23.000000
max        363.000000
Name: mths_since_rcnt_il, dtype: float64

This is the number of months since the most recent installment account was opened. Missing values most likely belong to borrowers who have never had an installment account. The easiest way to keep them separate from the observed values is to impute a very large number; for now we use infinity. Since we will likely use decision trees here, that is a reasonable way to split them off.

This will mean that trying to look at statistics on these fields will not make much sense.

In [22]:
# Missing mths_since_rcnt_il is treated as "no installment account on record"
# and encoded as +inf so it sorts after every observed value.
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
# NOTE(review): many estimators validate inputs as finite by default, so this
# sentinel will likely need swapping for a large finite value before fitting
# -- confirm against the modelling code.
X['mths_since_rcnt_il'] = X['mths_since_rcnt_il'].fillna(np.inf)
X_test['mths_since_rcnt_il'] = X_test['mths_since_rcnt_il'].fillna(np.inf)

In [23]:
# Frequency of each observed inq_last_12m value.
X['inq_last_12m'].value_counts()

 1.0     5011
 0.0     3958
 2.0     3728
 3.0     2576
 4.0     1681
-4.0     1365
 5.0      987
 6.0      664
 7.0      424
 8.0      272
 9.0      143
 10.0     112
 11.0      73
 12.0      57
 13.0      55
 14.0      27
 15.0      26
 16.0      12
 17.0       8
 19.0       8
 20.0       7
 18.0       5
 21.0       3
 25.0       2
 30.0       1
 26.0       1
 22.0       1
 32.0       1
 24.0       1
Name: inq_last_12m, dtype: int64

It seems reasonable to impute 0 for the missing ones, since the natural reading of having no data on inquiries in the last 12 months is that there were none. We also have a few entries that are -4. That obviously isn't valid, since you can't have a negative number of inquiries. We can check what those rows report for inquiries in the last 6 months.

In [24]:
# For rows whose 12-month inquiry count is the invalid value -4,
# tabulate what the 6-month inquiry count says.
X.loc[X['inq_last_12m'] == -4, 'inq_last_6mths'].value_counts()

0.0    1365
Name: inq_last_6mths, dtype: int64

All 1365 had none in the last 6 months. So we will replace those with 0 as well. We will replace all missing and negative entries in the training and test data with 0

In [25]:
# Map the invalid -4 code to 0 and treat missing as 0 inquiries.
# Done as one explicit column assignment; the previous chained
# fillna(inplace=True) only updated X when the selected Series was a view.
X['inq_last_12m'] = X['inq_last_12m'].replace(-4, 0).fillna(0)

In [26]:
# Re-check the distribution after cleaning: no -4 and no missing values should remain.
X['inq_last_12m'].value_counts()

0.0     870982
1.0       5011
2.0       3728
3.0       2576
4.0       1681
5.0        987
6.0        664
7.0        424
8.0        272
9.0        143
10.0       112
11.0        73
12.0        57
13.0        55
14.0        27
15.0        26
16.0        12
19.0         8
17.0         8
20.0         7
18.0         5
21.0         3
25.0         2
26.0         1
30.0         1
22.0         1
32.0         1
24.0         1
Name: inq_last_12m, dtype: int64

In [27]:
# Mirror the training-set cleaning on the test set: map the -4 code to 0,
# then treat missing as 0 inquiries.
# The original analysis reported no negative values in the test data, in which
# case the replace is a no-op; applying it anyway keeps train and test
# preprocessing identical.
X_test['inq_last_12m'] = X_test['inq_last_12m'].replace(-4, 0).fillna(0)

In [28]:
# Names of the columns flagged in the last missing-value summary.
# `missing` is not recomputed here, so columns imputed since then still appear.
missing.loc[missing['count'] > 0].index

Index(['emp_title', 'emp_length', 'annual_inc', 'desc', 'title', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_util',
       'total_acc', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_il_6m', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc',
       'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl',
       'inq_last_12m'],
      dtype='object')

In [29]:
# Summary statistics of the non-missing annual_inc values.
X['annual_inc'].describe()

count    8.868640e+05
mean     7.503702e+04
std      6.471049e+04
min      1.896000e+03
25%      4.500000e+04
50%      6.500000e+04
75%      9.000000e+04
max      9.500000e+06
Name: annual_inc, dtype: float64

We only had a few missing annual incomes. We will simply use the median of the training data (65000) for those, in both the training and test sets.

In [30]:
# Impute missing annual income with the training median (65000) in both sets.
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
X['annual_inc'] = X['annual_inc'].fillna(65000)
X_test['annual_inc'] = X_test['annual_inc'].fillna(65000)

In [31]:
# Frequency of each employment title.
X['emp_title'].value_counts()

Teacher                                   13453
Manager                                   11233
Registered Nurse                           5521
Owner                                      5375
RN                                         5352
                                          ...  
The Sams Clinic                               1
Annalect Group                                1
Coast Composites                              1
Antelope Valley Union High School Dist        1
Manager Hotel Operations Oasis                1
Name: emp_title, Length: 299159, dtype: int64

We will replace missing values of the employment title (`emp_title`) with 'other'.

In [32]:
# Label missing employment titles as 'other' in both sets.
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
X['emp_title'] = X['emp_title'].fillna('other')
X_test['emp_title'] = X_test['emp_title'].fillna('other')

In [33]:
# Frequency of each employment-length category.
X['emp_length'].value_counts()

10+ years    291403
2 years       78833
< 1 year      70559
3 years       69994
1 year        57064
5 years       55686
4 years       52496
7 years       44576
8 years       43930
6 years       42928
9 years       34635
Name: emp_length, dtype: int64

In [34]:
# Distinct employment-length categories, including NaN.
X['emp_length'].unique()

array(['10+ years', '< 1 year', '1 year', '3 years', '8 years', '9 years',
       '4 years', '5 years', '6 years', '2 years', '7 years', nan],
      dtype=object)

We want to find roughly the median employment length. This can only be approximate because the data are binned into categories, including the open-ended '< 1 year' and '10+ years'.
Roughly half the values are at 6 years or below and half are at 7 years or above. For now we will try using 7 years.

In [35]:
# Impute missing employment length with the approximate median category.
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
X['emp_length'] = X['emp_length'].fillna('7 years')
X_test['emp_length'] = X_test['emp_length'].fillna('7 years')

In [36]:
# Re-check the category frequencies after imputation.
X['emp_length'].value_counts()

10+ years    291403
7 years       89340
2 years       78833
< 1 year      70559
3 years       69994
1 year        57064
5 years       55686
4 years       52496
8 years       43930
6 years       42928
9 years       34635
Name: emp_length, dtype: int64

In [37]:
# Frequency of each free-text loan description.
X['desc'].value_counts()

                                                                                                                                                                                                                                                                                                                                                                       246
Debt Consolidation                                                                                                                                                                                                                                                                                                                                                      13
  Borrower added on 03/17/14 > Debt consolidation<br>                                                                                                                                                                                                                             

The description field has many missing values. Although "debt consolidation" appears relatively often, for now we will drop this field. A possible future improvement is to apply natural language processing to it and use word vectors as features.

In [38]:
# Remove the free-text description column from both feature matrices.
X = X.drop(columns=['desc'])
X_test = X_test.drop(columns=['desc'])

In [39]:
# Frequency of each observed delinq_2yrs value.
X['delinq_2yrs'].value_counts()

0.0     716564
1.0     113147
2.0      33530
3.0      11970
4.0       5323
5.0       2709
6.0       1469
7.0        783
8.0        461
9.0        284
10.0       192
11.0       121
12.0        89
13.0        64
14.0        45
15.0        28
16.0        17
18.0        11
17.0        10
19.0         8
22.0         3
21.0         2
26.0         2
20.0         2
29.0         1
24.0         1
30.0         1
27.0         1
39.0         1
Name: delinq_2yrs, dtype: int64

In [40]:
# Treat a missing delinq_2yrs as "no delinquencies in the last 2 years" (0).
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
X['delinq_2yrs'] = X['delinq_2yrs'].fillna(0)
X_test['delinq_2yrs'] = X_test['delinq_2yrs'].fillna(0)

In [41]:
# Frequency of each earliest-credit-line month.
X['earliest_cr_line'].value_counts()

Aug-2001    6656
Aug-2000    6522
Oct-2000    6320
Oct-2001    6150
Aug-2002    6080
            ... 
Jun-1949       1
Oct-1950       1
Jan-1948       1
Jul-1961       1
Apr-1958       1
Name: earliest_cr_line, Length: 696, dtype: int64

In [42]:
# Distinct raw earliest_cr_line strings (format looks like 'Jan-1985').
X['earliest_cr_line'].unique()

array(['Jan-1985', 'Apr-1999', 'Nov-2001', 'Feb-1996', 'Jan-1996',
       'Nov-2004', 'Jul-2005', 'Jan-2007', 'Apr-2004', 'Sep-2004',
       'Jan-1998', 'Oct-1989', 'Jul-2003', 'May-1991', 'Sep-2007',
       'Oct-1998', 'Aug-1993', 'Oct-2003', 'Jan-2001', 'Nov-1997',
       'Feb-1983', 'Jul-1985', 'Apr-2003', 'Jun-2001', 'Feb-2002',
       'Aug-1984', 'Nov-2006', 'Dec-1987', 'Nov-1981', 'Feb-1997',
       'Apr-2005', 'Oct-2007', 'Dec-2000', 'Apr-2007', 'Dec-2001',
       'Jan-2003', 'Mar-1994', 'Sep-1998', 'Jun-2004', 'Nov-1995',
       'Jul-1999', 'Jun-1995', 'Sep-1992', 'Jan-2002', 'Apr-1992',
       'Oct-2006', 'May-2000', 'Dec-1998', 'Dec-2004', 'Oct-2000',
       'May-2002', 'May-2006', 'Jul-2002', 'Jul-2006', 'May-1997',
       'Oct-2005', 'Apr-1995', 'Oct-2002', 'Jan-2000', 'Apr-2000',
       'Dec-1994', 'Sep-2005', 'Dec-1984', 'Dec-1999', 'Nov-2003',
       'Jun-1989', 'Jun-2003', 'Oct-1996', 'May-2003', 'Jun-2002',
       'Jun-2007', 'Dec-1996', 'Feb-1984', 'Sep-2002', 'Jan-19

In [43]:
# Convert the 'Mon-YYYY' strings to pandas datetimes so they can be analysed.
# The conversion itself is done by pd.to_datetime; the datetime import is kept
# as in the original cell.
import datetime

X['earliest_cr_line'] = pd.to_datetime(X['earliest_cr_line'])
X_test['earliest_cr_line'] = pd.to_datetime(X_test['earliest_cr_line'])

In [44]:
# Median earliest credit line in the training data.
# quantile(0.5) is the statistic describe() reports as '50%'; calling it
# directly avoids the `datetime_is_numeric` keyword, which newer pandas
# releases no longer accept.
X['earliest_cr_line'].quantile(0.5)

Timestamp('1999-09-01 00:00:00')

In [45]:
# Number of training rows with no earliest credit line.
X['earliest_cr_line'].isna().sum()

29

In [46]:
# Replace missing earliest credit lines with the training median in both sets.
# The median is taken once via quantile(0.5) (the value describe() labels
# '50%') rather than through describe(datetime_is_numeric=True), a keyword
# newer pandas releases no longer accept. Results are assigned back to the
# column instead of relying on chained fillna(inplace=True).
median_earliest_cr_line = X['earliest_cr_line'].quantile(0.5)
X['earliest_cr_line'] = X['earliest_cr_line'].fillna(median_earliest_cr_line)
X_test['earliest_cr_line'] = X_test['earliest_cr_line'].fillna(median_earliest_cr_line)

In [47]:
# Frequency of each observed inq_last_6mths value.
X['inq_last_6mths'].value_counts()

0.0     497619
1.0     241336
2.0      94066
3.0      37387
4.0      10753
5.0       3985
6.0       1231
7.0        195
8.0        122
9.0         50
10.0        24
11.0        15
12.0        15
15.0         9
13.0         6
14.0         6
18.0         4
16.0         3
17.0         2
24.0         2
19.0         2
32.0         1
33.0         1
31.0         1
28.0         1
25.0         1
27.0         1
20.0         1
Name: inq_last_6mths, dtype: int64

In [48]:
# As with inquiries in the last 12 months, treat missing values as zero.
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
X['inq_last_6mths'] = X['inq_last_6mths'].fillna(0)
X_test['inq_last_6mths'] = X_test['inq_last_6mths'].fillna(0)

In [49]:
# Frequency of each observed mths_since_last_delinq value.
X['mths_since_last_delinq'].value_counts()

9.0      8586
6.0      8465
12.0     8366
8.0      8326
13.0     8312
         ... 
180.0       1
136.0       1
124.0       1
143.0       1
137.0       1
Name: mths_since_last_delinq, Length: 155, dtype: int64

In [50]:
#We will replace missing values with infinity assuming this means they've never had one

In [51]:
# Missing mths_since_last_delinq is read as "never delinquent" and encoded as
# +inf so it sorts after every observed value.
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
# NOTE(review): non-finite values are rejected by many estimators' input
# validation -- confirm the sentinel is handled before fitting.
X['mths_since_last_delinq'] = X['mths_since_last_delinq'].fillna(np.inf)
X_test['mths_since_last_delinq'] = X_test['mths_since_last_delinq'].fillna(np.inf)

In [52]:
# Frequency of each observed mths_since_last_record value.
X['mths_since_last_record'].value_counts()

61.0     1960
62.0     1919
71.0     1916
67.0     1913
69.0     1910
         ... 
1.0        71
2.0        63
120.0       9
121.0       2
129.0       1
Name: mths_since_last_record, Length: 123, dtype: int64

In [53]:
# Summary statistics of the non-missing mths_since_last_record values.
X['mths_since_last_record'].describe()

count    136937.000000
mean         70.116375
std          28.132227
min           0.000000
25%          51.000000
50%          70.000000
75%          92.000000
max         129.000000
Name: mths_since_last_record, dtype: float64

In [54]:
# Impute missing mths_since_last_record with the training median (70).
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
# NOTE(review): the other "months since" fields use +inf for missing; confirm
# that a median is the intended treatment for this one.
X['mths_since_last_record'] = X['mths_since_last_record'].fillna(70)
X_test['mths_since_last_record'] = X_test['mths_since_last_record'].fillna(70)

In [55]:
# For months since the last major derogatory record, missing values are
# replaced with +inf on the assumption that there was no derogatory record.
# Fix: the second statement previously filled X again, leaving the test set's
# missing values untouched; it now targets X_test.
# Results are assigned back to the column rather than relying on chained
# fillna(inplace=True).
X['mths_since_last_major_derog'] = X['mths_since_last_major_derog'].fillna(np.inf)
X_test['mths_since_last_major_derog'] = X_test['mths_since_last_major_derog'].fillna(np.inf)

In [56]:
# Frequency of each observed open_acc value.
X['open_acc'].value_counts()

9.0     80237
10.0    78275
8.0     76523
11.0    72228
7.0     67842
        ...  
57.0        1
67.0        1
65.0        1
75.0        1
90.0        1
Name: open_acc, Length: 77, dtype: int64

In [57]:
# Summary statistics of the non-missing open_acc values.
X['open_acc'].describe()

count    886839.000000
mean         11.548715
std           5.317242
min           0.000000
25%           8.000000
50%          11.000000
75%          14.000000
max          90.000000
Name: open_acc, dtype: float64

In [58]:
# Impute missing open_acc with the training median.
# Fix: the describe() output for this column shows 50% = 11 (8 is the 25th
# percentile), so the previously imputed value of 8 was not the median.
# Results are assigned back to the column rather than relying on chained
# fillna(inplace=True).
X['open_acc'] = X['open_acc'].fillna(11)
X_test['open_acc'] = X_test['open_acc'].fillna(11)

In [59]:
# Frequency of each observed pub_rec value.
X['pub_rec'].value_counts()

0.0     751177
1.0     113176
2.0      14834
3.0       4484
4.0       1561
5.0        757
6.0        385
7.0        170
8.0        113
9.0         50
10.0        42
11.0        23
12.0        16
13.0        12
15.0         6
18.0         5
16.0         5
21.0         4
17.0         3
19.0         2
14.0         2
49.0         2
40.0         1
63.0         1
54.0         1
34.0         1
23.0         1
26.0         1
28.0         1
20.0         1
86.0         1
22.0         1
Name: pub_rec, dtype: int64

In [60]:
# Replace missing counts of public derogatory records with 0.
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
X['pub_rec'] = X['pub_rec'].fillna(0)
X_test['pub_rec'] = X_test['pub_rec'].fillna(0)

In [61]:
# Frequency of each observed revol_util value.
X['revol_util'].value_counts()

0.00      3540
58.00     1778
53.00     1765
59.00     1762
61.00     1756
          ... 
32.71        1
0.86         1
0.03         1
0.16         1
184.60       1
Name: revol_util, Length: 1356, dtype: int64

In [62]:
# Summary statistics of the non-missing revol_util values.
X['revol_util'].describe()

count    886366.000000
mean         55.063147
std          23.834616
min           0.000000
25%          37.700000
50%          56.000000
75%          73.600000
max         892.300000
Name: revol_util, dtype: float64

In [63]:
# Impute missing revol_util with the training median (56).
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
X['revol_util'] = X['revol_util'].fillna(56)
X_test['revol_util'] = X_test['revol_util'].fillna(56)

In [64]:
# Frequency of each observed total_acc value.
X['total_acc'].value_counts()

22.0     32237
20.0     32094
21.0     31987
19.0     31557
23.0     31296
         ...  
121.0        1
119.0        1
156.0        1
150.0        1
108.0        1
Name: total_acc, Length: 135, dtype: int64

In [65]:
# Summary statistics of the non-missing total_acc values.
X['total_acc'].describe()

count    886839.000000
mean         25.268133
std          11.840410
min           1.000000
25%          17.000000
50%          24.000000
75%          32.000000
max         169.000000
Name: total_acc, dtype: float64

In [66]:
# Impute missing total_acc with the training median (24).
# Explicit column assignment replaces the chained fillna(inplace=True), which
# only updates the frame when the selected Series is a view.
X['total_acc'] = X['total_acc'].fillna(24)
X_test['total_acc'] = X_test['total_acc'].fillna(24)

In [67]:
# Distinct raw last_pymnt_d strings, including NaN.
X['last_pymnt_d'].unique()

array(['Jan-2015', 'Apr-2013', 'Jun-2014', 'Jan-2016', 'Apr-2012',
       'Nov-2012', 'Jun-2013', 'Sep-2013', 'Jul-2012', 'Oct-2013',
       'May-2013', 'Feb-2015', 'Aug-2015', 'Oct-2012', 'Sep-2012', nan,
       'Dec-2012', 'Dec-2014', 'Aug-2013', 'Nov-2013', 'Jan-2014',
       'Apr-2014', 'Aug-2014', 'Oct-2014', 'Aug-2012', 'Jul-2014',
       'Jul-2013', 'Apr-2015', 'Feb-2014', 'Sep-2014', 'Jun-2012',
       'Feb-2013', 'Mar-2013', 'May-2014', 'Mar-2015', 'Jan-2013',
       'Dec-2013', 'Feb-2012', 'Mar-2014', 'Sep-2015', 'Nov-2015',
       'Dec-2015', 'Jan-2012', 'Oct-2015', 'Nov-2014', 'Mar-2012',
       'May-2012', 'Jun-2015', 'May-2015', 'Jul-2015', 'Dec-2011',
       'Nov-2011', 'Oct-2011', 'Sep-2011', 'Aug-2011', 'Jul-2011',
       'Jun-2011', 'May-2011', 'Apr-2011', 'Mar-2011', 'Feb-2011',
       'Jan-2011', 'Dec-2010', 'Nov-2010', 'Oct-2010', 'Sep-2010',
       'Aug-2010', 'Jul-2010', 'Jun-2010', 'May-2010', 'Apr-2010',
       'Mar-2010', 'Feb-2010', 'Jan-2010', 'Dec-2009', 'N

In [68]:
# Parse the last-payment date strings into pandas datetimes in both sets
# so their distributions can be examined.
X['last_pymnt_d'] = pd.to_datetime(X['last_pymnt_d'])
X_test['last_pymnt_d'] = pd.to_datetime(X_test['last_pymnt_d'])

In [69]:
# Numeric-style summary (mean, min, quartiles, max) of training last-payment dates.
X['last_pymnt_d'].describe(datetime_is_numeric=True)

count                           869347
mean     2015-08-11 06:38:31.221641472
min                2007-12-01 00:00:00
25%                2015-09-01 00:00:00
50%                2016-01-01 00:00:00
75%                2016-01-01 00:00:00
max                2016-01-01 00:00:00
Name: last_pymnt_d, dtype: object

In [70]:
# Same summary for the test set, to compare its date range with the training data.
X_test['last_pymnt_d'].describe(datetime_is_numeric=True)

count                           758390
mean     2017-10-06 22:23:31.442661120
min                2016-01-01 00:00:00
25%                2017-11-01 00:00:00
50%                2017-12-01 00:00:00
75%                2017-12-01 00:00:00
max                2017-12-01 00:00:00
Name: last_pymnt_d, dtype: object

Normally we would not use different replacement values for missing data in the training and test sets. However, we can't follow that rule here, because this is an actual date and the two sets of loans cover different periods: the latest payment date in the training data is the earliest in the test data. We will therefore depart from the usual procedure and use each set's own median.

In [71]:
# Fill missing last-payment dates with each set's own median, because the
# training and test loans cover different calendar periods.
# quantile(0.5) is the value describe() labels '50%'; using it directly avoids
# the `datetime_is_numeric` keyword that newer pandas releases no longer
# accept. Results are assigned back to the column rather than relying on
# chained fillna(inplace=True).
X['last_pymnt_d'] = X['last_pymnt_d'].fillna(X['last_pymnt_d'].quantile(0.5))
X_test['last_pymnt_d'] = X_test['last_pymnt_d'].fillna(X_test['last_pymnt_d'].quantile(0.5))

In [72]:
# Parse next-payment dates in both sets, then summarise the training distribution.
X['next_pymnt_d'] = pd.to_datetime(X['next_pymnt_d'])
X_test['next_pymnt_d'] = pd.to_datetime(X_test['next_pymnt_d'])
X['next_pymnt_d'].describe(datetime_is_numeric=True)

count                           633898
mean     2016-01-21 01:07:17.328403200
min                2007-12-01 00:00:00
25%                2016-02-01 00:00:00
50%                2016-02-01 00:00:00
75%                2016-02-01 00:00:00
max                2016-03-01 00:00:00
Name: next_pymnt_d, dtype: object

In [73]:
# Summary of test-set next-payment dates, for comparison with the training range.
X_test['next_pymnt_d'].describe(datetime_is_numeric=True)

count                           591423
mean     2017-12-31 23:55:23.746962432
min                2017-12-01 00:00:00
25%                2018-01-01 00:00:00
50%                2018-01-01 00:00:00
75%                2018-01-01 00:00:00
max                2018-02-01 00:00:00
Name: next_pymnt_d, dtype: object

In [74]:
# As with the last-payment dates, fill missing next-payment dates with each
# set's own median because the two sets cover different periods.
# quantile(0.5) is the value describe() labels '50%'; using it directly avoids
# the `datetime_is_numeric` keyword that newer pandas releases no longer
# accept. Results are assigned back to the column rather than relying on
# chained fillna(inplace=True).
X['next_pymnt_d'] = X['next_pymnt_d'].fillna(X['next_pymnt_d'].quantile(0.5))
X_test['next_pymnt_d'] = X_test['next_pymnt_d'].fillna(X_test['next_pymnt_d'].quantile(0.5))

One thing we will have to consider before modeling is how to deal with these sorts of date ranges. Since our training data and test data have different ranges entirely (the max for the next payment date in the training data is earlier than the minimum for the test data) we will have to do something to tweak this before trying to apply a model here.