In [1]:
 %matplotlib inline
import pandas as pd 
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss

In [2]:
def stringToInt(var):
    try:
        var = np.int16(var)
        return var
    except Exception:
        if var == 'R':
            return np.int16(130)# Setting a arbituary hight value for Forclosed home
        else:
            return np.int16(-1)

def stringToFloat(var):
    try:
        var = np.float32(var)
        return var
    except Exception:
        return np.float32(0)

def fillFlag(var):
    try:
        var=np.char(var)
        return var
    except Exception:
        return np.char('F')


### Data file containing loan origination information for Year 2010

In [3]:
fields_Origin = ['CREDIT SCORE','FIRST TIME HOMEBUYER FLAG','MORTGAGE INSURANCE PERCENTAGE','CLTV','DTI Ratio',
          'ORIGINAL UPB','ORIGINAL LTV','ORIGINAL INTEREST RATE' ,'LOAN SEQUENCE NUMBER']    #9
fData=pd.read_csv('data/historical_data1_Q11999/historical_data1_Q11999.txt','|',index_col=None, encoding='utf-8',low_memory=False
                 ,usecols=fields_Origin,converters={'CREDIT SCORE':stringToInt,
                                                     'DTI Ratio':stringToFloat,
                                                      'CLTV':stringToFloat})
len(fData)

392776

### Data file containing loan monthly data for Year 2010. 

In [4]:
dateparse = lambda x: pd.datetime.strptime(x, '%Y%m')
fields_Month=['LOAN SEQUENCE NUMBER','MONTHLY REPORTING PERIOD','CURRENT ACTUAL UPB','CURRENT LOAN DELINQUENCY STATUS',
              'REMAINING MONTHS TO LEGAL MATURITY','REPURCHASE FLAG','MODIFICATION FLAG',
              'ZERO BALANCE CODE','ZERO BALANCE EFFECTIVE DATE','CURRENT INTEREST RATE','CURRENT DEFERRED UPB']
reader=pd.read_csv('data/historical_data1_Q11999/historical_data1_time_Q11999.txt','|',
                   index_col=None, parse_dates=['MONTHLY REPORTING PERIOD'],
                   date_parser=dateparse,encoding='utf-8',low_memory=False,chunksize=10000
                  ,usecols=fields_Month,converters={'CURRENT ACTUAL UPB':stringToFloat,
                                                    'CURRENT LOAN DELINQUENCY STATUS':stringToInt})


#Get dataframe from chunks
df = pd.concat(chunk for chunk in reader)

#Fill missing data
df['REPURCHASE FLAG'].fillna('F', inplace=True)
df['MODIFICATION FLAG'].fillna('N', inplace=True)
df['ZERO BALANCE CODE'].fillna('00', inplace=True)
df['ZERO BALANCE EFFECTIVE DATE'].fillna('000000', inplace=True)

df.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 22227548 entries, 0 to 7547
Data columns (total 11 columns):
LOAN SEQUENCE NUMBER                  object
MONTHLY REPORTING PERIOD              datetime64[ns]
CURRENT ACTUAL UPB                    float64
CURRENT LOAN DELINQUENCY STATUS       int64
REMAINING MONTHS TO LEGAL MATURITY    int64
REPURCHASE FLAG                       object
MODIFICATION FLAG                     object
ZERO BALANCE CODE                     object
ZERO BALANCE EFFECTIVE DATE           object
CURRENT INTEREST RATE                 float64
CURRENT DEFERRED UPB                  float64
dtypes: datetime64[ns](1), float64(3), int64(2), object(5)
memory usage: 2.0+ GB


In [5]:
df.head()

Unnamed: 0,LOAN SEQUENCE NUMBER,MONTHLY REPORTING PERIOD,CURRENT ACTUAL UPB,CURRENT LOAN DELINQUENCY STATUS,REMAINING MONTHS TO LEGAL MATURITY,REPURCHASE FLAG,MODIFICATION FLAG,ZERO BALANCE CODE,ZERO BALANCE EFFECTIVE DATE,CURRENT INTEREST RATE,CURRENT DEFERRED UPB
0,F199Q1000001,2002-05-01,171982.4375,0,328,F,N,0,0,6.3,0.0
1,F199Q1000001,2002-06-01,171571.390625,0,327,F,N,0,0,6.3,0.0
2,F199Q1000001,2002-07-01,171158.328125,0,326,F,N,0,0,6.3,0.0
3,F199Q1000001,2002-08-01,170742.890625,0,325,F,N,0,0,6.3,0.0
4,F199Q1000001,2002-09-01,170325.171875,0,324,F,N,0,0,6.3,0.0


In [6]:
df2=df.sort_values(by='CURRENT LOAN DELINQUENCY STATUS',ascending=False)
print (df2['MONTHLY REPORTING PERIOD'].max())
print (df2['MONTHLY REPORTING PERIOD'].min())
print (df2['CURRENT LOAN DELINQUENCY STATUS'].max())

2015-12-01 00:00:00
1999-02-01 00:00:00
130


In [7]:
#Calculate two new columns 
df['year']=df['MONTHLY REPORTING PERIOD'].dt.year
df['quarter']=df['MONTHLY REPORTING PERIOD'].dt.quarter
df.head()

Unnamed: 0,LOAN SEQUENCE NUMBER,MONTHLY REPORTING PERIOD,CURRENT ACTUAL UPB,CURRENT LOAN DELINQUENCY STATUS,REMAINING MONTHS TO LEGAL MATURITY,REPURCHASE FLAG,MODIFICATION FLAG,ZERO BALANCE CODE,ZERO BALANCE EFFECTIVE DATE,CURRENT INTEREST RATE,CURRENT DEFERRED UPB,year,quarter
0,F199Q1000001,2002-05-01,171982.4375,0,328,F,N,0,0,6.3,0.0,2002,2
1,F199Q1000001,2002-06-01,171571.390625,0,327,F,N,0,0,6.3,0.0,2002,2
2,F199Q1000001,2002-07-01,171158.328125,0,326,F,N,0,0,6.3,0.0,2002,3
3,F199Q1000001,2002-08-01,170742.890625,0,325,F,N,0,0,6.3,0.0,2002,3
4,F199Q1000001,2002-09-01,170325.171875,0,324,F,N,0,0,6.3,0.0,2002,3


#### Removing Loan data with no credit score

In [8]:
print (len(fData))
fDataClean=fData[fData['CREDIT SCORE']>0]
print (len(fDataClean))

392776
391229


#### Removing invalid data (Missing CURRENT LOAN DELINQUENCY STATUS) and Forclosed data out.

In [11]:
print (len(df))
dfClean=df[df['CURRENT LOAN DELINQUENCY STATUS'] >-1]
print (len(dfClean))
dfClean=dfClean[dfClean['CURRENT LOAN DELINQUENCY STATUS'] <130]
print (len(dfClean))

22227548
22227548
22221286


### Merging  Loan Origination with Monthly datasets 

In [12]:

dfMonthly = pd.merge(dfClean,fDataClean,on='LOAN SEQUENCE NUMBER')
dfMonthly.head()
print (len(dfMonthly))

22124867


### Slicing and peeking into dataset

In [13]:
df1=dfMonthly[['LOAN SEQUENCE NUMBER','CREDIT SCORE','CURRENT LOAN DELINQUENCY STATUS']]
df1.head(20)

Unnamed: 0,LOAN SEQUENCE NUMBER,CREDIT SCORE,CURRENT LOAN DELINQUENCY STATUS
0,F199Q1000001,751,0
1,F199Q1000001,751,0
2,F199Q1000001,751,0
3,F199Q1000001,751,0
4,F199Q1000001,751,0
5,F199Q1000001,751,0
6,F199Q1000001,751,0
7,F199Q1000002,733,0
8,F199Q1000002,733,0
9,F199Q1000002,733,0


### Understanding key values

In [14]:
df2=df1.sort_values(by='CURRENT LOAN DELINQUENCY STATUS',ascending=False)
df2.head(20)

Unnamed: 0,LOAN SEQUENCE NUMBER,CREDIT SCORE,CURRENT LOAN DELINQUENCY STATUS
3743777,F199Q1068505,742,128
17408608,F199Q1314043,695,100
17408607,F199Q1314043,695,99
17408606,F199Q1314043,695,98
17408605,F199Q1314043,695,97
17408604,F199Q1314043,695,96
17408603,F199Q1314043,695,95
17408602,F199Q1314043,695,94
17408601,F199Q1314043,695,93
17408600,F199Q1314043,695,92


### Grouping data for each loan.  Total number of unique loans

In [15]:
df2=df1.groupby('LOAN SEQUENCE NUMBER').max()
df3=df2.reset_index()
df3.head()

len(df3)

391192

### Total of non performing loan

In [16]:
df4= df3[df3['CURRENT LOAN DELINQUENCY STATUS']>0]
len(df4)

72332

### Monthly GDP Data

In [17]:
dateparse2 = lambda x: pd.datetime.strptime(x, '%Y-%m-%d')
fields_gdp=['date','change-current','change-chained']
gdpData= pd.read_csv('data/gdp-quarter.csv',',', parse_dates=['date'],date_parser=dateparse2,usecols=fields_gdp)
gdpData=gdpData[(gdpData.date > np.datetime64('2008-12-30')) & (gdpData.date < np.datetime64('2016-01-01'))]
gdpData['year']=gdpData['date'].dt.year
gdpData['quarter']=gdpData['date'].dt.quarter
gdpData.year.max()

2014

### Merging the values with above dfMonthly dataset

In [18]:
dfWithG=pd.merge(dfMonthly,gdpData,on=['year','quarter'])


In [19]:
dfWithG.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1538440 entries, 0 to 1538439
Data columns (total 24 columns):
LOAN SEQUENCE NUMBER                  1538440 non-null object
MONTHLY REPORTING PERIOD              1538440 non-null datetime64[ns]
CURRENT ACTUAL UPB                    1538440 non-null float64
CURRENT LOAN DELINQUENCY STATUS       1538440 non-null int64
REMAINING MONTHS TO LEGAL MATURITY    1538440 non-null int64
REPURCHASE FLAG                       1538440 non-null object
MODIFICATION FLAG                     1538440 non-null object
ZERO BALANCE CODE                     1538440 non-null object
ZERO BALANCE EFFECTIVE DATE           1538440 non-null object
CURRENT INTEREST RATE                 1538440 non-null float64
CURRENT DEFERRED UPB                  1538440 non-null float64
year                                  1538440 non-null int64
quarter                               1538440 non-null int64
CREDIT SCORE                          1538440 non-null int64
FIRST TIME HO

### Getting the needed data from above merged dataset

In [20]:
dfWithG=dfWithG[['LOAN SEQUENCE NUMBER','MONTHLY REPORTING PERIOD','CURRENT ACTUAL UPB','CURRENT LOAN DELINQUENCY STATUS',
              'REMAINING MONTHS TO LEGAL MATURITY','REPURCHASE FLAG','MODIFICATION FLAG',
              'ZERO BALANCE CODE','CURRENT INTEREST RATE','CURRENT DEFERRED UPB','year','quarter',
              'CREDIT SCORE','FIRST TIME HOMEBUYER FLAG','MORTGAGE INSURANCE PERCENTAGE','CLTV','DTI Ratio',
              'change-current','change-chained']]

### Saving Data in pickle object onto the hard drive

In [21]:
dfWithG.to_pickle('data/dataframe.pkl')

### Testing the saved pickle object

In [22]:
dfWithG=pd.read_pickle('data/dataframe.pkl')
dfWithG.year.max()

2014