In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

This notebook shows some basics stats about securities labeled as NC1

## Load Data

In [2]:
data = pd.DataFrame([])
for n in range(2,9):
    file_name = 'trimmed_'+ str(2000+n) +'_v3_USE.csv'
    df = pd.read_csv(file_name)
    rows = df.shape[0]
    year = {'Year' : ([str(2000+n)]*rows)}
    year_df = pd.DataFrame(data=year)
    df = pd.concat([df, year_df], axis=1)
    data = data.append(df,ignore_index=True)

In [3]:
#Get all rows with NC1
nc1 = data.loc[data['Label'] == 'NC1']

## Data Validation

In [4]:
print('we have {} rows of data'.format(data.shape[0]))
print('we have {} rows of NC1'.format(nc1.shape[0]))
print('percentage: {:.2f}%'.format(nc1.shape[0]/data.shape[0]*100))

we have 31811 rows of data
we have 6427 rows of NC1
percentage: 20.20%


In [5]:
valid = nc1.groupby(by=['Year','Label'])['CUSIP','PID'].size().sum() == nc1.shape[0]
print("Does nc1.groupby(by=['Year','Label'])['CUSIP','PID'] equal number of rows in nc1? {}".format(valid))

Does nc1.groupby(by=['Year','Label'])['CUSIP','PID'] equal number of rows in nc1? True


In [6]:
valid = data.groupby(by=['Year','Label'])['CUSIP','PID'].size().sum() == data.shape[0]
print("Does data.groupby(by=['Year','Label'])['CUSIP','PID'] equal number of rows in data? {}".format(valid))
if (valid): print("This ensures ['CUSIP','PID'] shows uniqueness.")

Does data.groupby(by=['Year','Label'])['CUSIP','PID'] equal number of rows in data? True
This ensures ['CUSIP','PID'] shows uniqueness.


In [7]:
print('Number of NC1 in given year. Note:2008 does not have NC1')
nc1.groupby(by=['Year','Label'])['CUSIP','PID'].size()

Number of NC1 in given year. Note:2008 does not have NC1


Year  Label
2002  NC1         5
2003  NC1         2
2004  NC1       464
2005  NC1      1627
2006  NC1      2297
2007  NC1      2032
dtype: int64

In [8]:
print('Ratio of NC1/All in given year:')
nc1.groupby(by=['Year','Label'])['CUSIP','PID'].size()/data.groupby(by=['Year'])['CUSIP','PID'].size()

Ratio of NC1/All in given year:


Year  Label
2002  NC1      0.001805
2003  NC1      0.000472
2004  NC1      0.087054
2005  NC1      0.180999
2006  NC1      0.452968
2007  NC1      0.446495
dtype: float64

In [9]:
valid = (data.loc[data['Label'] != 'NC1'].shape[0]) == data.shape[0]-nc1.shape[0]
print("Is data.loc[data['Label'] != 'NC1'] valid for geting non-NC1? {}".format(valid))

Is data.loc[data['Label'] != 'NC1'] valid for geting non-NC1? True


## Explore NC1

In [10]:
count = nc1.groupby('PID')['CUSIP'].nunique().count()
print('There are ' + str(count) + ' of prospectus that have NC1 with unique CUSIP')
print("On average {:.2f} NC1 per prospectus that has NC1".format(nc1.shape[0]/count))

There are 1001 of prospectus that have NC1 with unique CUSIP
On average 6.42 NC1 per prospectus that has NC1


In [11]:
#These two are different because CUSIP is not unique
#nc1.groupby('PID')['CUSIP'].size().sum()
#nc1.groupby('PID')['CUSIP'].nunique()

In [12]:
show_top = 5
print('Top {} of the MTG_TRANCHE_TYP_LONG among NC1 are:'.format(show_top))
nc1.groupby(['MTG_TRANCHE_TYP_LONG'])['CUSIP','PID'].size().sort_values(ascending=False).head(show_top)

Top 5 of the MTG_TRANCHE_TYP_LONG among NC1 are:


MTG_TRANCHE_TYP_LONG
SUB,CSTR,NAS            1609
MEZ,FLT,STEP,IRC         989
SUB,NAS                  663
MEZ,FLT,STEP             389
MEZ,FLT,STEP,IRC,NAS     274
dtype: int64

In [13]:
print('Top {} of the MTG_TRANCHE_TYP_LONG among non-NC1s are:'.format(show_top))
data.loc[data['Label'] != 'NC1'].groupby(['MTG_TRANCHE_TYP_LONG'])['CUSIP','PID'].size().sort_values(ascending=False).head(show_top)

Top 5 of the MTG_TRANCHE_TYP_LONG among non-NC1s are:


MTG_TRANCHE_TYP_LONG
SUB,CSTR,NAS        1997
SEQ,AS              1906
SUB,NAS             1734
MEZ,FLT,STEP,IRC    1205
FLT,STEP,IRC        1123
dtype: int64

[See here for more info about MTG_TRANCHE_TYP](https://docs.google.com/spreadsheets/d/1MOwPnTr2owqPoJNy73U7UEc3z1RvtzELOCM0ZFxBJU8/edit?usp=sharing)

In [14]:
nc1_total = nc1['MTG ORIG AMT'].sum()
nonNC1_total = data.loc[data['Label'] != 'NC1']['MTG ORIG AMT'].sum()
print('Sum of MTG ORIG AMT among NC1 = {:.2f}'.format(nc1_total))
print('Sum of MTG ORIG AMT among non-NC1 = {:.2f}'.format(nonNC1_total))
print('Sum of MTG ORIG AMT among all = {:.2f}'.format(nc1_total+nonNC1_total))
print('MTG ORIG AMT among NC1 / MTG ORIG AMT of all = {:.2f}%'.format(nc1_total/(nc1_total+nonNC1_total)*100))

Sum of MTG ORIG AMT among NC1 = 55423.82
Sum of MTG ORIG AMT among non-NC1 = 1754465.00
Sum of MTG ORIG AMT among all = 1809888.82
MTG ORIG AMT among NC1 / MTG ORIG AMT of all = 3.06%


In [15]:
print('Desciption of MTG ORIG AMT among NC1:')
nc1['MTG ORIG AMT'].describe()

Desciption of MTG ORIG AMT among NC1:


count    6427.000000
mean        8.623591
std        13.733825
min         0.000100
25%         2.070100
50%         5.000000
75%        11.027500
max       560.470000
Name: MTG ORIG AMT, dtype: float64

## To do
- Look into Bloomberg (Paydown Infomation) <br>
API found, can actually use excel on Bloomberg terminal to get the HIST_UNSUPPORTED_RISK_SHORTFALL data quickly
- Look why payment just suddenly stops instead of gradually decreased and stoped


## Finding duplicatations
The CUSIP in dataset is duplicated. The uniqueness is determined by using both CUSIP and PID. The way of finding duplicated CUSIP is the following:

This line of code shows that CUSIP is not unique in the dataset. Using count is to count the total number of rows, but using nunique is to count unique. I think both method will be handy in different situations. The Falses in result shows that two ways of counting are different.

In [16]:
data.groupby('Label')['CUSIP'].size() == data.groupby('Label')['CUSIP'].nunique()

Label
0            False
1.4          False
FE           False
IOfailing     True
IOpassMED    False
IOpassMEY    False
MED          False
MEY          False
NC1          False
NC2          False
NC3          False
NMEm         False
NMEs          True
Name: CUSIP, dtype: bool

In [17]:
CUSIP_PID_count = data.groupby(by=['CUSIP','PID'])['CUSIP','PID'].nunique().sum()
CUSIP_PID_count

CUSIP    31628
PID      31628
dtype: int64

In [18]:
print('CUSIP and PID agree but does not equal to number of rows. We have {} rows of data'.format(data.shape[0]))
print('The difference is {}'.format(data.shape[0]-CUSIP_PID_count[0]))

CUSIP and PID agree but does not equal to number of rows. We have 31811 rows of data
The difference is 183


Duplicated rows are found below

In [19]:
data.groupby(by=['CUSIP','PID']).size().sort_values(ascending=False).head()

CUSIP      PID    
576434EZ3  FC_1525    2
576434HX5  FC_2221    2
576434HP2  FC_2221    2
576434HQ0  FC_2221    2
576434HR8  FC_2221    2
dtype: int64

Theese two rows are he same

In [20]:
data.loc[data['CUSIP'] == '576434DB7']

Unnamed: 0,PID,Prospectus,Class,norm_class,Name,Current_Balance,Zero-Balance Payment Period Number,Sum Principle Paid,MTG ORIG AMT,Maturity,...,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE,Year
3352,FC_1440,MASTR_1225297_0000950136-03-000749.txt,6A2,A,MALT 2003-2 6A2,0.0,1,7.778,7.778,3/25/2033,...,WR,Aaa,NR,0.0,,0\0\0,MED,ME,0,2003
3381,FC_1440,MASTR_1225297_0000950136-03-000749.txt,6A2,A,MALT 2003-2 6A2,0.0,1,7.778,7.778,3/25/2033,...,WR,Aaa,NR,0.0,,0\0\0,MED,ME,0,2003


data.duplicated() helps us find duplicated rows and True means it is duplicated

In [21]:
data.duplicated().sort_values(ascending=False).head()

3757    True
4432    True
4491    True
4490    True
3374    True
dtype: bool

In [22]:
num_duplicated = data.loc[data.duplicated(keep=False)].shape[0]
print('keep=false shows all duplicated instace so there are {} rows that are duplicated.'.format(num_duplicated))

keep=false shows all duplicated instace so there are 326 rows that are duplicated.


In [23]:
num_to_delete = data.loc[data.duplicated()].shape[0]
print('No keep parameter shows duplicated instace that we should delete so there are {} rows'
      'that are duplications that should be deleted.'.format(num_to_delete))
print('That will give us {}'.format(num_duplicated-num_to_delete))

No keep parameter shows duplicated instace that we should delete so there are 163 rowsthat are duplications that should be deleted.
That will give us 163


Use new variable no_duplicated_data using drop_duplicates()

In [24]:
no_duplicated_data = data.drop_duplicates()

In [25]:
if (((data.shape[0]-CUSIP_PID_count[0])-(num_duplicated-num_to_delete))==0): print('No!! Data rows and CUSIP_PID still not match')
else: print('Total Number Match!!!!')

Total Number Match!!!!


In [26]:
no_duplicated_data.groupby(by=['CUSIP','PID']).size().sort_values(ascending=False).head()

CUSIP      PID    
576434KV5  FC_2478    2
576434LA0  FC_2478    2
86358RJ40  FC_386     2
86358RN52  FC_2609    2
576434KU7  FC_2478    2
dtype: int64

In [27]:
no_duplicated_data.groupby(by=['CUSIP'])['PID'].nunique().sort_values(ascending=False).head()

CUSIP
126671Y67    4
126671Z41    4
1266712B1    4
1266712F2    4
126671Y75    4
Name: PID, dtype: int64

In [28]:
no_duplicated_data.loc[no_duplicated_data['CUSIP'] == '05948KXX2']

Unnamed: 0,PID,Prospectus,Class,norm_class,Name,Current_Balance,Zero-Balance Payment Period Number,Sum Principle Paid,MTG ORIG AMT,Maturity,...,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE,Year
13288,FC_1378,COUNTRYWIDE_1316390_0000950129-05-000828.txt,2CB1,U,BOAA 2005-2 2CB1,2.8537,paying,22.0169,25.309,3/25/2035,...,Caa2,Aaa,DDD+,,0.0,-77664.73\26803.1\0,MEY,ME,0,2005
19597,FC_448,BANC_OF_AMERICA_1318762_0001193125-05-034840.htm,2CB1,U,BOAA 2005-2 2CB1,2.8634,paying,22.0085,25.309,3/25/2035,...,Caa2,Aaa,DDD+,,0.0,-77664.73\26803.1\0,MEY,ME,0,2005


In [29]:
no_duplicated_data.loc[no_duplicated_data['CUSIP'] == '576434AH7']

Unnamed: 0,PID,Prospectus,Class,norm_class,Name,Current_Balance,Zero-Balance Payment Period Number,Sum Principle Paid,MTG ORIG AMT,Maturity,...,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE,Year
1808,FC_1509,MASTR_1179092_0000950136-02-002215.txt,AX,A,MALT 2002-1 AX,0.2771,paying,0.0,16.9397,2032-07-25,...,,,NR,0.0,0.0,0\0\0,IOpassMEY,ME,0,2002
1831,FC_1509,MASTR_1179092_0000950136-02-002215.txt,AX,A,MALT 2002-1 AX,0.2802,paying,0.0,16.9397,2032-07-25,...,,,NR,0.0,0.0,0\0\0,IOpassMEY,ME,0,2002


In [30]:
no_duplicated_data.groupby(by=['CUSIP','PID','Current_Balance']).size().sort_values(ascending=False).head()

CUSIP      PID      Current_Balance
86358RX28  FC_2784  0.0000             2
1266712S4  FC_96    3.0685             2
576434AE4  FC_1509  0.0000             2
86358RN52  FC_2609  0.0000             2
86358RJ40  FC_386   0.0000             2
dtype: int64

In [31]:
no_duplicated_data.loc[no_duplicated_data['CUSIP'] == '81744Mar3']

Unnamed: 0,PID,Prospectus,Class,norm_class,Name,Current_Balance,Zero-Balance Payment Period Number,Sum Principle Paid,MTG ORIG AMT,Maturity,...,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE,Year


In [32]:
no_duplicated_data.groupby(by=['CUSIP','PID','Current_Balance','norm_class']).size().sort_values(ascending=False).head()

CUSIP      PID      Current_Balance  norm_class
576434AE4  FC_1509  0.0000           A             2
BCC2BQMX0  FC_704   14.2665          A             1
12669CXX3  FC_404   0.0000           B             1
12669CXJ4  FC_404   0.0000           A             1
12669CXK1  FC_404   0.0000           A             1
dtype: int64

In [33]:
no_duplicated_data.loc[no_duplicated_data['CUSIP'] == '576434AE4']

Unnamed: 0,PID,Prospectus,Class,norm_class,Name,Current_Balance,Zero-Balance Payment Period Number,Sum Principle Paid,MTG ORIG AMT,Maturity,...,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE,Year
1806,FC_1509,MASTR_1179092_0000950136-02-002215.txt,A5,A,MALT 2002-1 A5,0.0,1,25.65,25.65,2032-07-25,...,,,NR,0.0,,0\0\0,MED,ME,0,2002
1829,FC_1509,MASTR_1179092_0000950136-02-002215.txt,A5,A,MALT 2002-1 A5,0.0,1,25.65,25.65,2032-07-25,...,,,NR,0.0,,0\0\0,MED,ME,0,2002


In [34]:
no_duplicated_data.groupby(by=['CUSIP','PID','Current_Balance','norm_class','MTG_TRANCHE_TYP_LONG']).size().sort_values(ascending=False).head()

CUSIP      PID     Current_Balance  norm_class  MTG_TRANCHE_TYP_LONG
BCC2BQMX0  FC_704  14.2665          A           STEP,AFC,AS             1
12669CXJ4  FC_404  0.0000           A           SEQ,AS                  1
12669CXL9  FC_404  0.0000           A           SEQ,AS                  1
12669CXM7  FC_404  0.0000           A           SEQ,AS                  1
12669CXN5  FC_404  0.0000           A           SEQ,AS                  1
dtype: int64

Here is the way to find what CUSIPs are duplicated. (Here only shows the top five and bottom five results which are all >1). The series can be exported using pandas function

In [35]:
#Get series of CUSIP and get count > 1
series = data.groupby(by=['CUSIP']).size().sort_values(ascending=False)
series = series.loc[series > 1]

In [36]:
series.head()

CUSIP
1266712B1    4
126671Z66    4
1266712A3    4
126671Z41    4
126671Z74    4
dtype: int64

In [37]:
series.tail()

CUSIP
05948KDX4    2
05948KDW6    2
05948KEP0    2
05948KEQ8    2
05948KER6    2
dtype: int64

In [38]:
#data.to_pickle('parent_data_Label_Securities_v3_USE_2002-2007')

In [39]:
export_nc1 = nc1['Name'] + ' MTGE'

In [40]:
export_nc1.shape

(6427,)

In [41]:
export_nc1 = export_nc1.append([export_nc1]*3,ignore_index=True)

In [42]:
export_nc1.shape

(25708,)

In [43]:
export_nc1 = export_nc1.sort_values()

In [44]:
shortfall_dict = {0:'HISTORICAL_BASIS_RISK_SHORTFALL',1:'HIST_UNSUPPORTED_RISK_SHORTFALL',2:'HIST_CUMUL_INT_SHRTFLL_BOND',3:'HIST_INTEREST_SHORTFALL'}
shortfall = pd.Series(shortfall_dict)

In [45]:
shortfall

0    HISTORICAL_BASIS_RISK_SHORTFALL
1    HIST_UNSUPPORTED_RISK_SHORTFALL
2        HIST_CUMUL_INT_SHRTFLL_BOND
3            HIST_INTEREST_SHORTFALL
dtype: object

In [46]:
shortfall_series = shortfall.append([shortfall]*int(export_nc1.shape[0]/4-1),ignore_index=True)

In [47]:
shortfall_series.shape

(25708,)

In [48]:
export_nc1.shape

(25708,)

In [49]:
export = pd.DataFrame({'NAME':export_nc1,'SHORTFALL_TYPE':shortfall_series})

In [50]:
export.groupby(['NAME'])['SHORTFALL_TYPE'].nunique().sort_values()

NAME
AABST 2005-4 B1 MTGE        4
MLMI 2006-RM4 M6 MTGE       4
MLMI 2006-RM4 M5 MTGE       4
MLMI 2006-RM4 M4 MTGE       4
MLMI 2006-RM4 M3 MTGE       4
MLMI 2006-RM4 M2 MTGE       4
MLMI 2006-RM4 M1 MTGE       4
MLMI 2006-RM4 B4 MTGE       4
MLMI 2006-RM4 B3 MTGE       4
MLMI 2006-RM4 B2 MTGE       4
MLMI 2006-RM5 B1 MTGE       4
MLMI 2006-RM4 B1 MTGE       4
MLMI 2006-OPT1 M5 MTGE      4
MLMI 2006-OPT1 M4 MTGE      4
MLMI 2006-OPT1 M3 MTGE      4
MLMI 2006-OPT1 B3 MTGE      4
MLMI 2006-OPT1 B2 MTGE      4
MLMI 2006-OPT1 B1 MTGE      4
MLMI 2006-HE6 M6 MTGE       4
MLMI 2006-HE6 M5 MTGE       4
MLMI 2006-HE6 M4 MTGE       4
MLMI 2006-OPT1 M6 MTGE      4
MLMI 2006-RM5 B2 MTGE       4
MLMI 2006-RM5 B3 MTGE       4
MLMI 2006-RM5 M1 MTGE       4
MLMI 2006-WMC1 M4 MTGE      4
MLMI 2006-WMC1 M3 MTGE      4
MLMI 2006-WMC1 B3 MTGE      4
MLMI 2006-WMC1 B2B MTGE     4
MLMI 2006-WMC1 B2A MTGE     4
                           ..
CWALT 2006-OC1 M2 MTGE      4
CWALT 2006-OC1 M1 MTGE      4
CWALT

In [51]:
export.to_csv('get_shortfall.csv')

In [58]:
no_duplicated_data.groupby(['CUSIP']).size().sort_values(ascending=False)

CUSIP
1266712F2    4
126671Y75    4
126671Y91    4
126671Z25    4
126671Z33    4
126671Z41    4
126671Z58    4
126671Z66    4
126671Z74    4
126671Z82    4
126671Z90    4
126671Y67    4
126671Y83    4
1266712B1    4
1266712A3    4
05948KAY5    3
05948KAX7    3
94974SAG8    3
94974SAF0    3
94974SAD5    3
94974SAC7    3
94974SAB9    3
05948KBM0    3
05948KBL2    3
05948KBK4    3
94974SAA1    3
94974SAH6    3
05948KBB4    3
05948KAW9    3
05948KAU3    3
            ..
59025CAC4    1
59025CAB6    1
59025CAA8    1
59024FAR5    1
59023QAQ4    1
59023XAE6    1
59023XAF3    1
59023XAG1    1
59023XAH9    1
59023XAJ5    1
59023XAK2    1
59023XAL0    1
59023XAM8    1
59023XAN6    1
59024FAA2    1
59024FAB0    1
59024FAC8    1
59024FAD6    1
59024FAE4    1
59024FAF1    1
59024FAG9    1
59024FAH7    1
59024FAJ3    1
59024FAK0    1
59024FAL8    1
59024FAM6    1
59024FAN4    1
59024FAP9    1
59024FAQ7    1
00252FAA9    1
Length: 30855, dtype: int64

In [60]:
no_duplicated_data.loc[no_duplicated_data['CUSIP']=='126671Y75']

Unnamed: 0,PID,Prospectus,Class,norm_class,Name,Current_Balance,Zero-Balance Payment Period Number,Sum Principle Paid,MTG ORIG AMT,Maturity,...,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE,Year
7161,FC_1095,COUNTRYWIDE_1313077_0000950136-04-004590.htm,1A,A,CWL 2004-1 1A,47.9382,paying,1468.7518,1516.69,5/25/2034,...,Aa2,Aaa,AA+,0.0,0.0,0\0\0,MEY,ME,0,2004
9044,FC_2318,COUNTRYWIDE_1312966_0000950136-04-004574.htm,1A,A,CWL 2004-1 1A,47.9382,paying,1468.7518,1516.69,5/25/2034,...,Aa2,Aaa,AA+,0.0,0.0,0\0\0,MEY,ME,0,2004
9396,FC_2566,COUNTRYWIDE_1304962_0000950136-04-003232.htm,1A,A,CWL 2004-1 1A,47.9382,paying,1468.7518,1516.69,5/25/2034,...,Aa2,Aaa,AA+,0.0,0.0,0\0\0,MEY,ME,0,2004
10999,FC_3476,COUNTRYWIDE_1310068_0000950136-04-004170.htm,1A,A,CWL 2004-1 1A,47.9382,paying,1468.7518,1516.69,5/25/2034,...,Aa2,Aaa,AA+,0.0,0.0,0\0\0,MEY,ME,0,2004


In [61]:
no_duplicated_data.loc[no_duplicated_data['CUSIP']=='1266712F2']

Unnamed: 0,PID,Prospectus,Class,norm_class,Name,Current_Balance,Zero-Balance Payment Period Number,Sum Principle Paid,MTG ORIG AMT,Maturity,...,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE,Year
7165,FC_1095,COUNTRYWIDE_1313077_0000950136-04-004590.htm,AR,A,CWL 2004-1 AR,0.0,1,0.0001,0.0001,3/25/2004,...,WR,Aaa,NR,,,,MED,ME,0,2004
9048,FC_2318,COUNTRYWIDE_1312966_0000950136-04-004574.htm,AR,A,CWL 2004-1 AR,0.0,1,0.0001,0.0001,3/25/2004,...,WR,Aaa,NR,,,,MED,ME,0,2004
9400,FC_2566,COUNTRYWIDE_1304962_0000950136-04-003232.htm,AR,A,CWL 2004-1 AR,0.0,1,0.0001,0.0001,3/25/2004,...,WR,Aaa,NR,,,,MED,ME,0,2004
11003,FC_3476,COUNTRYWIDE_1310068_0000950136-04-004170.htm,AR,A,CWL 2004-1 AR,0.0,1,0.0001,0.0001,3/25/2004,...,WR,Aaa,NR,0.0,,,MED,ME,0,2004
