In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# Request the 'retina' figure format for figures rendered in the notebook.
# NOTE(review): recent IPython releases deprecate importing this helper from
# IPython.display (it moved to the matplotlib_inline package) — confirm against
# the installed IPython version before upgrading.
set_matplotlib_formats('retina')

This notebook shows some basic statistics about securities labeled as NC1.

## Load Data

In [2]:
# Load the yearly extracts trimmed_2002_v2.csv ... trimmed_2008_v2.csv and stack
# them into a single frame.
# The per-file frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
# Row labels are kept exactly as read from each file (no ignore_index), which
# matches what the append-based loop produced, so index values repeat across files.
frames = []
for n in range(2,9):
    file_name = 'trimmed_'+ str(2000+n) +'_v2.csv'
    frames.append(pd.read_csv(file_name))
data = pd.concat(frames)

In [3]:
# Subset of `data` whose Label is exactly 'NC1(failing)'
nc1 = data[data['Label'].eq('NC1(failing)')]

## Data Validation

In [4]:
# Report the size of the full data set, the size of the NC1 subset,
# and the NC1 share of all rows as a percentage.
print('we have {} rows of data'.format(len(data)))
print('we have {} rows of NC1'.format(len(nc1)))
print('percentage: {:.2f}%'.format(len(nc1) / len(data) * 100))

we have 24597 rows of data
we have 3856 rows of NC1
percentage: 15.68%


In [5]:
# Sanity check: the per-(Year, Label) group sizes must add up to the number of
# NC1 rows. groupby leaves out rows whose key is missing, so this comes out
# False if any NC1 row has no Year.
# size() counts rows per group and does not depend on which columns are
# selected, so the former ['CUSIP','PID'] subscript is dropped: subscripting a
# groupby with a tuple of keys is rejected by pandas >= 2.0.
# The printed message is left unchanged so it matches the recorded output.
valid = nc1.groupby(by=['Year','Label']).size().sum() == nc1.shape[0]
print("Does nc1.groupby(by=['Year','Label'])['CUSIP','PID'] equal number of rows in nc1? {}".format(valid))

Does nc1.groupby(by=['Year','Label'])['CUSIP','PID'] equal number of rows in nc1? True


In [6]:
# Number of NC1 rows per (Year, Label) group.
# size() is independent of any column selection, so the former ['CUSIP','PID']
# tuple subscript (rejected by pandas >= 2.0) is dropped.
print('Number of NC1 in given year. Note:2008 does not have NC1')
nc1.groupby(by=['Year','Label']).size()

Number of NC1 in given year. Note:2008 does not have NC1


Year  Label       
2002  NC1(failing)       5
2004  NC1(failing)     461
2005  NC1(failing)    1402
2006  NC1(failing)       8
2007  NC1(failing)    1980
dtype: int64

In [7]:
# Share of NC1 rows among all rows of the same year: NC1 group sizes divided
# by the per-Year row counts of the full data set. The recorded output is
# indexed by (Year, Label), i.e. the division is matched up on Year.
# size() is independent of any column selection, so the former ['CUSIP','PID']
# tuple subscripts (rejected by pandas >= 2.0) are dropped on both operands.
print('Ratio of NC1/All in given year:')
nc1.groupby(by=['Year','Label']).size() / data.groupby(by=['Year']).size()

Ratio of NC1/All in given year:


Year  Label       
2002  NC1(failing)    0.001805
2004  NC1(failing)    0.103596
2005  NC1(failing)    0.279561
2006  NC1(failing)    0.001990
2007  NC1(failing)    0.444345
dtype: float64

In [8]:
# Cross-check: the rows selected with Label != 'NC1(failing)' must be exactly
# the rows left over once the NC1 subset is taken out of the full data set.
valid = len(data[data['Label'].ne('NC1(failing)')]) == len(data) - len(nc1)
print("Is data.loc[data['Label'] != 'NC1(failing)'] valid for geting non-NC1? {}".format(valid))

Is data.loc[data['Label'] != 'NC1(failing)'] valid for geting non-NC1? True


## Explore NC1

In [9]:
# One entry per PID (its number of distinct CUSIPs among the NC1 rows);
# counting those entries gives the number of PIDs that have NC1 rows.
count = (
    nc1.groupby('PID')['CUSIP']
    .nunique()
    .count()
)
print('There are ', count, ' of prospectus that have NC1 with unique CUSIP', sep='')
# Average number of NC1 rows per such PID
print("On average {:.2f} NC1 per prospectus that has NC1".format(len(nc1) / count))

There are 669 of prospectus that have NC1 with unique CUSIP
On average 5.76 NC1 per prospectus that has NC1


In [10]:
#These two differ because CUSIP is not unique: size() counts every row per PID, while nunique() counts only the distinct CUSIPs per PID
#nc1.groupby('PID')['CUSIP'].size().sum()
#nc1.groupby('PID')['CUSIP'].nunique()

In [11]:
# How many of the most frequent MTG_TRANCHE_TYP_LONG values to display
show_top = 5
print('Top {} of the MTG_TRANCHE_TYP_LONG among NC1 are:'.format(show_top))
# Rows per MTG_TRANCHE_TYP_LONG value among NC1, largest first.
# size() is independent of any column selection, so the former ['CUSIP','PID']
# tuple subscript (rejected by pandas >= 2.0) is dropped.
nc1.groupby(['MTG_TRANCHE_TYP_LONG']).size().sort_values(ascending=False).head(show_top)

Top 5 of the MTG_TRANCHE_TYP_LONG among NC1 are:


MTG_TRANCHE_TYP_LONG
SUB,CSTR,NAS            1115
SUB,NAS                  469
MEZ,FLT,STEP,IRC         459
MEZ,FLT,STEP             166
MEZ,FLT,STEP,IRC,NAS     163
dtype: int64

In [12]:
print('Top {} of the MTG_TRANCHE_TYP_LONG among non-NC1s are:'.format(show_top))
# Rows per MTG_TRANCHE_TYP_LONG value among everything that is not NC1,
# largest first. size() is independent of any column selection, so the former
# ['CUSIP','PID'] tuple subscript (rejected by pandas >= 2.0) is dropped.
data.loc[data['Label'] != 'NC1(failing)'].groupby(['MTG_TRANCHE_TYP_LONG']).size().sort_values(ascending=False).head(show_top)

Top 5 of the MTG_TRANCHE_TYP_LONG among non-NC1s are:


MTG_TRANCHE_TYP_LONG
SUB,CSTR,NAS        1720
SEQ,AS              1468
MEZ,FLT,STEP,IRC    1466
SUB,NAS             1335
FLT,STEP,IRC         920
dtype: int64

[See here for more info about MTG_TRANCHE_TYP](https://docs.google.com/spreadsheets/d/1MOwPnTr2owqPoJNy73U7UEc3z1RvtzELOCM0ZFxBJU8/edit?usp=sharing)

In [13]:
# Sum of the 'MTG ORIG AMT' column over the NC1 rows and over all other rows
nc1_total = nc1.loc[:, 'MTG ORIG AMT'].sum()
nonNC1_total = data.loc[data['Label'].ne('NC1(failing)'), 'MTG ORIG AMT'].sum()

# Report both sums, their total, and the NC1 share of that total in percent
print('Sum of MTG ORIG AMT among NC1 = {:.2f}'.format(nc1_total))
print('Sum of MTG ORIG AMT among non-NC1 = {:.2f}'.format(nonNC1_total))
print('Sum of MTG ORIG AMT among all = {:.2f}'.format(nc1_total+nonNC1_total))
print(
    'MTG ORIG AMT among NC1 / MTG ORIG AMT of all = {:.2f}%'.format(
        nc1_total/(nc1_total+nonNC1_total)*100
    )
)

Sum of MTG ORIG AMT among NC1 = 30423.10
Sum of MTG ORIG AMT among non-NC1 = 1387799.87
Sum of MTG ORIG AMT among all = 1418222.97
MTG ORIG AMT among NC1 / MTG ORIG AMT of all = 2.15%


In [14]:
# Summary statistics of the 'MTG ORIG AMT' column, restricted to the NC1 rows
print('Desciption of MTG ORIG AMT among NC1:')
nc1.loc[:, 'MTG ORIG AMT'].describe()

Desciption of MTG ORIG AMT among NC1:


count    3856.000000
mean        7.889809
std        14.696035
min         0.000100
25%         1.828500
50%         4.258000
75%         9.924000
max       560.470000
Name: MTG ORIG AMT, dtype: float64

## To do
- Look into Bloomberg (paydown information) <br>
An API was found; Excel on a Bloomberg terminal can be used to pull the HIST_UNSUPPORTED_RISK_SHORTFALL data quickly
- Look into why payments stop suddenly instead of gradually decreasing and then stopping


## Finding duplicated CUSIP
CUSIP values are duplicated in the dataset; a row is uniquely identified only by the combination of CUSIP and PID. Duplicated CUSIPs can be found as follows:

The following line of code shows that CUSIP is not unique in the dataset. `size` counts every row in a group, whereas `nunique` counts only the distinct values; both are handy in different situations. A `False` in the result means the two counts differ for that label.

In [15]:
# Per Label: True when the number of rows equals the number of distinct CUSIPs
data.groupby('Label')['CUSIP'].size().eq(
    data.groupby('Label')['CUSIP'].nunique()
)

Label
0                              False
1.4                            False
FE                             False
IOfailing                       True
IOpassMED                      False
IOpassMEY                      False
MED                            False
MEY                            False
NC1(failing)                   False
NC2(z>1 nsf)                   False
NC3(z>1, not paid off, nsf)    False
NMEm                           False
NMEs                            True
Name: CUSIP, dtype: bool

Here is how to find which CUSIPs are duplicated. (Only the top five and bottom five results are shown; all of them have a count > 1.) The series can be exported with a pandas function such as `Series.to_csv`.

In [16]:
# Rows per CUSIP, largest count first, keeping only CUSIPs that occur more than once
series = (
    data.groupby(by=['CUSIP'])
    .size()
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 1]
)

In [17]:
# First five entries, i.e. the CUSIPs with the highest row counts
series.head(n=5)

CUSIP
94974SAB9    3
126671Y91    3
05948KAS8    3
1266712B1    3
1266712F2    3
dtype: int64

In [18]:
# Last five entries, i.e. the lowest counts that are still above 1
series.tail(n=5)

CUSIP
576434HV9    2
576434HU1    2
576434HT4    2
576433QQ2    2
576433QR0    2
dtype: int64