In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# Request the 'retina' format for inline matplotlib figures.
# NOTE(review): newer IPython releases deprecate IPython.display.set_matplotlib_formats
# in favour of matplotlib_inline.backend_inline — confirm against the installed version.
set_matplotlib_formats('retina')

This notebook shows some basic statistics about securities labeled as NC1.

## Load Data

In [2]:
# Read the yearly extracts trimmed_2002_v3_USE.csv ... trimmed_2007_v3_USE.csv
# and stack them into a single frame with a fresh 0..N-1 index.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0, and calling it inside the loop re-copied all
# previously loaded rows on every iteration.
frames = []
for n in range(2, 8):
    file_name = 'trimmed_' + str(2000 + n) + '_v3_USE.csv'
    frames.append(pd.read_csv(file_name))
data = pd.concat(frames, ignore_index=True)

In [3]:
# Keep only the rows whose Label is exactly 'NC1(failing)'.
nc1 = data[data['Label'].eq('NC1(failing)')]

## Data Validation

In [4]:
# Report the size of the full dataset, the size of the NC1 subset,
# and the share of all rows that are NC1.
total_rows, nc1_rows = len(data), len(nc1)
print('we have {} rows of data'.format(total_rows))
print('we have {} rows of NC1'.format(nc1_rows))
print('percentage: {:.2f}%'.format(nc1_rows/total_rows*100))

we have 30944 rows of data
we have 0 rows of NC1
percentage: 0.00%


In [5]:
# Sanity check: the per-(Year, Label) group sizes must add up to the number of
# rows in nc1.
# The columns are selected with a list ([['CUSIP','PID']]); indexing a groupby
# with a bare tuple of names (['CUSIP','PID']) raises on pandas >= 2.0.
valid = nc1.groupby(by=['Year','Label'])[['CUSIP','PID']].size().sum() == nc1.shape[0]
print("Does nc1.groupby(by=['Year','Label'])['CUSIP','PID'] equal number of rows in nc1? {}".format(valid))

KeyError: 'Year'

In [None]:
# Sanity check: the per-(Year, Label) group sizes must add up to the number of
# rows in data.
# The columns are selected with a list ([['CUSIP','PID']]); indexing a groupby
# with a bare tuple of names (['CUSIP','PID']) raises on pandas >= 2.0.
valid = data.groupby(by=['Year','Label'])[['CUSIP','PID']].size().sum() == data.shape[0]
print("Does data.groupby(by=['Year','Label'])['CUSIP','PID'] equal number of rows in data? {}".format(valid))
if (valid): print("This ensures ['CUSIP','PID'] shows uniqueness.")

In [None]:
# Count NC1 rows for each (Year, Label) combination.
# List-based column selection is used because tuple selection on a groupby
# (['CUSIP','PID']) raises on pandas >= 2.0.
print('Number of NC1 in given year. Note:2008 does not have NC1')
nc1.groupby(by=['Year','Label'])[['CUSIP','PID']].size()

In [None]:
# NC1 row count per (Year, Label) divided by the total row count of the same Year.
# List-based column selection is used on both sides because tuple selection on
# a groupby (['CUSIP','PID']) raises on pandas >= 2.0.
print('Ratio of NC1/All in given year:')
nc1.groupby(by=['Year','Label'])[['CUSIP','PID']].size()/data.groupby(by=['Year'])[['CUSIP','PID']].size()

In [None]:
# Cross-check the non-NC1 filter: the rows whose Label differs from
# 'NC1(failing)' should be exactly the rows left once the NC1 subset is removed.
non_nc1_rows = len(data[data['Label'].ne('NC1(failing)')])
valid = non_nc1_rows == len(data) - len(nc1)
print("Is data.loc[data['Label'] != 'NC1(failing)'] valid for geting non-NC1? {}".format(valid))

## Explore NC1

In [None]:
# Number of PID groups present in the NC1 subset, and the average number of
# NC1 rows per such PID.
cusips_per_pid = nc1.groupby('PID')['CUSIP'].nunique()
count = cusips_per_pid.count()
print('There are ' + str(count) + ' of prospectus that have NC1 with unique CUSIP')
print("On average {:.2f} NC1 per prospectus that has NC1".format(len(nc1)/count))

In [None]:
# These two differ because CUSIP is not unique: size() counts rows, nunique() counts distinct CUSIPs
#nc1.groupby('PID')['CUSIP'].size().sum()
#nc1.groupby('PID')['CUSIP'].nunique()

In [None]:
# Number of most frequent tranche types to display.
show_top = 5
print('Top {} of the MTG_TRANCHE_TYP_LONG among NC1 are:'.format(show_top))
# Row count per MTG_TRANCHE_TYP_LONG value within NC1, largest first.
# List-based column selection is used because tuple selection on a groupby
# (['CUSIP','PID']) raises on pandas >= 2.0.
nc1.groupby(['MTG_TRANCHE_TYP_LONG'])[['CUSIP','PID']].size().sort_values(ascending=False).head(show_top)

In [None]:
print('Top {} of the MTG_TRANCHE_TYP_LONG among non-NC1s are:'.format(show_top))
# Row count per MTG_TRANCHE_TYP_LONG value among rows not labeled NC1, largest first.
# List-based column selection is used because tuple selection on a groupby
# (['CUSIP','PID']) raises on pandas >= 2.0.
data.loc[data['Label'] != 'NC1(failing)'].groupby(['MTG_TRANCHE_TYP_LONG'])[['CUSIP','PID']].size().sort_values(ascending=False).head(show_top)

[See here for more info about MTG_TRANCHE_TYP](https://docs.google.com/spreadsheets/d/1MOwPnTr2owqPoJNy73U7UEc3z1RvtzELOCM0ZFxBJU8/edit?usp=sharing)

In [None]:
# Sum the 'MTG ORIG AMT' column for NC1 rows and for all other rows, then
# report both totals, their sum, and NC1's share of the combined amount.
nc1_total = nc1['MTG ORIG AMT'].sum()
nonNC1_total = data.loc[data['Label'] != 'NC1(failing)', 'MTG ORIG AMT'].sum()
all_total = nc1_total + nonNC1_total
print('Sum of MTG ORIG AMT among NC1 = {:.2f}'.format(nc1_total))
print('Sum of MTG ORIG AMT among non-NC1 = {:.2f}'.format(nonNC1_total))
print('Sum of MTG ORIG AMT among all = {:.2f}'.format(all_total))
print('MTG ORIG AMT among NC1 / MTG ORIG AMT of all = {:.2f}%'.format(nc1_total/all_total*100))

In [None]:
# Summary statistics of the 'MTG ORIG AMT' column within the NC1 subset.
print('Desciption of MTG ORIG AMT among NC1:')
nc1.loc[:, 'MTG ORIG AMT'].describe()

## To do
- Look into Bloomberg (Paydown Information) <br>
API found; Excel on a Bloomberg terminal can be used to get the HIST_UNSUPPORTED_RISK_SHORTFALL data quickly
- Look into why payments suddenly stop instead of gradually decreasing and then stopping


## Finding duplications
CUSIPs are duplicated in the dataset. Uniqueness is determined by the combination of CUSIP and PID. Duplicated CUSIPs can be found as follows:

This line of code shows that CUSIP is not unique in the dataset. `size` counts the total number of rows, while `nunique` counts only the distinct values. Both methods are handy in different situations. A `False` in the result means that the two counts differ for that label.

In [None]:
# Per Label: True when the number of rows equals the number of distinct CUSIPs.
cusip_by_label = data.groupby('Label')['CUSIP']
cusip_by_label.size() == cusip_by_label.nunique()

In [None]:
# For every (CUSIP, PID) group, count the distinct CUSIP and PID values and sum
# those counts over all groups, giving one total per column.
# The columns are selected with a list; indexing a groupby with a bare tuple of
# names (['CUSIP','PID']) raises on pandas >= 2.0.
CUSIP_PID_count = data.groupby(by=['CUSIP','PID'])[['CUSIP','PID']].nunique().sum()
CUSIP_PID_count

In [None]:
# Compare the number of rows with the (CUSIP, PID) total computed earlier.
print('CUSIP and PID agree but does not equal to number of rows. We have {} rows of data'.format(data.shape[0]))
# .iloc[0] reads the first entry by position; CUSIP_PID_count is indexed by
# column name, and treating an integer key as a position there is deprecated.
print('The difference is {}'.format(data.shape[0]-CUSIP_PID_count.iloc[0]))

Duplicated rows are found below

In [None]:
# Row count per (CUSIP, PID) pair, largest first; a count above 1 means the pair repeats.
(
    data.groupby(['CUSIP', 'PID'])
    .size()
    .sort_values(ascending=False)
    .head(5)
)

These two rows are the same

In [None]:
# Show every row whose CUSIP is '576434DB7'.
data[data['CUSIP'].eq('576434DB7')]

`data.duplicated()` flags duplicated rows: `True` marks a row that repeats an earlier one

In [None]:
# Duplicate flags sorted so that True values (duplicated rows) come first.
duplicate_flags = data.duplicated()
duplicate_flags.sort_values(ascending=False).head(5)

In [None]:
# keep=False flags every member of a duplicate group, including the first occurrence.
all_duplicate_rows = data[data.duplicated(keep=False)]
num_duplicated = len(all_duplicate_rows)
print('keep=false shows all duplicated instace so there are {} rows that are duplicated.'.format(num_duplicated))

In [None]:
# duplicated() with its default keep='first' flags every repeat except the
# first occurrence, i.e. the rows that drop_duplicates() removes.
num_to_delete = data.loc[data.duplicated()].shape[0]
# The two adjacent literals are concatenated by Python, so the first one needs a
# trailing space; without it the message was printed as "...rowsthat are...".
print('No keep parameter shows duplicated instace that we should delete so there are {} rows '
      'that are duplications that should be deleted.'.format(num_to_delete))
print('That will give us {}'.format(num_duplicated-num_to_delete))

Store the de-duplicated rows in a new variable, `no_duplicated_data`, using `drop_duplicates()`

In [None]:
# De-duplicated copy of data: rows are compared on all columns and the first
# occurrence of each duplicate group is kept (the defaults, spelled out).
no_duplicated_data = data.drop_duplicates(subset=None, keep='first')

In [None]:
# Rows in excess of the (CUSIP, PID) total. .iloc[0] reads the first entry by
# position; treating an integer key as a position on this label-indexed Series
# is deprecated.
excess_rows = data.shape[0] - CUSIP_PID_count.iloc[0]
# NOTE(review): num_duplicated - num_to_delete is the number of distinct rows that
# have duplicates; it equals the number of rows dropped only when every
# duplicate occurs exactly twice — confirm this is the intended comparison.
# A zero difference means the numbers agree; the original branches were inverted
# and reported a mismatch in exactly that case.
if excess_rows - (num_duplicated - num_to_delete) == 0:
    print('Total Number Match!!!!')
else:
    print('No!! Data rows and CUSIP_PID still not match')

In [None]:
# After de-duplication: row count per (CUSIP, PID) pair, largest first.
(
    no_duplicated_data.groupby(['CUSIP', 'PID'])
    .size()
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# Number of distinct PIDs per CUSIP, largest first.
(
    no_duplicated_data.groupby(['CUSIP'])['PID']
    .nunique()
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# Show the de-duplicated rows whose CUSIP is '05948KXX2'.
no_duplicated_data[no_duplicated_data['CUSIP'].eq('05948KXX2')]

In [None]:
# Show the de-duplicated rows whose CUSIP is '576434AH7'.
no_duplicated_data[no_duplicated_data['CUSIP'].eq('576434AH7')]

In [None]:
# Row count per (CUSIP, PID, Current_Balance) combination, largest first.
(
    no_duplicated_data.groupby(['CUSIP', 'PID', 'Current_Balance'])
    .size()
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# Show the de-duplicated rows whose CUSIP is '81744Mar3'.
no_duplicated_data[no_duplicated_data['CUSIP'].eq('81744Mar3')]

In [None]:
# Row count per (CUSIP, PID, Current_Balance, norm_class) combination, largest first.
(
    no_duplicated_data.groupby(['CUSIP', 'PID', 'Current_Balance', 'norm_class'])
    .size()
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# Show the de-duplicated rows whose CUSIP is '576434AE4'.
no_duplicated_data[no_duplicated_data['CUSIP'].eq('576434AE4')]

In [None]:
# Row count per (CUSIP, PID, Current_Balance, norm_class, MTG_TRANCHE_TYP_LONG)
# combination, largest first.
(
    no_duplicated_data.groupby(
        ['CUSIP', 'PID', 'Current_Balance', 'norm_class', 'MTG_TRANCHE_TYP_LONG']
    )
    .size()
    .sort_values(ascending=False)
    .head(5)
)

Here is how to find which CUSIPs are duplicated. (Only the top five and bottom five results are shown; all have counts > 1.) The series can be exported with a pandas export function such as `to_csv`.

In [None]:
#Get series of CUSIP and get count > 1
series = data.groupby(by=['CUSIP']).size().sort_values(ascending=False)
series = series.loc[series > 1]

In [None]:
# First five entries of the duplicated-CUSIP counts.
series.head(5)

In [None]:
# Last five entries of the duplicated-CUSIP counts.
series.tail(5)