In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

## Load Data

In [2]:
join_data = pd.DataFrame([])
for n in range(2,8):
    file_name = 'FI_TW_Prop_'+ str(2000+n) +'_v3.csv'
    join_data = join_data.append(pd.read_csv(file_name),ignore_index=True)

In [3]:
parent_data = pd.read_pickle('parent_data_Label_Securities_v3_USE_2002-2007')

## Data Validation

In [59]:
print('we have {} rows of join_data'.format(join_data.shape[0]))
print('we have {} rows of parent_data'.format(parent_data.shape[0]))
if ((join_data.shape[0]-parent_data.shape[0]) == 0) : print('same size')
else : print('different size')
print('we use {:.2f}% of data'.format(join_data['USE'].mean()*100))

we have 30944 rows of join_data
we have 30944 rows of parent_data
same size
we use 88.36% of data


### Duplicated Rows

In [7]:
join_data.duplicated().sort_values(ascending=False).head()

4412    True
3533    True
4420    True
4419    True
4418    True
dtype: bool

In [8]:
num_duplicated = join_data.loc[join_data.duplicated(keep=False)].shape[0]
print('keep=false shows all duplicated instace so there are {} rows that are duplicated.'.format(num_duplicated))

keep=false shows all duplicated instace so there are 326 rows that are duplicated.


In [66]:
a = join_data.loc[join_data.duplicated(keep=False)]['USE'] > 0
a.sum()
print('We use {} of all duplicated rows'.format(a.sum()))

We use 0 of all duplicated rows


In [9]:
num_to_delete = join_data.loc[join_data.duplicated()].shape[0]
print('No keep parameter shows duplicated instace that we should delete so there are {} rows'
      'that are duplications that should be deleted.'.format(num_to_delete))
print('That will give us {}'.format(num_duplicated-num_to_delete))

No keep parameter shows duplicated instace that we should delete so there are 163 rowsthat are duplications that should be deleted.
That will give us 163


In [10]:
no_duplicated_join_data = join_data.drop_duplicates()

### Duplicated CUSIP

In [18]:
no_duplicated_join_data.shape

(30781, 365)

In [72]:
a = join_data.loc[join_data.duplicated(subset=['CUSIP'],keep=False)]['USE'] > 0
print('We use {} of duplicated CUSIP in join_data'.format(a.sum()))
b = join_data.loc[join_data.duplicated(subset=['CUSIP'])]['USE'] > 0
print('{} of them should be deleted'.format(b.sum()))

We use 108 of duplicated CUSIP in join_data
54 of them should be deleted


In [94]:
c = join_data.loc[join_data.duplicated(subset=['CUSIP'],keep=False)]
c.loc[c['USE'] == 1]

Unnamed: 0,PID,FI_1,FI_2,FI_3,FI_4,FI_5,FI_6,FI_7,FI_8,FI_9,...,MTG_TRANCHE_TYP_LONG,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE
16383,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"FLT,IRC,AD,SSNR,AS",Caa2,Aaa,DDD+,0.3476,0.0,0\0\0,NMEm,NME,1
16384,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"FLT,SUP,SSNR,AS",Caa2,Aaa,DDD+,0.3231,0.0,0\0\0,NMEm,NME,1
16385,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"INV,SUP,AS",Caa2,Aaa,DDD+,0.0294,0.0,0\0\0,NMEm,NME,1
16386,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"FLT,IRC,AD,SSNR,AS",Caa2,Aaa,DDD+,0.0000,0.0,0\0\0,MEY,ME,1
16387,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"SSNR,SSUP,NAS",WR,Aaa,NR,0.7015,0.0,0\0\0,FE,FE,1
16388,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"SSUP,NAS",NR,NR,NR,2.4213,,0\0\0,FE,FE,1
16389,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"IO,INV,NTL",Caa2,Aaa,NR,0.0000,0.0,0\0\0,IOpassMEY,ME,1
16390,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"SEQ,SSNR,AS",Caa2,Aaa,DDD+,0.0000,0.0,0\0\0,MEY,ME,1
16391,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"SSNR,NAS",Caa2,Aaa,DDD+,0.0000,0.0,0\0\0,MEY,ME,1
16392,FC_2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"SEQ,SSNR,AS",Caa2,Aaa,DDD+,0.0361,0.0,0\0\0,NMEm,NME,1


In [73]:
a = no_duplicated_join_data.loc[no_duplicated_join_data.duplicated(subset=['CUSIP'],keep=False)]['USE'] > 0
print('We use {} of duplicated CUSIP in no_duplicated_join_data'.format(a.sum()))
b = no_duplicated_join_data.loc[no_duplicated_join_data.duplicated(subset=['CUSIP'])]['USE'] > 0
print('{} of them should be deleted'.format(b.sum()))

We use 108 of duplicated CUSIP in no_duplicated_join_data
54 of them should be deleted


In [58]:
print('no_duplicated_join_data use {:.2f}% of data'.format(no_duplicated_join_data['USE'].mean()*100))

no_duplicated_join_data use 88.83% of data


In [41]:
a = no_duplicated_join_data.groupby(by=['CUSIP']).size() > 1
a.values.sum()
b = no_duplicated_join_data.groupby(by=['CUSIP']).size().sort_values(ascending=False)
c = b[0:a.values.sum()]
c.index

Index(['126671Y67', '126671Z33', '1266712B1', '1266712F2', '126671Y75',
       '126671Y83', '126671Z25', '126671Y91', '126671Z41', '126671Z58',
       ...
       '05948KFL8', '05948KFG9', '05948KFF1', '05948KFD6', '05948KFC8',
       '05948KFA2', '05948KFZ7', '05948KFN4', '05949CQP4', '05949CQN9'],
      dtype='object', name='CUSIP', length=726)

In [15]:
CUSIP_PID_count = no_duplicated_join_data.groupby(by=['CUSIP','PID'])['CUSIP','PID'].nunique().sum()
CUSIP_PID_count

CUSIP    30761
PID      30761
dtype: int64

In [17]:
print("Are there duplicated CUSIP and PID in the no_duplicated_join_data?")
if(CUSIP_PID_count[0] < no_duplicated_join_data.shape[0]): print('CUSIP:Yes')
if(CUSIP_PID_count[1] < no_duplicated_join_data.shape[0]): print('PID:Yes')

Are there duplicated CUSIP and PID in the no_duplicated_join_data?
CUSIP:Yes
PID:Yes


In [46]:
no_duplicated_join_data['CUSIP'].size

30781

In [45]:
no_duplicated_join_data.loc[no_duplicated_join_data['CUSIP'] == c.index]

ValueError: Lengths must match to compare

In [43]:
no_duplicated_join_data.loc[no_duplicated_join_data['CUSIP'] == '576434AJ3']

Unnamed: 0,PID,FI_1,FI_2,FI_3,FI_4,FI_5,FI_6,FI_7,FI_8,FI_9,...,MTG_TRANCHE_TYP_LONG,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE
487,FC_1509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"SUB,NAS",,,NR,0.0,0.0,0\0\0,MEY,ME,0
510,FC_1509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"SUB,NAS",,,NR,0.0,0.0,0\0\0,MEY,ME,0


In [12]:
no_duplicated_join_data.groupby(by=['CUSIP','PID','Current_Balance','norm_class','MTG_TRANCHE_TYP_LONG']).size().sort_values(ascending=False).head()

CUSIP      PID     Current_Balance  norm_class  MTG_TRANCHE_TYP_LONG
BCC2BQMX0  FC_704  14.2665          A           STEP,AFC,AS             1
126694YG7  FC_576  0.0000           B           SUB,CSTR,NAS            1
12669CE22  FC_538  0.0000           A           AD,AS                   1
12669CE30  FC_538  0.0000           A           AD,NAS                  1
12669CE48  FC_538  0.0000           A           AD,NAS                  1
dtype: int64