In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# Ask the inline backend for 'retina' (high-resolution) figure output.
# NOTE(review): IPython.display.set_matplotlib_formats is deprecated in recent
# IPython releases in favour of matplotlib_inline.backend_inline.set_matplotlib_formats
# — confirm against the installed IPython version before upgrading.
set_matplotlib_formats('retina')

from pathlib import Path

## Load Data

In [2]:
root_folder = Path('../../Data/Labeled Securities/')

# Read one CSV per year (2002-2008), tag every row with its year as a string,
# and concatenate once at the end. Collecting the frames in a list avoids
# DataFrame.append (removed in pandas 2.0) and the repeated copying caused by
# growing a DataFrame inside the loop.
yearly_frames = []
for n in range(2, 9):
    year = str(2000 + n)
    file_name = 'trimmed_' + year + '_v3_USE.csv'
    df = pd.read_csv(root_folder / file_name)
    # A scalar assignment broadcasts the year to every row of this file.
    yearly_frames.append(df.assign(Year=year))

# ignore_index=True gives the combined frame a fresh 0..N-1 index.
data = pd.concat(yearly_frames, ignore_index=True)

## Finding duplications


#### Drop duplicated rows before checking duplicated CUSIP

In [3]:
# Report the row count, remove fully identical rows, then report it again.
rows = len(data)
print('Before drop: {}'.format(rows))

data = data.drop_duplicates()

# `rows` now holds the post-deduplication count; later cells use it as the
# denominator for their percentages.
rows = len(data)
print('After drop: {}'.format(rows))

Before drop: 31811
After drop: 31648


In [4]:
# Number of rows per CUSIP, largest groups first; display the top three.
rows_per_cusip = data.groupby('CUSIP').size()
rows_per_cusip.sort_values(ascending=False).head(3)

CUSIP
1266712F2    4
126671Y75    4
126671Y91    4
dtype: int64

In [5]:
# Count the CUSIPs that appear on more than one row and express that count
# relative to the number of rows in the deduplicated dataset.
rows_per_cusip = data.groupby('CUSIP').size()
count = (rows_per_cusip > 1).sum()
percentage = count / rows * 100

message = "{} of CUSIP are duplicated which is {:.2f}% of dataset"
print(message.format(count, percentage))

742 of CUSIP are duplicated which is 2.34% of dataset


In [6]:
# Rows per full identifying key, largest groups first; display the top three.
key_columns = ['PID','Prospectus','Class','Name','CUSIP','Year']
rows_per_key = data.groupby(by=key_columns).size()
rows_per_key.sort_values(ascending=False).head(3)

PID      Prospectus                              Class  Name             CUSIP      Year
FC_2478  MASTR_1274601_0000950136-03-003206.txt  1A1    MALT 2003-9 1A1  576434KU7  2003    2
                                                 4A1    MALT 2003-9 4A1  576434KX1  2003    2
                                                 B2     MALT 2003-9 B2   576434LP7  2003    2
dtype: int64

In [7]:
# Count the key combinations that occur more than once and relate that count
# to the number of rows in the deduplicated dataset.
key_columns = ['PID','Prospectus','Class','Name','CUSIP','Year']
rows_per_key = data.groupby(by=key_columns).size()
count = (rows_per_key > 1).sum()
percentage = count / rows * 100

message = "{} of ['PID','Prospectus','Class','Name','CUSIP','Year'] are duplicated which is {:.2f}% of dataset"
print(message.format(count, percentage))

16 of ['PID','Prospectus','Class','Name','CUSIP','Year'] are duplicated which is 0.05% of dataset


In [8]:
# Same check with a narrower key: PID, Class and CUSIP only.
key_columns = ['PID','Class','CUSIP']
rows_per_key = data.groupby(by=key_columns).size()
count = (rows_per_key > 1).sum()
percentage = count / rows * 100

message = "{} of ['PID','Class','CUSIP'] are duplicated which is {:.2f}% of dataset"
print(message.format(count, percentage))

16 of ['PID','Class','CUSIP'] are duplicated which is 0.05% of dataset


In [9]:
# Same check with the narrowest key: PID and CUSIP only.
key_columns = ['PID','CUSIP']
rows_per_key = data.groupby(by=key_columns).size()
count = (rows_per_key > 1).sum()
percentage = count / rows * 100

message = "{} of ['PID','CUSIP'] are duplicated which is {:.2f}% of dataset"
print(message.format(count, percentage))

20 of ['PID','CUSIP'] are duplicated which is 0.06% of dataset


### Some examples

In [10]:
# Flag each (PID, CUSIP) pair that occurs on more than one row, sort so the
# True flags come first, and display five of them as examples.
rows_per_pair = data.groupby(by=['PID','CUSIP']).size()
is_duplicated = rows_per_pair > 1
is_duplicated.sort_values(ascending=False).head(5)

PID      CUSIP    
FC_2478  576434LP7    True
FC_1509  576434AH7    True
FC_2478  576434KV5    True
FC_1509  576434AN4    True
         576434AL8    True
dtype: bool

In [11]:
# Show every row for one (PID, CUSIP) pair so the repeated records can be compared.
pid_matches = data['PID'] == 'FC_2478'
cusip_matches = data['CUSIP'] == '576434LP7'
data.loc[pid_matches & cusip_matches]

Unnamed: 0,PID,Prospectus,Class,norm_class,Name,Current_Balance,Zero-Balance Payment Period Number,Sum Principle Paid,MTG ORIG AMT,Maturity,...,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE,Year
4929,FC_2478,MASTR_1274601_0000950136-03-003206.txt,B2,B,MALT 2003-9 B2,0.3853,paying,2.6747,3.06,1/25/2034,...,,,CC+,0.0,0.0,-4153\4153\0,MEY,ME,0,2003
4951,FC_2478,MASTR_1274601_0000950136-03-003206.txt,B2,B,MALT 2003-9 B2,0.3671,paying,2.7646,3.06,1/25/2034,...,,,CC+,0.0,0.0,-4153\4153\445.13,NMEm,NME,0,2003


In [12]:
# Show every row for a second (PID, CUSIP) pair so the repeated records can be compared.
pid_matches = data['PID'] == 'FC_1509'
cusip_matches = data['CUSIP'] == '576434AH7'
data.loc[pid_matches & cusip_matches]

Unnamed: 0,PID,Prospectus,Class,norm_class,Name,Current_Balance,Zero-Balance Payment Period Number,Sum Principle Paid,MTG ORIG AMT,Maturity,...,Moody Rating,Initial Moody Rating,Bloomberg Composite,HCLB,MTG INT SHRTFLL,HIST INTRST SHRTFLL,Label,norm_label,USE,Year
1808,FC_1509,MASTR_1179092_0000950136-02-002215.txt,AX,A,MALT 2002-1 AX,0.2771,paying,0.0,16.9397,2032-07-25,...,,,NR,0.0,0.0,0\0\0,IOpassMEY,ME,0,2002
1831,FC_1509,MASTR_1179092_0000950136-02-002215.txt,AX,A,MALT 2002-1 AX,0.2802,paying,0.0,16.9397,2032-07-25,...,,,NR,0.0,0.0,0\0\0,IOpassMEY,ME,0,2002
