### 1. Setting Up

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import pandas as pd
import numpy as np
import os

# Record Linkage
import recordlinkage as rl
from recordlinkage.index import Block
from recordlinkage.preprocessing import clean

# Regular expression operations
import re

### 2. Preprocessing Data

In [3]:
df = pd.read_excel("data_with_more_match_IDs_2_2_20.xlsx")

In [4]:
columns_to_drop = [c for c in df.columns if 'Unnamed' in c]

In [5]:
df.drop(columns_to_drop, axis = 1, inplace = True)

In [6]:
print("There are {} columns in df.".format(len(df.columns)))
df.columns

There are 45 columns in df.


Index(['ID', 'Census.Year', 'State/Province_x', 'County_x', 'Place_x',
       'Household Joint ID', 'Joint ID for Matched Records', 'Last.Name',
       'First.Name', 'CalculatedBirthYear', 'Age', 'Sex_x',
       'Color..Race.or.Ethnicity', 'lat_x', 'long_x', 'address', 'MARITAL',
       'WARD', 'ROLL or Sheet#', 'PROFESSION_x', 'Notable_x', 'STREET',
       'PLACEOFBIRTH_x', 'RELIGION_x',
       'NOTE these only apply to narrative answers', 'LIVING W MALE FAMILY?',
       'LIVING W FEMALE FAMILY?', 'LIVING W MALE NONFAMILY?',
       'LIVING W FEMALE NONFAMILY?', 'Cannot Read', 'Cannot Write', 'Sick',
       'Relation to Head of Household',
       'Year of Immigration to Canada if an Immigrant', 'Date of Death',
       'Cause of Death', 'Rank (Military)', 'Enlistment Date',
       'Enlistment Place', 'Date Mustered Out', 'Year of this Record',
       'Last Name MATCH', 'First Name Match', 'Census Year Match',
       'Total of Matches'],
      dtype='object')

In [7]:
df['unique_id'] = df.index + 1

In [16]:
df.to_excel("20200225_MASTER_DATA.xlsx")

### 3. Duplications

In [143]:
df_dup_1 = pd.read_csv('duplications.csv')
df_dup_2 = pd.read_csv('duplications_5.csv')


In [144]:
df_dup_2['dup_pair'] = df_dup_2['dup_pair'] + 114

In [145]:
df_dup = pd.concat([df_dup_1, df_dup_2])

In [146]:
df_dup = df_dup.reset_index().drop(['index', 'Unnamed: 0'], axis = 1)

In [147]:
df_dup

Unnamed: 0,Age,CalculatedBirthYear,Cannot Read,Cannot Write,Cause of Death,Census Year Match,Census.Year,Color..Race.or.Ethnicity,County_x,Date Mustered Out,...,State/Province_x,Total of Matches,WARD,Year of Immigration to Canada if an Immigrant,Year of this Record,address,lat_x,long_x,unique_id,dup_pair
0,26.0,1835.0,,,,,1861.0,M,Hamilton,,...,Ontario,,,,,,,,796.0,0
1,26.0,1835.0,,,,100.0,1861.0,M,Hamilton,,...,Ontario,#REF!,,,,,,,827.0,0
2,27.0,1834.0,,,,,1861.0,B,Hamilton,,...,Ontario,,,,,,,,1687.0,1
3,27.0,1834.0,,,,0.0,1861.0,B,Hamilton,,...,Ontario,#REF!,,,,,,,1695.0,1
4,24.0,1837.0,,,,,1861.0,B,Hamilton,,...,Ontario,,,,,,,,1828.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1113,30.0,1850.0,,,,,1880.0,B,SUFFOLK,,...,MA,,,,,"BOSTON,SUFFOLK,MA",42.360082,-7.105888e+01,50755.0,556
1114,33.0,1847.0,3.0,2.0,1.0,,1880.0,Mulatto(BlackandWhite),,,...,NY,,0.0,0.0,,5,,6.090789e+09,50702.0,557
1115,33.0,1847.0,,,,,1880.0,M,KINGS,,...,NY,,,,,"19-WD;BROOKLYN,KINGS,NY",40.727098,-7.393684e+01,50768.0,557
1116,5.0,1875.0,0.0,0.0,3.0,,1880.0,Mulatto(BlackandWhite),,,...,NY,,0.0,2.0,,5,,6.090789e+09,50718.0,558


In [148]:
joint_id = [x for x in list(df_dup['Joint ID for Matched Records'].unique()) if pd.notnull(x)]

In [149]:
joint_id

[41.0,
 616.0,
 769.0,
 '720.0',
 '927.0',
 '427.0',
 '433.0',
 '445.0',
 '47.0',
 '447.0',
 '461.0',
 '462.0',
 '511.0',
 '527.0',
 '548.0',
 '562.0',
 '?',
 '899',
 '568',
 '583',
 '586',
 '590?',
 '594',
 '907',
 '972',
 '914',
 '915',
 '698',
 '720',
 '740',
 '759',
 '775',
 '783',
 '807',
 '812',
 '818',
 '833',
 '929',
 '852']

In [150]:
df_dup[df_dup['Joint ID for Matched Records'].isin(joint_id)][['Census.Year', 
                                                               'First.Name', 'Last.Name', 
                                                               'unique_id', 'dup_pair',
                                                               'Household Joint ID']]

Unnamed: 0,Census.Year,First.Name,Last.Name,unique_id,dup_pair,Household Joint ID
46,1880.0,Dora,Crosby,10569.0,23,
126,1910.0,LOUISA,LUCAS,28855.0,63,
202,1920.0,ALBERT,TAYLOR,43782.0,101,
203,1920.0,ALBERT,TAYLOR,43783.0,101,
258,1880.0,WILLIAM,BARTLET,2477.0,129,
288,1880.0,MARY,BLUME,3960.0,144,
290,1880.0,MARYE.,BLUME,3961.0,145,
348,1880.0,WMH,CLARK,8612.0,174,
378,1880.0,WILLIAM,COOKE,9742.0,189,
386,1880.0,JAMES,CURRIE,10880.0,193,


In [151]:
dup_id = df_dup[df_dup['Joint ID for Matched Records'].isin(joint_id)][['Census.Year', 
                                                               'First.Name', 'Last.Name', 
                                                               'unique_id', 'dup_pair',
                                                               'Household Joint ID']]['dup_pair']

NEED TO REASSIGN:
* 769

In [152]:
dup_id_1 = set(df_dup[df_dup['Joint ID for Matched Records'].isin(joint_id)]['dup_pair'])

In [153]:
dup_id_1

{23,
 63,
 101,
 129,
 144,
 145,
 174,
 189,
 193,
 197,
 199,
 206,
 207,
 229,
 244,
 270,
 274,
 283,
 287,
 295,
 308,
 311,
 322,
 326,
 381,
 383,
 385,
 400,
 427,
 441,
 466,
 476,
 484,
 507,
 511,
 517,
 521,
 532,
 533,
 542}

In [154]:
len(dup_id_1)

40

In [155]:
dup_id_all = set(df_dup['dup_pair'].to_list())

In [156]:
dup_id_all

{0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [157]:
len(dup_id_all)

559

In [158]:
dup_id_2 = dup_id_all.difference(dup_id_1)

In [159]:
len(dup_id_2)

519

In [160]:
df_dup_2 = df_dup[df_dup['dup_pair'].isin(dup_id_2)]

In [217]:
df_dup['unique_id'].value_counts().to_dict()

{42909.0: 2,
 50792.0: 2,
 42952.0: 2,
 50676.0: 2,
 43036.0: 2,
 49570.0: 2,
 49638.0: 2,
 50677.0: 2,
 49791.0: 2,
 24414.0: 1,
 42146.0: 1,
 4947.0: 1,
 13634.0: 1,
 24825.0: 1,
 32939.0: 1,
 24481.0: 1,
 40842.0: 1,
 21715.0: 1,
 33230.0: 1,
 25534.0: 1,
 40506.0: 1,
 8365.0: 1,
 18539.0: 1,
 42427.0: 1,
 47042.0: 1,
 49180.0: 1,
 32504.0: 1,
 5585.0: 1,
 2477.0: 1,
 21589.0: 1,
 27375.0: 1,
 33637.0: 1,
 15035.0: 1,
 26476.0: 1,
 25642.0: 1,
 13506.0: 1,
 35753.0: 1,
 24894.0: 1,
 48736.0: 1,
 39157.0: 1,
 47039.0: 1,
 47600.0: 1,
 4715.0: 1,
 43113.0: 1,
 40052.0: 1,
 18289.0: 1,
 22984.0: 1,
 26169.0: 1,
 25814.0: 1,
 19950.0: 1,
 34502.0: 1,
 46975.0: 1,
 43311.0: 1,
 23772.0: 1,
 22982.0: 1,
 25815.0: 1,
 38654.0: 1,
 50185.0: 1,
 29983.0: 1,
 25869.0: 1,
 8777.0: 1,
 23316.0: 1,
 33837.0: 1,
 47151.0: 1,
 50420.0: 1,
 30469.0: 1,
 10641.0: 1,
 17264.0: 1,
 8242.0: 1,
 44249.0: 1,
 3503.0: 1,
 32669.0: 1,
 22155.0: 1,
 38292.0: 1,
 10569.0: 1,
 19598.0: 1,
 6310.0: 1,
 16442.0

In [222]:
unique_id_to_consider = list({42909.0: 2,
 50792.0: 2,
 42952.0: 2,
 50676.0: 2,
 43036.0: 2,
 49570.0: 2,
 49638.0: 2,
 50677.0: 2}.keys())

In [223]:
unique_id_to_consider

[42909.0, 50792.0, 42952.0, 50676.0, 43036.0, 49570.0, 49638.0, 50677.0]

In [224]:
dup_id_3 = set(df_dup[df_dup['unique_id'].isin(unique_id_to_consider)]['dup_pair'].to_list())

In [225]:
dup_id_3

{467, 468, 469, 532, 533, 534, 553, 554, 555}

In [226]:
df_dup_3 = df_dup[df_dup['dup_pair'].isin(dup_id_3)]

In [227]:
df_dup_3

Unnamed: 0,Age,CalculatedBirthYear,Cannot Read,Cannot Write,Cause of Death,Census Year Match,Census.Year,Color..Race.or.Ethnicity,County_x,Date Mustered Out,...,State/Province_x,Total of Matches,WARD,Year of Immigration to Canada if an Immigrant,Year of this Record,address,lat_x,long_x,unique_id,dup_pair
934,28.0,1852.0,1.0,1.0,1.0,,1880.0,Black,,,...,NY,,0.0,0.0,,3,,5787627000.0,42909.0,467
935,28.0,1852.0,2.0,2.0,1.0,,1880.0,Black,,,...,NY,,0.0,0.0,,5,,6170205000.0,42952.0,467
936,28.0,1852.0,1.0,1.0,1.0,,1880.0,Black,,,...,NY,,0.0,0.0,,3,,5787627000.0,42909.0,468
937,28.0,1852.0,,,,,1880.0,B,CAYUGA,,...,NY,,,,,"9-WD;AUBURN,CAYUGA,NY",42.9438,-76.54354,43036.0,468
938,28.0,1852.0,2.0,2.0,1.0,,1880.0,Black,,,...,NY,,0.0,0.0,,5,,6170205000.0,42952.0,469
939,28.0,1852.0,,,,,1880.0,B,CAYUGA,,...,NY,,,,,"9-WD;AUBURN,CAYUGA,NY",42.9438,-76.54354,43036.0,469
1064,28.0,1852.0,0.0,0.0,5.0,,1880.0,Black,,,...,OH,,0.0,0.0,,6,,7552079000.0,49570.0,532
1065,28.0,1852.0,0.0,0.0,5.0,,1880.0,Black,,,...,OH,,221.0,0.0,,32,,7552093000.0,49638.0,532
1066,28.0,1852.0,0.0,0.0,5.0,,1880.0,Black,,,...,OH,,0.0,0.0,,6,,7552079000.0,49570.0,533
1067,28.0,1852.0,,,,,1880.0,B,LUCAS,,...,OH,,,,,"3-WD;TOLEDO,LUCAS,OH",41.638225,-83.60661,49791.0,533


In [228]:
dup_id_2 = dup_id_2.difference(dup_id_3)

In [229]:
len(dup_id_2)

512

In [230]:
df_dup_2 = df_dup[df_dup['dup_pair'].isin(dup_id_2)]

In [231]:
df_dup_2.shape

(1024, 47)

In [232]:
unique_id_to_delete = df_dup_2[df_dup_2.index % 2 != 1]['unique_id'].to_list()

In [233]:
len(unique_id_to_delete)

512

In [163]:
df_new = pd.read_excel("20200226_MASTER_DATA.xlsx")

In [234]:
df_new[df_new['unique_id'].isin(unique_id_to_delete)]

Unnamed: 0,ID,Census.Year,State/Province_x,County_x,Place_x,Household Joint ID,Joint ID for Matched Records,Last.Name,First.Name,CalculatedBirthYear,...,Rank (Military),Enlistment Date,Enlistment Place,Date Mustered Out,Year of this Record,Last Name MATCH,First Name Match,Census Year Match,Total of Matches,unique_id
2236,3686,1880,NY,ERIE,4-WD;BUFFALO,3,,FIELDS,MARYANN,1832,...,,,,,,,,,,5
2329,,1861,Ontario,London,London District 1,78,,Gray,A,1811,...,,,,,,,,,,297
2348,Matson,1880,Ohio,Clark,Springfield,6?,,Ford,Retta,1858,...,,,,,,,,,,393
2630,1880 IPUMS 100% sample,1880,PA,,Philadephia,,,ADLEY,SUSANE.,1859,...,5900,1230,,,,0,0,0,FALSE,692
2732,,1861,Ontario,Hamilton,St. George's Ward,,,Alexander,Maria,1835,...,,,,,,,,,,796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50648,1880 IPUMS 100% sample,1880,MA,,Boston,,,YOUNG,CHARLESF.,1850,...,5900,1230,,,,,,,,50683
50667,1880 IPUMS 100% sample,1880,NY,,Brooklyn New York,,,YOUNG,HENRYA.,1847,...,1000,101,,,,,,,,50702
50668,1880 IPUMS 100% sample,1880,NY,,New York,,,YOUNG,JAMES,1851,...,5900,1230,,,,,,,,50703
50683,1880 IPUMS 100% sample,1880,NY,,Brooklyn New York,,,YOUNG,RICHARD,1875,...,3100,301,,,,,,,,50718


In [242]:
df_adj = df_new[~df_new['unique_id'].isin(unique_id_to_delete)]

In [247]:
df_dup_3

Unnamed: 0,Age,CalculatedBirthYear,Cannot Read,Cannot Write,Cause of Death,Census Year Match,Census.Year,Color..Race.or.Ethnicity,County_x,Date Mustered Out,...,State/Province_x,Total of Matches,WARD,Year of Immigration to Canada if an Immigrant,Year of this Record,address,lat_x,long_x,unique_id,dup_pair
934,28.0,1852.0,1.0,1.0,1.0,,1880.0,Black,,,...,NY,,0.0,0.0,,3,,5787627000.0,42909.0,467
935,28.0,1852.0,2.0,2.0,1.0,,1880.0,Black,,,...,NY,,0.0,0.0,,5,,6170205000.0,42952.0,467
936,28.0,1852.0,1.0,1.0,1.0,,1880.0,Black,,,...,NY,,0.0,0.0,,3,,5787627000.0,42909.0,468
937,28.0,1852.0,,,,,1880.0,B,CAYUGA,,...,NY,,,,,"9-WD;AUBURN,CAYUGA,NY",42.9438,-76.54354,43036.0,468
938,28.0,1852.0,2.0,2.0,1.0,,1880.0,Black,,,...,NY,,0.0,0.0,,5,,6170205000.0,42952.0,469
939,28.0,1852.0,,,,,1880.0,B,CAYUGA,,...,NY,,,,,"9-WD;AUBURN,CAYUGA,NY",42.9438,-76.54354,43036.0,469
1064,28.0,1852.0,0.0,0.0,5.0,,1880.0,Black,,,...,OH,,0.0,0.0,,6,,7552079000.0,49570.0,532
1065,28.0,1852.0,0.0,0.0,5.0,,1880.0,Black,,,...,OH,,221.0,0.0,,32,,7552093000.0,49638.0,532
1066,28.0,1852.0,0.0,0.0,5.0,,1880.0,Black,,,...,OH,,0.0,0.0,,6,,7552079000.0,49570.0,533
1067,28.0,1852.0,,,,,1880.0,B,LUCAS,,...,OH,,,,,"3-WD;TOLEDO,LUCAS,OH",41.638225,-83.60661,49791.0,533


In [214]:
set(unique_id_to_delete).difference(set_)

{49638.0}

In [272]:
unique_id_to_delete_2 = [42909.0, 42952.0, 50676.0, 50677.0]

In [274]:
df_final = df_adj[~df_adj['unique_id'].isin(unique_id_to_delete_2)]

In [248]:
dup_id_1

{23,
 63,
 101,
 129,
 144,
 145,
 174,
 189,
 193,
 197,
 199,
 206,
 207,
 229,
 244,
 270,
 274,
 283,
 287,
 295,
 308,
 311,
 322,
 326,
 381,
 383,
 385,
 400,
 427,
 441,
 466,
 476,
 484,
 507,
 511,
 517,
 521,
 532,
 533,
 542}

In [279]:
df_final['unique_id']

0          107
1          678
2          679
3        47667
4        47668
         ...  
50804    50845
50805    50846
50806    50847
50807    50848
50808    50849
Name: unique_id, Length: 50293, dtype: int64

In [282]:
df_dup['Household Joint ID'].unique()

array([nan, '3.0', '78.0', '6?'], dtype=object)

In [290]:
df_final['unique_id'] = df_final.index + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [291]:
df_final['unique_id']

0            1
1            2
2            3
3            4
4            5
         ...  
50804    50805
50805    50806
50806    50807
50807    50808
50808    50809
Name: unique_id, Length: 50293, dtype: int64

In [292]:
df_final.to_excel("20200225_MASTER_DATA_REVISED.xlsx")