# INNOVACCER HACKERCAMP SUMMER INTERN ASSIGNMENT

In [1]:
import os                                   # For OS related operations
import dedupe                               # For ML stuff
import pandas as pd                         # For Data Management and Representation
from IPython.display import FileLink        # For Displaying the link in the Notebook itself          

In [2]:
# Load the training records from Train.csv (path relative to the notebook's
# working directory) and preview the first rows.
Data = pd.read_csv('./Train.csv')
Data.head()

Unnamed: 0,ln,dob,gn,fn
0,SMITH JR,01/03/68,F,WILLIAM
1,ROTHMEYER JR,01/03/68,F,WILLIAM
2,ASBY JR,01/03/68,F,WILLIAM
3,SALTER JR,01/03/68,F,WILLIAM
4,BLAND JR,21/02/62,F,WILLIAM


In [3]:
# Summarise the frame: its shape and the dtype of every column, then preview
# the first rows.  A single print call with a newline separator emits the
# same text as printing each piece on its own.
print(
    "Dataset Shape:",
    Data.shape,
    "\n",
    "Dataset Columns/Features:",
    Data.dtypes,
    sep="\n",
)
Data.head()

Dataset Shape:
(50, 4)


Dataset Columns/Features:
ln     object
dob    object
gn     object
fn     object
dtype: object


Unnamed: 0,ln,dob,gn,fn
0,SMITH JR,01/03/68,F,WILLIAM
1,ROTHMEYER JR,01/03/68,F,WILLIAM
2,ASBY JR,01/03/68,F,WILLIAM
3,SALTER JR,01/03/68,F,WILLIAM
4,BLAND JR,21/02/62,F,WILLIAM


# Dataset Description
## fn - FirstName, ln - LastName, gn - Gender, dob - DateOfBirth

### We add an ID column for further processing

In [4]:
# Attach a sequential integer ID (0 .. number_of_rows - 1) to every row.
Data['ID'] = range(Data.shape[0])

In [5]:
# Preview the frame to confirm the ID column is present.
Data.head()

Unnamed: 0,ln,dob,gn,fn,ID
0,SMITH JR,01/03/68,F,WILLIAM,0
1,ROTHMEYER JR,01/03/68,F,WILLIAM,1
2,ASBY JR,01/03/68,F,WILLIAM,2
3,SALTER JR,01/03/68,F,WILLIAM,3
4,BLAND JR,21/02/62,F,WILLIAM,4


#### The DataFrame is converted to a dictionary of records keyed by row index (which matches the ID column)

In [6]:
# Convert the frame into {row_index: {column: value}} records, the mapping
# that is handed to dedupe below.  The orientation is spelled out as 'index':
# the previous to_dict('ID') only worked because older pandas accepted any
# string starting with "i" as shorthand for 'index'.  That shorthand was
# deprecated and pandas >= 2.0 rejects it with a ValueError.
Train = Data.to_dict(orient='index')

In [7]:
# Display the record dictionary built from Data (keys are the row indices).
Train

{0: {'ID': 0, 'dob': '01/03/68', 'fn': 'WILLIAM', 'gn': 'F', 'ln': 'SMITH JR'},
 1: {'ID': 1,
  'dob': '01/03/68',
  'fn': 'WILLIAM',
  'gn': 'F',
  'ln': 'ROTHMEYER JR'},
 2: {'ID': 2, 'dob': '01/03/68', 'fn': 'WILLIAM', 'gn': 'F', 'ln': 'ASBY JR'},
 3: {'ID': 3,
  'dob': '01/03/68',
  'fn': 'WILLIAM',
  'gn': 'F',
  'ln': 'SALTER JR'},
 4: {'ID': 4, 'dob': '21/02/62', 'fn': 'WILLIAM', 'gn': 'F', 'ln': 'BLAND JR'},
 5: {'ID': 5,
  'dob': '21/02/62',
  'fn': 'WILLIAM',
  'gn': 'F',
  'ln': 'SHAFFER JR'},
 6: {'ID': 6, 'dob': '21/02/62', 'fn': 'BILL', 'gn': 'F', 'ln': 'BLAND JR'},
 7: {'ID': 7, 'dob': '08/06/54', 'fn': 'WILLIAM', 'gn': 'F', 'ln': 'BLAND JR'},
 8: {'ID': 8, 'dob': '25/10/53', 'fn': 'WILLIAM', 'gn': 'F', 'ln': 'BLAND JR'},
 9: {'ID': 9,
  'dob': '25/10/53',
  'fn': 'WILLIAM',
  'gn': 'F',
  'ln': 'SHAFFER JR'},
 10: {'ID': 10,
  'dob': '25/10/53',
  'fn': 'THOMAS',
  'gn': 'F',
  'ln': 'DUNCAN JR'},
 11: {'ID': 11, 'dob': '25/10/53', 'fn': 'ROY', 'gn': 'F', 'ln': 'CARLSON

In [8]:
# File names used by the training cell: the learned settings are cached in
# settings_file and the labelled example pairs in training_file.
settings_file = 'csv_example_learned_settings'
training_file = 'csv_example_training.json'

# Training The Model

In [9]:
# Build the `deduper` object used by the rest of the notebook.
# If a settings file already exists, we'll just load that and skip training;
# otherwise a new model is trained interactively and cached on disk.
if os.path.exists(settings_file):
    print('reading from', settings_file)
    with open(settings_file, 'rb') as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # Training

    # Define the fields dedupe will pay attention to; all four columns are
    # declared with the 'String' type.
    fields = [
            {'field' : 'ln', 'type': 'String'},
            {'field' : 'dob', 'type': 'String'},
            {'field' : 'gn', 'type': 'String'},
            {'field' : 'fn', 'type': 'String'},
        ]

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    # To train dedupe, we feed it a sample of records.
    # NOTE(review): 15000 is presumably the requested sample size - confirm
    # against the API of the installed dedupe version.
    deduper.sample(Train, 15000)

    # If we have training data saved from a previous run of dedupe,
    # look for it and load it in.
    # Note: if you want to train from scratch, delete the training_file.
    # NOTE(review): the file is read in binary mode here but written in text
    # mode ('w') further down - confirm both match what dedupe expects.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.readTraining(f)

    # Active learning
    # Dedupe will find the next pair of records
    # it is least certain about and ask you to label them as duplicates
    # or not.
    # use 'y', 'n' and 'u' keys to flag duplicates
    # press 'f' when you are finished
    print('starting active labeling...')
    dedupe.consoleLabel(deduper)

    # Using the examples we just labeled, train the deduper and learn
    # blocking predicates
    deduper.train()

    # When finished, save our training to disk
    with open(training_file, 'w') as tf:
        deduper.writeTraining(tf)

    # Save our weights and predicates to disk.  If the settings file
    # exists, we will skip all the training and learning next time we run
    # this file.
    with open(settings_file, 'wb') as sf:
        deduper.writeSettings(sf)

INFO:dedupe.api:((SimplePredicate: (twoGramFingerprint, dob), TfidfTextCanopyPredicate: (0.4, ln)),)


reading from csv_example_learned_settings


In [10]:
# Ask dedupe for a match-score threshold computed from the Train records.
# NOTE(review): recall_weight=1 presumably weights recall and precision
# equally - confirm against the dedupe documentation.
threshold = deduper.threshold(Train, recall_weight=1)

INFO:dedupe.api:Maximum expected recall and precision
INFO:dedupe.api:recall: 0.473
INFO:dedupe.api:precision: 0.366
INFO:dedupe.api:With threshold: 0.366


In [11]:
# Cluster the Train records using the threshold computed earlier.  Each entry
# of the displayed result pairs a tuple of record ids with their scores.
print('clustering...')
clustered_dupes = deduper.match(Train, threshold)
clustered_dupes

clustering...


[((4, 6), (0.36583894, 0.36583894)),
 ((40, 41), (1.2275987e-06, 1.2275987e-06)),
 ((20, 21), (2.4787007e-05, 2.4787007e-05))]

In [12]:
# Print the full records that belong to each duplicate cluster: one list of
# records per cluster, followed by a blank line.
for member_ids, _scores in clustered_dupes:
    print([Train[record_id] for record_id in member_ids])
    print()

[{'dob': '21/02/62', 'fn': 'WILLIAM', 'ln': 'BLAND JR', 'gn': 'F', 'ID': 4}, {'dob': '21/02/62', 'fn': 'BILL', 'ln': 'BLAND JR', 'gn': 'F', 'ID': 6}]

[{'dob': '07/10/37', 'fn': 'HAROLD', 'ln': 'FAGEN JR', 'gn': 'M', 'ID': 40}, {'dob': '07/10/37', 'fn': 'GEORGE', 'ln': 'FAGEN JR', 'gn': 'M', 'ID': 41}]

[{'dob': '31/01/46', 'fn': 'LAWRENCE', 'ln': 'LIND JR', 'gn': 'M', 'ID': 20}, {'dob': '31/01/46', 'fn': 'KENNETH', 'ln': 'LIND JR', 'gn': 'M', 'ID': 21}]



In [13]:
# Working lists used to decide which rows to keep:
#   list1     - every record id that appears in some duplicate cluster,
#               e.g. (66, 67, 68, 69, 70, 71, 72, 73, 74)
#   list2     - the first record id of each cluster found, e.g. (66)
#   newlist   - list1 without the ids in list2,
#               e.g. (67, 68, 69, 70, 71, 72, 73, 74)
#   finallist - all row indexes except those present in newlist
list1, list2, newlist, finallist = [], [], [], []

In [14]:
# Record the first id of every cluster (the copy that is kept) and every id
# that belongs to any cluster.
list2.extend(cluster[0][0] for cluster in clustered_dupes)
list1.extend(record_id for cluster in clustered_dupes for record_id in cluster[0])

# Redundant copies: cluster members other than each cluster's first id.
newlist = list(set(list1) - set(list2))

# Start from every row position in Data, then discard the redundant copies.
finallist.extend(range(Data.shape[0]))
finallist = list(set(finallist) - set(newlist))

In [15]:
# Keep only the surviving rows (the positions listed in finallist) and drop
# the helper ID column.  Selecting all rows with a single .iloc call replaces
# the old per-row DataFrame.append loop: append copied the whole frame on
# every iteration and was removed in pandas 2.0.
FinalDF = Data.iloc[finallist].drop(columns=['ID'])

In [16]:
# Preview the de-duplicated training frame.
FinalDF.head()

Unnamed: 0,ln,dob,gn,fn
0,SMITH JR,01/03/68,F,WILLIAM
1,ROTHMEYER JR,01/03/68,F,WILLIAM
2,ASBY JR,01/03/68,F,WILLIAM
3,SALTER JR,01/03/68,F,WILLIAM
4,BLAND JR,21/02/62,F,WILLIAM


In [17]:
# Write the de-duplicated training rows to CSV (without the index column) and
# show a link to the written file in the notebook.
filename = 'Train_Correct.csv'
FinalDF.to_csv(filename, index=False)
FileLink(filename)

# Testing The Model On TestData

In [18]:
# Load the test records from Test.csv and display the whole frame.
TestData = pd.read_csv('Test.csv')
TestData

Unnamed: 0,ln,dob,gn,fn
0,Frometa,24/11/34,F,Vladimir
1,Frometa Garo,24/11/34,F,Vladimir Antonio
2,Frometa Garo,24/11/34,F,Vladimir A
3,Frometa,24/11/34,F,Vladimir
4,Frometa G,24/11/34,F,Vladimir
5,Frometa,24/11/34,F,Vladimir A
6,Frometa G,24/11/34,F,Vladimir A
7,Dutta,24/11/34,M,Sparsh
8,Dutta K,24/11/34,M,Sparsh


In [19]:
# Attach a sequential integer ID (0 .. number_of_rows - 1) to every test row.
TestData['ID'] = range(TestData.shape[0])

In [20]:
# Convert the test frame into {row_index: {column: value}} records for
# dedupe and display the result.  The orientation is spelled out as 'index':
# the previous to_dict('ID') only worked because older pandas accepted any
# string starting with "i" as shorthand for 'index'.  That shorthand was
# deprecated and pandas >= 2.0 rejects it with a ValueError.
Test = TestData.to_dict(orient='index')
Test

{0: {'ID': 0,
  'dob': '24/11/34',
  'fn': 'Vladimir ',
  'gn': 'F',
  'ln': 'Frometa'},
 1: {'ID': 1,
  'dob': '24/11/34',
  'fn': 'Vladimir Antonio',
  'gn': 'F',
  'ln': 'Frometa Garo'},
 2: {'ID': 2,
  'dob': '24/11/34',
  'fn': 'Vladimir A',
  'gn': 'F',
  'ln': 'Frometa Garo'},
 3: {'ID': 3, 'dob': '24/11/34', 'fn': 'Vladimir', 'gn': 'F', 'ln': 'Frometa'},
 4: {'ID': 4,
  'dob': '24/11/34',
  'fn': 'Vladimir',
  'gn': 'F',
  'ln': 'Frometa G'},
 5: {'ID': 5,
  'dob': '24/11/34',
  'fn': 'Vladimir A ',
  'gn': 'F',
  'ln': 'Frometa'},
 6: {'ID': 6,
  'dob': '24/11/34',
  'fn': 'Vladimir A ',
  'gn': 'F',
  'ln': 'Frometa G'},
 7: {'ID': 7, 'dob': '24/11/34', 'fn': 'Sparsh', 'gn': 'M', 'ln': 'Dutta'},
 8: {'ID': 8, 'dob': '24/11/34', 'fn': 'Sparsh', 'gn': 'M', 'ln': 'Dutta K'}}

In [21]:
print('Test Data Clustering...')
clustered_dupes = deduper.match(Test, threshold)
clustered_dupes

Test Data Clustering...


[((0, 1, 2, 3, 4, 5, 6),
  array([0.93367286, 0.87325241, 0.9323102 , 0.93295265, 0.93323344,
         0.91185586, 0.91374632])),
 ((7, 8), (0.9455035, 0.9455035))]

In [22]:
# Print the full test records that belong to each duplicate cluster: one list
# of records per cluster, followed by a blank line.
for member_ids, _scores in clustered_dupes:
    print([Test[record_id] for record_id in member_ids])
    print()

[{'dob': '24/11/34', 'fn': 'Vladimir ', 'ln': 'Frometa', 'gn': 'F', 'ID': 0}, {'dob': '24/11/34', 'fn': 'Vladimir Antonio', 'ln': 'Frometa Garo', 'gn': 'F', 'ID': 1}, {'dob': '24/11/34', 'fn': 'Vladimir A', 'ln': 'Frometa Garo', 'gn': 'F', 'ID': 2}, {'dob': '24/11/34', 'fn': 'Vladimir', 'ln': 'Frometa', 'gn': 'F', 'ID': 3}, {'dob': '24/11/34', 'fn': 'Vladimir', 'ln': 'Frometa G', 'gn': 'F', 'ID': 4}, {'dob': '24/11/34', 'fn': 'Vladimir A ', 'ln': 'Frometa', 'gn': 'F', 'ID': 5}, {'dob': '24/11/34', 'fn': 'Vladimir A ', 'ln': 'Frometa G', 'gn': 'F', 'ID': 6}]

[{'dob': '24/11/34', 'fn': 'Sparsh', 'ln': 'Dutta', 'gn': 'M', 'ID': 7}, {'dob': '24/11/34', 'fn': 'Sparsh', 'ln': 'Dutta K', 'gn': 'M', 'ID': 8}]



In [23]:
# Reset the working lists before processing the test clusters:
#   list1     - every record id that appears in some duplicate cluster,
#               e.g. (66, 67, 68, 69, 70, 71, 72, 73, 74)
#   list2     - the first record id of each cluster found, e.g. (66)
#   newlist   - list1 without the ids in list2,
#               e.g. (67, 68, 69, 70, 71, 72, 73, 74)
#   finallist - all row indexes except those present in newlist
list1, list2, newlist, finallist = [], [], [], []

In [24]:
# Record the first id of every cluster (the copy that is kept) and every id
# that belongs to any cluster.
list2.extend(cluster[0][0] for cluster in clustered_dupes)
list1.extend(record_id for cluster in clustered_dupes for record_id in cluster[0])

# Redundant copies: cluster members other than each cluster's first id.
newlist = list(set(list1) - set(list2))

# Start from every row position in TestData, then discard the redundant copies.
finallist.extend(range(TestData.shape[0]))
finallist = list(set(finallist) - set(newlist))

In [25]:
# Display the row positions of TestData that are kept after de-duplication.
finallist

[0, 7]

In [26]:
# Keep only the surviving test rows (the positions listed in finallist) and
# drop the helper ID column.  Selecting all rows with a single .iloc call
# replaces the old per-row DataFrame.append loop: append copied the whole
# frame on every iteration and was removed in pandas 2.0.
FinalDF = TestData.iloc[finallist].drop(columns=['ID'])

In [27]:
# Preview the de-duplicated test frame.
FinalDF.head()

Unnamed: 0,ln,dob,gn,fn
0,Frometa,24/11/34,F,Vladimir
7,Dutta,24/11/34,M,Sparsh


In [28]:
# Write the de-duplicated test rows to CSV (without the index column) and
# show a link to the written file in the notebook.
filename = 'Test_Correct.csv'
FinalDF.to_csv(filename, index=False)
FileLink(filename)