# Data Migration Dupe-Management Tool
Creator: Richard Ngo

<b>Prerequistes:</b>
1. Headers must be INS fieldnames
2. Excess whitespaces must be trimmed

<b>Notes:</b>
1. All cells will need to be run individually and in order (cannot restart in the middle, must be from beginning cell).

## 1. Upload CSV Data to be Parsed

Instructions: Run the following cell to turn on / reset the widget. Then click on the widget to upload the .csv file.

In [1]:
import pandas as pd
import numpy as np
import re
import io
import os 
import requests
import json
from ipywidgets import FileUpload
from datetime import datetime, timedelta

def dupecheck(data, data_deduped, param1, param2, index) :
    if param1 in data and param2 in data :
        if data[param1].iloc[index] != '' and data[param1].iloc[index] in data_deduped[param2].tolist() : 
            data['duplicate'][index] = param1 + ' exists in Row ' + str(min(data_deduped.index[data_deduped[param2] == data[param1][index]].tolist()))
    return(data)

# csv file upload widget
uploader = FileUpload(accept='.csv',multiple=False)
display(uploader)

FileUpload(value={}, accept='.csv', description='Upload')

## 2. Select Matching Criteria

Instructions: Run the cell below after matching criteria has been chosen (enter in <b><font color='green'>True</font></b> or <b><font color='green'>False</font></b>).

In [2]:
# ignore
input_file = list(uploader.value.values())[0]
content = input_file['content']
content = io.StringIO(content.decode('utf-8'))
df = pd.read_csv(content)
filename = next(iter(uploader.value))

# matching criteria
BorrowerToBorrower = True
BorrowerToCoBorrower = False
CoBorrowerToBorrower = False
CoBorrowerToCoBorrower = False
Email = True
MobileToMobile = True
MobileToHome = True 
MobileToWork = False
HomeToMobile = True
HomeToHome = True
HomeToWork = False
WorkToMobile = False
WorkToHome = False
WorkToWork = False

IndexError: list index out of range

## 3. Clean the Data

Instructions: Run cell below.

In [285]:
df = df.replace(np.nan, '', regex=True).astype(str) # convert values to string
df.columns = df.columns.str.lower() # convert headers to lowercase

for i in range(len(df.index)) :
    # remove special chars from phonenumbers 
    if 'mobile' in df :
        df['mobile'][i] = re.sub(r'[()--+]', '', df['mobile'][i])
    if 'phone' in df :
        df['phone'][i] = re.sub(r'[()--+]', '', df['phone'][i])
    if 'work' in df :
        df['work'][i] = re.sub(r'[()--+]', '', df['work'][i])
    if 'phone_co' in df :
        df['phone_co'][i] = re.sub(r'[()--+]', '', df['phone_co'][i])
    if 'work_co' in df :
        df['work_co'][i] = re.sub(r'[()--+]', '', df['work_co'][i])
    if 'mobile_co' in df :
        df['mobile_co'][i] = re.sub(r'[()--+]', '', df['mobile_co'][i])
    # convert email values to lowercase
    if 'email' in df :
        df['email'] = df['email'].str.lower()
    if 'email_co' in df :
        df['email_co'] = df['email_co'].str.lower()
    # add statusid and activityid columns if they do not exist; used to disposition dupes
    if 'duplicate' in df :
        pass
    else :
        df['duplicate'] = ''
    
    # display processing time
    if i % 750 == 0 : 
        print('Cleaning... ' + "{:.0%}".format(i/len(df.index)))
        
print(df)

Cleaning... 0%
Cleaning... 23%
Cleaning... 46%
Cleaning... 69%
Cleaning... 92%
           uid first name       last name                    email  \
0     VF-00001    Bernard           McKay        bmckay1@wi.rr.com   
1     VF-00002      Brian          Pigney                            
2     VF-00003      Kelsi         Konopka  kelsi.konopka@yahoo.com   
3     VF-00004      shana          thomas       shanamoh@gmail.com   
4     VF-00005      Vijay  Ravidindrababu    pranaov.r@outlook.com   
...        ...        ...             ...                      ...   
3262  VF-03232    Jeffrey         Cabrera    jcpheonix21@yahoo.com   
3263  VF-03233        RAM               K       krams222@gmail.com   
3264  VF-03234      JASON            HOOD  hood09@consolidated.net   
3265  VF-03235       Kimm         Minnick      kimmminnick@aol.com   
3266  VF-03236      TRENT            HILL      bobbyf15e@yahoo.com   

            mobile phone first name (co-borrower) last name (co-borrower)  \
0  

## 4. Run Dupe Check
Instructions: Run cell below.

In [288]:
df_posted = pd.DataFrame(columns=df.columns) # store "posted" rows (i.e. rows that went through dupe check)

for idx, row in df.iterrows() :
    df_deduped = df_posted[df_posted['duplicate'] == ''] # check new rows against nondupe/nonrejected rows
    
    if BorrowerToBorrower == True : 
        if Email == True : dupecheck(df, df_deduped, 'email', 'email', idx)
        if MobileToMobile == True : dupecheck(df, df_deduped, 'mobile', 'mobile', idx)
        if MobileToHome == True : dupecheck(df, df_deduped, 'mobile', 'phone', idx)
        if MobileToWork == True : dupecheck(df, df_deduped, 'mobile', 'work', idx)
        if HomeToHome == True : dupecheck(df, df_deduped, 'phone', 'phone', idx)
        if HomeToMobile == True : dupecheck(df, df_deduped, 'phone', 'mobile', idx)
        if HomeToWork == True : dupecheck(df, df_deduped, 'phone', 'work', idx)
        if WorkToMobile == True : dupecheck(df, df_deduped, 'work', 'mobile', idx)
        if WorkToHome == True : dupecheck(df, df_deduped, 'work', 'phone', idx)
        if WorkToWork == True : dupecheck(df, df_deduped, 'work', 'work', idx)
    if BorrowerToCoBorrower == True :
        if Email == True : dupecheck(df, df_deduped, 'email', 'email_co', idx)
        if MobileToMobile == True : dupecheck(df, df_deduped, 'mobile', 'mobile_co', idx)
        if MobileToHome == True : dupecheck(df, df_deduped, 'mobile', 'phone_co', idx)
        if MobileToWork == True : dupecheck(df, df_deduped, 'mobile', 'work_co', idx)
        if HomeToMobile == True : dupecheck(df, df_deduped, 'phone', 'mobile_co', idx)
        if HomeToHome == True : dupecheck(df, df_deduped, 'phone', 'phone_co', idx)
        if HomeToWork == True : dupecheck(df, df_deduped, 'phone', 'work_co', idx)
        if WorkToMobile == True : dupecheck(df, df_deduped, 'work', 'mobile_co', idx)
        if WorkToHome == True : dupecheck(df, df_deduped, 'work', 'phone_co', idx)
        if WorkToWork == True : dupecheck(df, df_deduped, 'work', 'work_co', idx)
    if CoBorrowerToBorrower == True :
        if Email == True : dupecheck(df, df_deduped, 'email_co', 'email', idx)
        if MobileToMobile == True : dupecheck(df, df_deduped, 'mobile_co', 'mobile', idx)
        if MobileToHome == True : dupecheck(df, df_deduped, 'mobile_co', 'phone', idx)
        if MobileToWork == True : dupecheck(df, df_deduped, 'mobile_co', 'work', idx)
        if HomeToMobile == True : dupecheck(df, df_deduped, 'phone_co', 'mobile', idx)
        if HomeToHome == True : dupecheck(df, df_deduped, 'phone_co', 'phone', idx)
        if HomeToWork == True : dupecheck(df, df_deduped, 'phone_co', 'work', idx)
        if WorkToMobile == True : dupecheck(df, df_deduped, 'work_co', 'mobile', idx)
        if WorkToHome == True : dupecheck(df, df_deduped, 'work_co', 'phone', idx)
        if WorkToWork == True : dupecheck(df, df_deduped, 'work_co', 'work', idx)
    if CoBorrowerToCoBorrower == True :
        if Email == True : dupecheck(df, df_deduped, 'email_co', 'email_co', idx)
        if MobileToMobile == True : dupecheck(df, df_deduped, 'mobile_co', 'mobile_co', idx)
        if MobileToHome == True : dupecheck(df, df_deduped, 'mobile_co', 'phone_co', idx)
        if MobileToWork == True : dupecheck(df, df_deduped, 'mobile_co', 'work_co', idx)
        if HomeToMobile == True : dupecheck(df, df_deduped, 'phone_co', 'mobile_co', idx)
        if HomeToHome == True : dupecheck(df, df_deduped, 'phone_co', 'phone_co', idx)
        if HomeToWork == True : dupecheck(df, df_deduped, 'phone_co', 'work_co', idx)
        if WorkToMobile == True : dupecheck(df, df_deduped, 'work_co', 'mobile_co', idx)
        if WorkToHome == True : dupecheck(df, df_deduped, 'work_co', 'phone_co', idx)
        if WorkToWork == True : dupecheck(df, df_deduped, 'work_co', 'work_co', idx)
    df_posted = df_posted.append(row)
    
    # display processing time
    if idx % 750 == 0 :
        print('Deduping... ' + "{:.0%}".format(idx/len(df.index)))
        
print(df_posted)
print(str(len(df_posted[df_posted['duplicate']!=''].index)) + ' dupes found.')

Checking for dupes... 0%
Checking for dupes... 23%
Checking for dupes... 46%
Checking for dupes... 69%
Checking for dupes... 92%
           uid first name       last name                    email  \
0     VF-00001    Bernard           McKay        bmckay1@wi.rr.com   
1     VF-00002      Brian          Pigney                            
2     VF-00003      Kelsi         Konopka  kelsi.konopka@yahoo.com   
3     VF-00004      shana          thomas       shanamoh@gmail.com   
4     VF-00005      Vijay  Ravidindrababu    pranaov.r@outlook.com   
...        ...        ...             ...                      ...   
3262  VF-03232    Jeffrey         Cabrera    jcpheonix21@yahoo.com   
3263  VF-03233        RAM               K       krams222@gmail.com   
3264  VF-03234      JASON            HOOD  hood09@consolidated.net   
3265  VF-03235       Kimm         Minnick      kimmminnick@aol.com   
3266  VF-03236      TRENT            HILL      bobbyf15e@yahoo.com   

            mobile phone first

## 5. Export New CSV After Dupe Check

Instructions: Run cell below.

In [271]:
df_posted.to_csv('Downloads/'+filename[:-4]+'-DEDUPED.csv',header=True)