In [31]:
import pandas as pd
import boto3
from boto3.dynamodb.conditions import Key
import simplejson as json
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

class Client:
  """Plain value holder for one client record attached to a loan.

  Every constructor argument is stored unchanged on an attribute of the
  same name.
  """

  def __init__(self, clientId, gcid, ssn, firstName, lastName, isPrimary, isActive):
    # Identifiers
    self.clientId, self.gcid, self.ssn = clientId, gcid, ssn
    # Name
    self.firstName, self.lastName = firstName, lastName
    # Flags
    self.isPrimary, self.isActive = isPrimary, isActive

class RemoveDups:
  def __init__(self, fileName):
    self.fileName = fileName
    
  def start_cleanup(self):
    print('Starting Clean up Process on List')
    loansWithDuplicates = []
    loanNumbers = self.loans_from_json()
    for loanNumber in loanNumbers:
      dupLoanNumber = self.clean_loan(str(loanNumber))
      if dupLoanNumber:
        loansWithDuplicates.append(dupLoanNumber)
        
    # if loansWithDuplicates:
    #   print(f'Loans with duplicate clients: {len(loansWithDuplicates)}')
    #   with open(f"/Users/BRadhakrishnan/Documents/repo/ql-personal/notebook/python/remove-dups/loans4.json", "w") as outfile:
    #     serialized = json.dump(loansWithDuplicates, outfile, indent=4 * ' ')
      
          
  def clean_loan(self, loanNumber):
    print(f'Starting Clean up on {loanNumber}')
    
    clients = self.get_clients(loanNumber)
    duplicates = self.__get_duplicates(clients)
    if duplicates:
      self.__backup_deleted(list(duplicates), loanNumber)
      for clientId in duplicates:
        self.remove(loanNumber, clientId)

      #print(f'found duplicates for {loanNumber}')
      return loanNumber
      #print(f"Loans With Duplicates: {loanNumber}")
    
    #print('Completed Clean up')
  def loans_from_json(self):
    file = open(self.fileName) 
    data = json.load(file)
    return data
  
  def loans_from_csv(self):
    reader = pd.read_csv(self.fileName, delimiter= ',', iterator= True)
    for df in reader:
      yield df.iloc[:, 0].tolist()
      
  def __backup(self, item, fileName):
    with open(f"/Users/BRadhakrishnan/Documents/repo/ql-personal/notebook/python/remove-dups/backup/{fileName}.json", "w") as outfile:
      serialized = json.dump(item, outfile, indent=4 * ' ')
    
  def __backup_deleted(self, item, fileName):
    with open(f"/Users/BRadhakrishnan/Documents/repo/ql-personal/notebook/python/remove-dups/deleted/{fileName}.json", "w") as outfile:
      serialized = json.dump(item, outfile, indent=4 * ' ')
    
  def get_clients(self, loanNumber):
    """Read every ``PI#`` item of a loan from DynamoDB and wrap each in a Client.

    Side effect: the raw items are written to the backup file for the loan
    (via ``__backup``) before any Client is built, even when no item matched.

    Args:
      loanNumber: Loan identifier; converted to ``str`` for the key condition.

    Returns:
      A list of ``Client`` objects, one per item, in query order.

    Raises:
      KeyError: If an item lacks ``FirstName``, ``LastName``, ``IsPrimary``,
        ``IsDeleted`` or ``ClientUID``. ``SSN`` and ``GCID`` are optional and
        default to ``''``.
    """
    TABLE_NAME = "mortgageclient-data-prod-206763"
    # NOTE(review): verify=False switches off TLS certificate verification
    # for this connection - confirm this is really required.
    dynamodb = boto3.resource(
        'dynamodb', region_name="us-east-2", verify=False)
    table = dynamodb.Table(TABLE_NAME)

    queryArgs = {
        'KeyConditionExpression':
            Key('LoanIdentifier').eq(str(loanNumber)) & Key('SortKey').begins_with('PI#'),
    }
    # A single query call returns one page only. Keep following
    # LastEvaluatedKey so no client is missing from the backup or from the
    # duplicate analysis.
    items = []
    while True:
      page = table.query(**queryArgs)
      items.extend(page.get('Items', []))
      lastKey = page.get('LastEvaluatedKey')
      if not lastKey:
        break
      queryArgs['ExclusiveStartKey'] = lastKey

    self.__backup(items, loanNumber)

    clients = []
    for item in items:
      ssn = item.get('SSN', '')
      firstName = item["FirstName"]
      lastName = item["LastName"]
      isPrimary = item['IsPrimary']
      # A client counts as active unless IsDeleted compares equal to True.
      isDeleted = item['IsDeleted'] == True
      clientId = item['ClientUID']
      gcid = item.get('GCID', '')
      clients.append(
          Client(clientId, gcid, ssn, firstName, lastName, isPrimary, not isDeleted))
    return clients
            
  def __get_duplicates(self,clients):
    duplicates = set()
    if clients:
      activeClients = list(filter(lambda client: client.isActive == True and client.gcid and client.ssn, clients))
      inActiveClients = list(filter(lambda client: not client.isActive, clients))
      inActiveSSNClients = list(filter(lambda client: not client.isActive and client.ssn, clients))
      inValidClients = list(filter(lambda client: not client.ssn, inActiveClients))
      invalidActiveClients = list(filter(lambda client: client.isActive == True and not client.ssn, clients))
      
      #Invalid Clients
      # for client in inValidClients:
      #   duplicates.add(client.clientId)
        
      # for client in invalidActiveClients:
      #   duplicates.add(client.clientId)
        
      visited = set()
      for client in inActiveSSNClients:
        if client.clientId in visited:
          continue
        inActiveButDups = [x for x in inActiveSSNClients if x.clientId !=
                       client.clientId and x.ssn == client.ssn]
        for x in inActiveButDups:
          visited.add(x.clientId)
          duplicates.add(x.clientId)
          
      for client in activeClients:
        inActiveDups = [x for x in inActiveClients if x.clientId !=
         client.clientId and x.gcid == client.gcid]
        for iad in inActiveDups:
          duplicates.add(iad.clientId)
          
      visited.clear()
      for client in activeClients:
        if client.clientId in visited:
          continue
        activeButDuplicates = [x for x in activeClients if x.clientId != client.clientId and x.gcid == client.gcid]
        for abd in activeButDuplicates:
          visited.add(abd.clientId)
          duplicates.add(abd.clientId)
      
    return duplicates
            
  def remove(self,loanNumber, clientId):
    """Delete one client item from the DynamoDB table and print a confirmation.

    Args:
      loanNumber: Value of the item's ``LoanIdentifier`` key.
      clientId: Client id; the item's ``SortKey`` is ``PI#<clientId>``.
    """
    itemKey = {'LoanIdentifier': loanNumber, 'SortKey': f"PI#{clientId}"}
    table = boto3.resource(
        'dynamodb', region_name="us-east-2", verify=False
    ).Table('mortgageclient-data-prod-206763')
    table.delete_item(Key=itemKey)
    print(f"Loan {loanNumber}, clientId {clientId} Removed")
    
    
# Cell driver: clean every loan listed in loan.json. This deletes items from
# the DynamoDB table as soon as the cell runs.
process = RemoveDups(
    fileName='/Users/BRadhakrishnan/Documents/repo/ql-personal/notebook/python/remove-dups/loan.json')
process.start_cleanup()
# loanNumbers = process.loans_from_json()
# for loanNumber in loanNumbers:
#   print(loanNumber)
#process.clean_loan('3520947959')
#process.remove('3521480935', '36a25c388ec54d859cf7e6241df2718d')
# clients = process.get_clients('3521307939')
# for client in clients:
#   print(client.clientId)


Starting Clean up Process on List
Starting Clean up on 3521073009
Starting Clean up on 3521216651
Starting Clean up on 3521311219
Starting Clean up on 3520745262
Starting Clean up on 3521282619
Starting Clean up on 3521268640
Starting Clean up on 3521256697
Starting Clean up on 3521186007
Starting Clean up on 3521317084
Loan 3521317084, clientId 9c54da0eb57a4b549286ecf3125a42c7 Removed
Starting Clean up on 3520187752
Starting Clean up on 3521269024
Starting Clean up on 3521317868
Starting Clean up on 3521307667
Starting Clean up on 3521266496
Starting Clean up on 3521303681
Starting Clean up on 3521263522
Starting Clean up on 3521200670
Starting Clean up on 3521236023
Starting Clean up on 3521283653
Starting Clean up on 3521283576
Starting Clean up on 3521006612
Starting Clean up on 3521309884
Starting Clean up on 3521134651
Starting Clean up on 3521167649
Starting Clean up on 3521306346
Starting Clean up on 3521276284
Starting Clean up on 3521073648
Starting Clean up on 3521307189
Sta