# Match Address - Run twice! Multithreading
The magic! Here we match addresses. You can run this file on an external server - it will take some time.  
**You have to run this twice, for hcp and hco's! Change the variable `run_for`**

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import time
import sys
import os.path
import datetime
import multiprocessing as mp
import math

In [2]:
# Which value of the `type` column to match in this run: 'hcp' or 'hco'.
run_for = 'hco'
version = 0.5

# Two worker processes per detected CPU core.
num_process = 2 * mp.cpu_count()

# Minimum fuzzy scores a candidate must reach, keyed by the `run_for` value.
conditions = dict(
    hcp=dict(
        condition_location=85,
        condition_address=0,
        condition_name=89,
    ),
    hco=dict(
        condition_location=85,
        condition_address=75,
        condition_name=85,
    ),
)

## Check path
Check whether we are running in the git directory or on the server. If on a server, look for the files in the same directory.

In [3]:
# Two candidate locations for the input file: next to this notebook (server
# run) or under the repository's data folder (git checkout).
path_server = '3_list_expanded.csv'
path_git = '../../data/3. transformation/3_list_expanded.csv'

# True when the repository-relative file exists, i.e. we run inside the checkout.
on_git = os.path.isfile(path_git)

## Read Data

In [7]:
# Load the expanded list from whichever location was detected above.
df = pd.read_csv(path_git if on_git else path_server)

# Keep only the rows of the requested type and work on an independent copy;
# a missing address is represented as an empty string from here on.
df_data = df.loc[df.type == run_for].copy()
df_data['address'] = df_data['address'].fillna("")

#For Testing
#df_data = df_data[df_data.source.isin(['eli', 'shire', 'almirall'])]

# Renumber the remaining rows consecutively, starting at 1 instead of 0.
df_data = df_data.reset_index(drop=True)
df_data.index = df_data.index + 1

total_rows = len(df_data)

## Calc rows

In [8]:
# Thresholds that apply to the entity type selected by `run_for`.
cond = conditions[run_for]

# Cast the three compared columns to strings, then order the rows by name.
# NOTE(review): missing values presumably become the text 'nan' after the
# cast - confirm this is the intended input for the matcher.
df_data = (
    df_data
    .astype({
        'name_expand': "str",
        'address_expand': "str",
        'location_expand': "str",
    })
    .sort_values('name_expand')
)

# Reference point for the elapsed-time report printed after matching.
start_time = time.time()

# Start-up banner with the run parameters.
print("===============================")
print("Start fuzzy matcher THREADS %s %s" % (run_for, version))
print("Cores detected: %s" % mp.cpu_count())
print("Threads started: %s" % num_process)
print("Rows to match: %s" % total_rows)
print("Start time: %s" % datetime.datetime.now())
print("===============================")

def run(datasets):
    """Fuzzy-match every row of one partition against the full table.

    For each row of ``df_part`` the candidates in ``df_data`` are scored in
    stages: location first, then name (only where the location score reaches
    its threshold), then address (only where location and name both reach
    theirs). The thresholds are read from the module-level ``cond`` dict.

    Args:
        datasets: dict holding two DataFrames. ``'df_data'`` is the table to
            compare against; the score columns ``r_location``, ``r_name`` and
            ``r_address`` are written into it, so the caller should pass a
            copy. ``'df_part'`` holds the rows this call is responsible for.

    Returns:
        DataFrame with the columns ``source``, ``target``, ``r_name``,
        ``r_address``, ``r_location`` and ``r_ratio``. Each match yields one
        row (source index, target index, scores). A source row without any
        match is listed once as a match with itself, all scores 100.
        ``r_ratio`` is not filled in by this function.
    """
    print("Thread started")
    df_data = datasets['df_data']
    df_part = datasets['df_part']

    # The thresholds do not change while the loop runs, so read them once.
    min_location = cond['condition_location']
    min_name = cond['condition_name']
    min_address = cond['condition_address']

    # Matches are collected in a plain list and turned into a DataFrame once
    # at the end. Growing a DataFrame row by row copies it on every step, and
    # DataFrame.append no longer exists in pandas 2.x.
    matches = []

    total_rows = len(df_part)
    for counter, (index, row) in enumerate(df_part.iterrows()):
        location = row['location_expand']
        name = row['name_expand']
        address = row['address_expand']

        # Stage 1: location score for every candidate.
        df_data['r_location'] = df_data['location_expand'].apply(
            lambda x: fuzz.token_set_ratio(x, location))
        location_ok = df_data.r_location >= min_location

        # Stage 2: name score, only where the location is close enough.
        # Candidates that were skipped end up as NaN after index alignment,
        # and NaN never satisfies a ">=" comparison.
        df_data['r_name'] = df_data.loc[location_ok, 'name_expand'].apply(
            lambda x: fuzz.token_set_ratio(x.lower(), name))
        name_ok = df_data.r_name >= min_name

        # Stage 3: address score (better of token-set and partial ratio),
        # only where location and name both passed.
        df_data['r_address'] = df_data.loc[location_ok & name_ok, 'address_expand'].apply(
            lambda x: max(fuzz.token_set_ratio(x, address),
                          fuzz.partial_ratio(x, address)))

        # A row must never be reported as a match of itself.
        is_match = name_ok & location_ok & (df_data.index != index)
        # The address only counts when the source row actually has one.
        if row['address'] != '':
            is_match = is_match & (df_data.r_address >= min_address)

        df_matches = df_data[is_match]

        if df_matches.empty:
            # No partner found: record the row as matching itself.
            matches.append({'source': index,
                            'target': index,
                            'r_name': 100,
                            'r_address': 100,
                            'r_location': 100})
        else:
            for target, r_name, r_address, r_location in zip(
                    df_matches.index,
                    df_matches['r_name'],
                    df_matches['r_address'],
                    df_matches['r_location']):
                matches.append({'source': index,
                                'target': target,
                                'r_name': r_name,
                                'r_address': r_address,
                                'r_location': r_location})

        # Progress is rewritten in place ("\r") every tenth row.
        if counter % 10 == 0:
            sys.stdout.write("\rProgress: %s%%" % round(100 / total_rows * counter, 2))
            sys.stdout.flush()

    # Passing `columns` keeps the original column order and adds the
    # (unfilled) r_ratio column, also when no row was processed at all.
    return pd.DataFrame(
        matches,
        columns=['source', 'target', 'r_name', 'r_address', 'r_location', 'r_ratio'])
    

# Create the worker pool. Everything that can fail while the pool is alive
# sits inside the try block, so the worker processes are always cleaned up -
# also when the run is interrupted from the keyboard.
pool = mp.Pool(processes = num_process)
try:
    # Create jobs: one contiguous slice of rows per worker. Every job gets
    # its own copy of the full table because run() writes score columns
    # into it.
    jobs = []
    job_len = int(math.ceil(len(df_data) / num_process))
    for x in range(0, num_process):
        part = df_data[x * job_len : x * job_len + job_len]
        jobs.append({"df_data": df_data.copy(), "df_part": part})
        print('Thread Data len: ' + str(len(part)))

    print("Total len: " + str(len(df_data)))
    print("")

    # Run the workers; map() blocks until every job has returned.
    matchlist_list = pool.map(run, jobs)
    pool.close()
except BaseException:
    # BaseException also covers KeyboardInterrupt: stop the workers right
    # away instead of leaving them running in the background.
    pool.terminate()
    raise
finally:
    # join() is valid here because close() or terminate() was called above.
    pool.join()

# Concatenate the per-worker match lists into one table.
ds_matchlist_new = pd.concat(matchlist_list)
#print(str(len(matchlist_list)))
print("")
print("len ds_mathclist_new " + str(len(ds_matchlist_new)))

# Time spent since start_time was taken.
elapsed_time = time.time() - start_time

print('\nFinished in: ' + str(round(elapsed_time / 60, 2)) + ' minutes')
    


Start fuzzy matcher THREADS hco 0.5
Cores detected: 4
Threads started: 8
Rows to match: 11981
Start time: 2019-03-07 13:50:36.389041
Thread started
Thread Data len: 1498
Thread Data len: 1498
Thread Data len: 1498
Thread Data len: 1498
Thread Data len: 1498
Thread Data len: 1498
Thread Data len: 1498
Thread Data len: 1495
Total len: 11981

Thread started
Thread started
Thread started
Thread started
Thread started
Thread started
Thread started
Progress: 1.34%

KeyboardInterrupt: 

Process ForkPoolWorker-23:
Process ForkPoolWorker-22:
Process ForkPoolWorker-17:
Process ForkPoolWorker-21:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-19:
  File "/Libra

  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-8-3cc8199de9b9>", line 38, in run
    df_data['r_location'] = df_data['location_expand'].apply(lambda x: fuzz.token_set_ratio(x, row['location_expand']))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 44, in mapstar
    return list(map(*args))
  File "/Users/hws/Documents/python/pharmagelder/env/lib/python3.6/site-packages/pandas/core/series.py", line 3194, in apply
    mapped = lib.map_infer(values, f, convert=convert_dtype)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/L

  File "/Users/hws/Documents/python/pharmagelder/env/lib/python3.6/site-packages/pandas/core/series.py", line 767, in __getitem__
    result = self.index.get_value(self, key)
  File "/Users/hws/Documents/python/pharmagelder/env/lib/python3.6/site-packages/fuzzywuzzy/fuzz.py", line 163, in _token_set
    ratio_func(combined_1to2, combined_2to1)
  File "/Users/hws/Documents/python/pharmagelder/env/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 3118, in get_value
    tz=getattr(series.dtype, 'tz', None))
  File "/Users/hws/Documents/python/pharmagelder/env/lib/python3.6/site-packages/fuzzywuzzy/StringMatcher.py", line 64, in ratio
    self._ratio = ratio(self._str1, self._str2)
KeyboardInterrupt
  File "/Users/hws/Documents/python/pharmagelder/env/lib/python3.6/site-packages/fuzzywuzzy/utils.py", line 47, in decorator
    return func(*args, **kwargs)
KeyboardInterrupt
  File "/Users/hws/Documents/python/pharmagelder/env/lib/python3.6/site-packages/fuzzywuzzy/utils.py", lin

In [None]:
#Drop columns
#df_data.drop(['r_name', 'r_location', 'r_address'], axis=1, inplace=True)

In [None]:
# Pick the output locations: the repository's data folder inside the git
# checkout, otherwise the current working directory.
if on_git:
    matches_path = '../../data/3. transformation/4_%s_matches.csv' % run_for
    matchlist_path = '../../data/3. transformation/4_%s_matchlist.csv' % run_for
else:
    matches_path = '4_%s_matches.csv' % run_for
    matchlist_path = '4_%s_matchlist.csv' % run_for

# The data table keeps its index in the file; the match list is written
# without one.
df_data.to_csv(matches_path, index=True)
ds_matchlist_new.to_csv(matchlist_path, index=False)