In [None]:
from os.path import join

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
class Rmatch():
    """Match the rows of one table against the rows of another by text similarity.

    Selected text columns of both tables are merged per row, vectorized with
    TF-IDF, and compared with cosine similarity. For every row of
    ``records_x`` the most similar rows of ``records_y`` are collected as
    candidates, then re-scored and reduced to the best one(s).

    Typical call order::

        rm = Rmatch(records_x, records_y)
        rm.data_preprocessing()
        rm.get_multifields_vectorizer([...])
        rm.get_multifields_cosine_similarity()
        rm.get_candidates(a=0.5)
        rm.get_best_candidates()

    Parameters
    ----------
    records_x : pandas.DataFrame
        Records to match (their columns end up prefixed ``rec2match_``).
    records_y : pandas.DataFrame
        Pool of candidate records (their columns end up prefixed
        ``canditate_``).
    analyser : str, default 'word'
        Forwarded to ``TfidfVectorizer(analyzer=...)``.
    ngram_range : tuple, default (1, 1)
        Forwarded to ``TfidfVectorizer(ngram_range=...)``.
    """

    def __init__(self, records_x, records_y, analyser='word', ngram_range=(1,1)):
        self.records_x = records_x
        self.records_y = records_y
        # records_x rows come first: get_multifields_vectorizer relies on this
        # order to split the TF-IDF matrix back into its x and y parts.
        self.records = pd.concat([self.records_x, self.records_y])
        self.analyser = analyser
        self.ngram_range = ngram_range

    def get_fields_renamers(self):
        """Build the column-renaming maps used to tell both sides apart.

        Sets ``candidate_fields_renamer`` (``col -> canditate_col``) and
        ``rec2match_fields_renamer`` (``col -> rec2match_col``) for every
        column of ``self.records``.
        """
        columns = list(self.records.columns)
        # NOTE: the "canditate_" spelling is kept on purpose; it is part of the
        # output column names that get_best_candidates (and possibly external
        # code) reads.
        self.candidate_fields_renamer = {c: f"canditate_{c}" for c in columns}
        self.rec2match_fields_renamer = {c: f"rec2match_{c}" for c in columns}

    def data_preprocessing(self):
        """Cast the text columns of ``self.records`` to strings and build the renamers.

        Raises
        ------
        KeyError
            If one of the expected text columns is missing.
        """
        # NOTE(review): astype('U') also stringifies missing values, which then
        # take part in the vectorization as ordinary text - confirm this is
        # intended.
        for column in ('global_title', 'title', 'responsability', 'publisher'):
            self.records[column] = self.records[column].astype('U')

        self.get_fields_renamers()

    @staticmethod
    def _dedupe_tokens(text):
        """Return ``text`` with repeated whitespace-separated tokens removed.

        The first occurrence of each token is kept and the original order is
        preserved, so the result is identical from one run to the next (a
        ``set`` would reorder tokens depending on string hashing).
        """
        return ' '.join(dict.fromkeys(text.split()))

    def get_multifields_vectorizer(self, fields2vect):
        """Fit a TF-IDF vectorizer on the merged content of several columns.

        Parameters
        ----------
        fields2vect : list of str
            Columns of ``self.records`` (string-valued) to merge per row.

        Sets ``self.records['fields2vect']`` and ``self.multifields_v``, a dict
        with keys ``'vectorizer'``, ``'matrix'`` (all records), ``'matrix_x'``
        and ``'matrix_y'`` (the rows belonging to ``records_x`` /
        ``records_y``).
        """
        merged = self.records[fields2vect].apply("-".join, axis=1)
        self.records['fields2vect'] = merged.apply(self._dedupe_tokens)

        self.multifields_v = {}
        self.multifields_v['vectorizer'] = TfidfVectorizer(strip_accents='unicode',
                                                lowercase=True,
                                                ngram_range=self.ngram_range,
                                                analyzer=self.analyser)
        self.multifields_v['matrix'] = self.multifields_v['vectorizer'].fit_transform(self.records['fields2vect'])

        # self.records is records_x followed by records_y (see __init__), so
        # the first len(records_x) rows of the matrix belong to records_x.
        m = len(self.records_x)
        self.multifields_v['matrix_x'] = self.multifields_v['matrix'][:m]
        self.multifields_v['matrix_y'] = self.multifields_v['matrix'][m:]

    def get_multifields_cosine_similarity(self):
        """Compute the cosine similarity of every x record against every y record.

        Stores the result in ``self.cosine_similarity`` (one row per
        ``records_x`` row, one column per ``records_y`` row).
        """
        self.cosine_similarity = cosine_similarity(self.multifields_v['matrix_x'],
                                                   self.multifields_v['matrix_y'])

    def get_candidates(self, a=0.5, top_n=20):
        """Collect, for each x record, its most similar y records.

        Parameters
        ----------
        a : float, default 0.5
            Only pairs whose similarity is strictly greater than ``a`` are kept.
        top_n : int, default 20
            Maximum number of y records considered per x record.

        Sets ``self.candidates``: one row per retained pair, holding the
        ``rec2match_*`` columns, the ``canditate_*`` columns and the
        similarity in ``'cos'``, ordered by x record then by decreasing
        similarity. It is an empty DataFrame when nothing passes the threshold.

        Side effect: ``self.records_x`` / ``self.records_y`` are replaced by
        copies with prefixed column names.

        Raises
        ------
        ValueError
            If ``top_n`` is smaller than 1.
        """
        if top_n < 1:
            raise ValueError("top_n must be a positive integer")
        # Allow calling this method even if data_preprocessing() was skipped.
        if not hasattr(self, 'candidate_fields_renamer'):
            self.get_fields_renamers()

        self.records_y = self.records_y.rename(columns=self.candidate_fields_renamer)
        self.records_x = self.records_x.rename(columns=self.rec2match_fields_renamer)

        # Gather row positions first and build the frame once: this avoids one
        # Series concatenation per pair and yields a clean 0..n-1 index.
        x_positions, y_positions, scores = [], [], []
        for x_pos, row in enumerate(np.asarray(self.cosine_similarity)):
            ranked = row.argsort()[-top_n:][::-1]  # best match first
            kept = ranked[row[ranked] > a]
            x_positions.extend([x_pos] * len(kept))
            y_positions.extend(kept.tolist())
            scores.extend(row[kept].tolist())

        self.candidates = pd.DataFrame()
        if scores:
            rec2match = self.records_x.iloc[x_positions].reset_index(drop=True)
            candidate = self.records_y.iloc[y_positions].reset_index(drop=True)
            pairs = pd.concat([rec2match, candidate], axis=1)
            pairs['cos'] = scores
            self.candidates = pairs

    def get_best_candidates(self, penalty=0.7):
        """Re-score the candidates and keep the best one(s) per x record.

        The score ``'R'`` starts as ``'cos'`` and is multiplied by ``penalty``
        once for each of ``publication_date`` and ``numero_tome`` whose string
        forms differ between the two records. Rows whose ``'R'`` equals the
        maximum of their ``rec2match_record_id`` group are stored in
        ``self.best_candidates`` (ties are all kept).

        Parameters
        ----------
        penalty : float, default 0.7
            Multiplier applied to the score for each mismatching field.

        ``self.best_candidates`` is an empty DataFrame when there are no
        candidates (or ``get_candidates`` has not been run).
        """
        # Always define/reset the result so callers never read a missing or
        # stale attribute.
        self.best_candidates = pd.DataFrame()
        if not hasattr(self, "candidates") or len(self.candidates) == 0:
            return

        self.candidates['R'] = self.candidates['cos']

        # NOTE(review): values are compared as strings, so 1880 and 1880.0
        # count as different - confirm both sources share the same dtype.
        for field in ('publication_date', 'numero_tome'):
            differs = (self.candidates[f'canditate_{field}'].astype(str)
                       != self.candidates[f'rec2match_{field}'].astype(str))
            self.candidates.loc[differs, 'R'] *= penalty

        # String alias instead of the builtin max (deprecated as a transform
        # argument in recent pandas).
        group_best = self.candidates.groupby(['rec2match_record_id'])['R'].transform('max')
        self.best_candidates = self.candidates[group_best == self.candidates['R']]

In [None]:
# Directory holding the input CSV files (mounted Google Drive path).
# Change this single constant to run the notebook from another location.
DATA_DIR = "/content/drive/MyDrive/rbx/rbx-align/data/csv"

# First source: the "bnf" records.
data_bnf_file = f"{DATA_DIR}/bnf_mon_1875_1894.csv.gz"
data_bnf = pd.read_csv(data_bnf_file)

# Second source: the "rbx" records.
data_rbx_file = f"{DATA_DIR}/2023-04-02-destombes_1880.csv.gz"
data_rbx = pd.read_csv(data_rbx_file)

# Both sources stacked in one frame (bnf rows first, original indexes kept).
data = pd.concat([data_bnf, data_rbx])

In [None]:
# The rbx records are the ones to match (records_x); the bnf records form
# the pool of candidates (records_y).
rmatch = Rmatch(records_x=data_rbx, records_y=data_bnf)
rmatch.data_preprocessing()

# Text columns merged per record before TF-IDF vectorization.
fields_to_compare = ['global_title', 'title', 'publisher', 'responsability']
rmatch.get_multifields_vectorizer(fields2vect=fields_to_compare)
rmatch.get_multifields_cosine_similarity()

In [None]:
# Keep the pairs whose cosine similarity exceeds the threshold, then retain
# the top-scoring candidate(s) for each record to match.
similarity_threshold = 0.5
rmatch.get_candidates(a=similarity_threshold)
rmatch.get_best_candidates()