In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from tqdm import tqdm
from difflib import SequenceMatcher
import time
import glob 
import concurrent.futures
import json
import re
import html
import boto3
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

2021-09-15 08:54:32.422728: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-15 08:54:32.422781: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# sources
- https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f
- https://github.com/jamescalam/transformers/blob/main/course/language_classification/04_window_method_in_pytorch.ipynb
- https://towardsdatascience.com/benchmark-ner-algorithm-d4ab01b2d4c3
- https://medium.com/@armandj.olivares/using-bert-for-classifying-documents-with-long-texts-5c3e7b04573d
- https://towardsdatascience.com/calculating-string-similarity-in-python-276e18a7d33a
- https://huggingface.co/Jean-Baptiste/camembert-ner?text=Apple+est+cr%C3%A9%C3%A9e+le+1er+avril+1976

# Instanciation d'un pipeline

Instanciation d'un pipeline contenant un tokenizer ainsi qu'un modèle BERT préentraînés :

In [2]:
tokenizer = AutoTokenizer.from_pretrained("Jean-Baptiste/camembert-ner")
model = AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/camembert-ner")
nlp = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Application de BERT pour un long document 

Nous allons __segmenter le document d'entrée en textes plus petits__ (~200 mots chacun) et appliquer BERT à chacun d'eux. 

Chaque texte aura ~ 25 mots en commun avec le texte précédent, afin d'éviter une __perte d'information__, dans le cas où une organisation se trouverait tronquée en fin de texte.

In [5]:
with open ("text_folder/textfromid_1.txt", "r") as f:
    raw_text = f.read().replace('\n', '') 

## Tokenisation avec superposition

In [6]:
def get_chunks_with_overlap(raw_text, group_size=150, overlap_size=25):
    """Fonction qui split un document en sous-documents (= chunks) de 150 tokens chacun, avec un overlap de 25 tokens."""
    
    tokenized_text = raw_text.split(sep=' ')
    list_of_tokenized_chunk = [tokenized_text[i:i+group_size] for i in range(0, len(tokenized_text), group_size-overlap_size)]
    list_of_chunk = [' '.join(list_of_tokenized_chunk[i]) for i in range(len(list_of_tokenized_chunk))]
    return list_of_chunk

In [7]:
list_of_chunk = get_chunks_with_overlap(raw_text)

In [9]:
list_of_chunk[0]

"ID : WnqmjtUDCrRW_C0022A1001L029433D20161121H171206TPIJTES003PDBOR.pdf. GREFFE .DU TRIBUNALOMMERCERCS : THONON LES BAINSCode greffe : 7402Actes des sociétés, ordonnances rendues en matière de société, actes des personnes physiquesREGISTRE DU COMMERCE ET DES SOCIETESLe greffier du tribunal de commerce de THONON LES BAINS atteste l'exactitude desinformations transmises ci-aprèsNature du document : Actes des sociétés (A)Numéro de gestion : 2007 B 00457Numéro SIREN : 499 399 806Nom ou dénomination : LUDIMMOCe dépôt a été enregistré le 01/11/2016 sous le numéro de dépôt 3394DuplicataRECEPISSE DE DEPOTGREFFE DUTRIBUNAL DE COMMERCEDE THONON-LES-BAINS10, Rue de l'Hotel-Dieu - BP 6052174203 THONON LES BAINS CEDEXTel : 04.50.72.13.20SELARL IXA37 rue CassiopéeParc Altaïs74650 CHAVANODV/REF :N/REF : 2007 B 457 / 2016-A-3394Le Greffier du Tribunal de Commerce DE THONON-LES-BAINS certifie qu'il a reçu le 19/10/2016, les actessuivants :Procès-verbal d'assemblée générale extraordinaire en date du 11/

In [10]:
list_of_chunk[1]

"générale extraordinaire en date du 11/10/2016- Réduction du capital socialConcernant la sociétéLUDIMMOSociété à responsabilité limitée13 place Jules Mercier74200 Thonon-les-BainsLe dépôt a été enregistré sous le numéro 2016-A-3394 le 01/11/2016R.C.S. THONON 499 399 806 (2007 B 457)Fait à THONON-LES-BAINS le 01/11/2016,L'un des Greffiers AssociésTHONON LES BAINS * (GIOTRIBUNAL DE COMME(HTE SAVOIE) * SNITLUDIMMOSociété à responsabilité limitéeau capital de 8 000 EurosSiège social : 13 Place Jules Mercier74200 THONON LES BAINS499 399 806 RCS THONONDELIBERATIONS DE L'ASSEMBLEE GENERALE EXTRAORDINAIREEN DATE DU 11 OCTOBRE 2016- 1 -L'an Deux Mille Seize,Le onze octobre à quatorze heures trente,Les associés de la société « LUDIMMO », société à responsabilité limitée au capital de8 000 euros, divise en 500 parts de 16 euros chacune, se sont réunis en assemblée généraleextraordinaire, au siège social, sur convocation de la gérance.SONT PRESENTS :- Monsieur Ludovic GOSSELIN, détenant250 parts- 

In [12]:
len(list_of_chunk)

15

Bien que nous ayons découpé notre document de manière à avoir des petits textes de 150 tokens, le tokenizer ne tokenize pas tout à fait de la même manière, le nombre de tokens associé à chaque texte varie sensiblement d'un texte à l'autre : 

In [13]:
for i in range(len(list_of_chunk)):
    tokens = tokenizer.encode_plus(list_of_chunk[i], add_special_tokens=False, return_tensors='pt')
    print(f"Number of tokens into chunk n°{i+1} : {len(tokens['input_ids'][0])}")

Number of tokens into chunk n°1 : 382
Number of tokens into chunk n°2 : 307
Number of tokens into chunk n°3 : 238
Number of tokens into chunk n°4 : 235
Number of tokens into chunk n°5 : 229
Number of tokens into chunk n°6 : 217
Number of tokens into chunk n°7 : 211
Number of tokens into chunk n°8 : 285
Number of tokens into chunk n°9 : 210
Number of tokens into chunk n°10 : 304
Number of tokens into chunk n°11 : 224
Number of tokens into chunk n°12 : 249
Number of tokens into chunk n°13 : 239
Number of tokens into chunk n°14 : 298
Number of tokens into chunk n°15 : 162


## Application de BERT (threading)

Nous stockons dans la liste `list_entities` les entités de type __organisation__ contenus dans chaque, avec un __score strictement supérieur à 0.95__ :

In [14]:
list_entities = []

t1 = time.perf_counter()

def apply_bert(chunk):
    text_nlp = nlp(chunk)
    only_org_entities = [d for d in text_nlp if d["entity_group"] == "ORG" and d["score"] > 0.95]
    list_entities.append(only_org_entities)
    
with concurrent.futures.ThreadPoolExecutor() as executor:
    list(tqdm(executor.map(apply_bert, list_of_chunk), total=len(list_of_chunk)))
    
t2 = time.perf_counter()
print(f"Finished in {round(t2-t1, 2)} seconds")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:18<00:00,  1.25s/it]

Finished in 18.91 seconds





In [16]:
list_entities

[[],
 [],
 [],
 [{'entity_group': 'ORG',
   'score': 0.9787076,
   'word': 'TIT',
   'start': 111,
   'end': 115}],
 [],
 [{'entity_group': 'ORG',
   'score': 0.9754317,
   'word': 'TIT',
   'start': 780,
   'end': 784}],
 [{'entity_group': 'ORG',
   'score': 0.9515741,
   'word': 'LUDIMMO',
   'start': 95,
   'end': 102},
  {'entity_group': 'ORG',
   'score': 0.97765285,
   'word': 'SNITLUDIMMO',
   'start': 401,
   'end': 413},
  {'entity_group': 'ORG',
   'score': 0.96663314,
   'word': 'LUDIMMO',
   'start': 725,
   'end': 733},
  {'entity_group': 'ORG',
   'score': 0.98905826,
   'word': 'TIT',
   'start': 1009,
   'end': 1013}],
 [{'entity_group': 'ORG',
   'score': 0.9841066,
   'word': 'LUDIMMO',
   'start': 500,
   'end': 508},
  {'entity_group': 'ORG',
   'score': 0.9857166,
   'word': 'LUDIMMO',
   'start': 1056,
   'end': 1063}],
 [{'entity_group': 'ORG',
   'score': 0.98789907,
   'word': 'TIT',
   'start': 257,
   'end': 261},
  {'entity_group': 'ORG',
   'score': 0.99078

On fusionne les listes obtenues pour chaque texte dans une seule liste `list_entities_merged`, puis nous conservons uniquement les éléments uniques dans `list_entities_merged_unique` :

In [24]:
list_entities_merged = [item for sublist in list_entities for item in sublist]
list_entities_merged_unique = list({v['word']:v for v in list_entities_merged}.values())
list_entities_merged_unique

[{'entity_group': 'ORG',
  'score': 0.98986006,
  'word': 'TIT',
  'start': 783,
  'end': 787},
 {'entity_group': 'ORG',
  'score': 0.98635346,
  'word': 'LUDIMMO',
  'start': 752,
  'end': 760},
 {'entity_group': 'ORG',
  'score': 0.97765285,
  'word': 'SNITLUDIMMO',
  'start': 401,
  'end': 413},
 {'entity_group': 'ORG',
  'score': 0.99029267,
  'word': 'T.I.T.A',
  'start': 551,
  'end': 559}]

In [26]:
print(f"Number of candidates for this document: {len(list_entities_merged_unique)}")

Number of candidates for this document: 4


# Fonctions utiles

In [64]:
def get_chunks_with_overlap(raw_text, group_size=150, overlap_size=25):
    """Fonction qui split un document en sous-documents de 150 tokens chacun, avec un overlap de 25 tokens."""
    
    tokenized_text = raw_text.split(sep=' ')
    list_of_tokenized_chunk = [tokenized_text[i:i+group_size] for i in range(0, len(tokenized_text), group_size-overlap_size)]
    list_of_chunk = [' '.join(list_of_tokenized_chunk[i]) for i in range(len(list_of_tokenized_chunk))]
    return list_of_chunk


def get_entities_threading(document):
    """Fonction qui retourne une liste de dictionnaires contenant des informations sur les organisations du document."""
    
    with open (document, "r") as f:
        raw_text = f.read().replace('\n', ' ')
    list_of_chunk = get_chunks_with_overlap(raw_text)
    
    list_entities = []

    def apply_bert(chunk):
        text_nlp = nlp(chunk)
        only_org_entities = [d for d in text_nlp if d["entity_group"] == "ORG" and d["score"] > 0.95]
        list_entities.append(only_org_entities)
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list(tqdm(executor.map(apply_bert, list_of_chunk), total=len(list_of_chunk)))
       
    list_entities_merged = [item for sublist in list_entities for item in sublist]
    list_entities_merged_unique = list({v['word']:v for v in list_entities_merged}.values())
    
    return list_entities_merged_unique


def get_entity_names(output):
    """Fonction qui retourne une liste des organisations en supprimant les doublons exactes."""
    
    list_of_candidates = [d["word"].lower() for d in output]
    return list(set(list_of_candidates))

# Généralisation à 100 documents

In [141]:
list_of_docs = sorted([file for file in glob.glob("text_folder/*.txt")])
list_of_output = [get_entities_threading(document=doc) for doc in list_of_docs]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:18<00:00,  1.09s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:48<00:00,  1.10s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67/67 [01:07<00:00,  1.01s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 78/78 [01:26<00:00,  1.11s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:30<00:00,  1.04it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:41<00:00,  1.02s/it]
100%|█████████████████████████████████████████████████████████████████

In [165]:
list_of_orga = [get_entity_names(output) for output in list_of_output]

In [183]:
# df = pd.DataFrame(list(zip(list_of_docs, list_of_orga)), columns=['text_file', 'organizations'])
# df.to_csv("list_of_orga_100_docs.csv", index=False)
# df = pd.DataFrame(list(zip(list_of_docs, list_of_output)), columns=['text_file', 'ner_output'])
# df.to_csv("list_of_output_100_docs.csv", index=False)

`list_of_orga` est une liste de $n$ listes où chaque liste contient le nom des organisations detectés au sein des $n$ documents

In [188]:
list_of_orga[:4]

[['ludimmo', 't.i.t. a', 't.i.t.', 'tit'],
 ['société civile professionnelle', 'asscoblec', 'société civile immobilière'],
 ['société', 'société civile'],
 ['société civile professionnelle',
  'kj',
  'société civile immobilière',
  'electricite de france',
  'sci kj',
  'sci k.j.']]

# Cosine similarity

\begin{equation}
sim(x,y) = \frac{x \cdot y}{\lVert x \rVert \lVert y \rVert}
\end{equation}

L'objectif de cette partie est de __calculer la cosine similarity entre les organisations que BERT a detecté pour chaque document et la liste de banque__ issue de l'appel API du regafi. L'idée est de mettre de côté les organisations detectées par BERT qui ne sont pas des banques.

In [190]:
with open("banks_folder/banksV2.json", 'r') as f :
    js = json.load(f)
    
list_bank = [html.unescape(entry["registered_name"]).lower() for entry in js]

In [8]:
def cosine_sim_vectors(vec1, vec2):
    vec1 = vec1.reshape(1, -1)
    vec2 = vec2.reshape(1, -1)
    return cosine_similarity(vec1, vec2)[0][0]

def looking_for_bank(list_of_organizations, degree_of_similarity=0.75):
    
    list_of_banks_among_entities = []
    
    orga_and_bank_list = [*list_of_organizations, *list_bank]
    vectorizer = CountVectorizer().fit_transform(orga_and_bank_list)
    vectors = vectorizer.toarray()
    csim = cosine_similarity(vectors)
    
    for i in range(len(list_of_organizations)):
        for j in range(len(list_of_organizations), len(orga_and_bank_list)):
            if cosine_sim_vectors(vectors[i], vectors[j]) >= degree_of_similarity:
                print("Similarity between \033[1m{}\033[0m and \033[1m{}\033[0m : {}".format(orga_and_bank_list[i],
                      orga_and_bank_list[j], cosine_sim_vectors(vectors[i], vectors[j])))
                list_of_banks_among_entities.append(orga_and_bank_list[i])
                list_of_banks_among_entities = list(set(list_of_banks_among_entities))
    print("")            
    print(f"List of banks : {list_of_banks_among_entities}")
    
def get_list_of_bank(list_of_organizations, degree_of_similarity=0.75):
    
    list_of_banks_among_entities = []
    
    orga_and_bank_list = [*list_of_organizations, *list_bank]
    vectorizer = CountVectorizer().fit_transform(orga_and_bank_list)
    vectors = vectorizer.toarray()
    csim = cosine_similarity(vectors)
    
    for i in range(len(list_of_organizations)):
        for j in range(len(list_of_organizations), len(orga_and_bank_list)):
            if cosine_sim_vectors(vectors[i], vectors[j]) >= degree_of_similarity:
                list_of_banks_among_entities.append(orga_and_bank_list[i])
                list_of_banks_among_entities = list(set(list_of_banks_among_entities))
                            
    return list_of_banks_among_entities

In [203]:
for i, orga in enumerate(list_of_orga):
    print(list_of_docs[i])
    looking_for_bank(orga)
    print("-"*100)

text_folder/textfromid_1.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_10.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_100.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_11.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_12.txt
Similarity between [1msociété en nom collectif[0m and [1mnatiocredimurs, société en nom collectif[0m : 0.8944271909999159
Similarity between [1msociété générale[0m and [1msociété générale[0m : 0.9999999999999998
Similarity between [1msociété générale[0m and [1msociété générale - forge[0m : 0.816496580927726
Similarity between [1msociété générale[0m a

Similarity between [1mcrédit mutuel[0m and [1mcrédit mutuel arkéa[0m : 0.816496580927726
Similarity between [1mcrédit mutuel[0m and [1mcrédit mutuel factoring[0m : 0.816496580927726
Similarity between [1mcrédit mutuel[0m and [1mcrédit mutuel leasing[0m : 0.816496580927726
Similarity between [1mcrédit agricole[0m and [1mcrédit agricole s.a.[0m : 0.9999999999999998
Similarity between [1mcrédit agricole[0m and [1mcrédit agricole titres[0m : 0.816496580927726

List of banks : ['crédit mutuel', 'crédit agricole']
----------------------------------------------------------------------------------------------------
text_folder/textfromid_36.txt
Similarity between [1mbnp paribas lease group[0m and [1mbnp paribas lease group[0m : 1.0

List of banks : ['bnp paribas lease group']
----------------------------------------------------------------------------------------------------
text_folder/textfromid_37.txt

List of banks : []
----------------------------------------------


List of banks : ['caisse de crédit agricole']
----------------------------------------------------------------------------------------------------
text_folder/textfromid_50.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_51.txt
Similarity between [1mbanque crédit lyonnais[0m and [1mcrédit lyonnais[0m : 0.816496580927726

List of banks : ['banque crédit lyonnais']
----------------------------------------------------------------------------------------------------
text_folder/textfromid_52.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_53.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_54.txt
Similarity between [1msociété en nom collectif[0m and [1mnatiocredimurs, société en nom colle


List of banks : ['caisse régionale de crédit agricole mutuel']
----------------------------------------------------------------------------------------------------
text_folder/textfromid_58.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_59.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_6.txt
Similarity between [1mbred banque populaire[0m and [1mbred - banque populaire[0m : 1.0000000000000002
Similarity between [1mbred banque populaire. société coopérative de banque populaire[0m and [1mbred - banque populaire[0m : 0.8333333333333335
Similarity between [1mbanque populaire[0m and [1mbanque centrale populaire[0m : 0.816496580927726
Similarity between [1mbanque populaire[0m and [1mbanque populaire mediterranee[0m : 0.816496580927726
Similarity between [1mbanque populaire[0m


List of banks : ['crédit agricole']
----------------------------------------------------------------------------------------------------
text_folder/textfromid_8.txt

List of banks : []
----------------------------------------------------------------------------------------------------
text_folder/textfromid_80.txt
Similarity between [1msc crédit agricole[0m and [1mcrédit agricole s.a.[0m : 0.816496580927726
Similarity between [1mcrédit agricole[0m and [1mcrédit agricole s.a.[0m : 0.9999999999999998
Similarity between [1mcrédit agricole[0m and [1mcrédit agricole titres[0m : 0.816496580927726

List of banks : ['sc crédit agricole', 'crédit agricole']
----------------------------------------------------------------------------------------------------
text_folder/textfromid_81.txt
Similarity between [1mbnp paribas[0m and [1mbgl bnp paribas[0m : 0.816496580927726
Similarity between [1mbnp paribas[0m and [1mbnp paribas[0m : 0.9999999999999998
Similarity between [1mbnp 

Similarity between [1mbred banque populaire[0m and [1mbred - banque populaire[0m : 1.0000000000000002
Similarity between [1mbred banque populaire banque populaire[0m and [1mbanque centrale populaire[0m : 0.769800358919501
Similarity between [1mbred banque populaire banque populaire[0m and [1mbanque populaire mediterranee[0m : 0.769800358919501
Similarity between [1mbred banque populaire banque populaire[0m and [1mbanque populaire occitane[0m : 0.769800358919501
Similarity between [1mbred banque populaire banque populaire[0m and [1mbred - banque populaire[0m : 0.9622504486493763
Similarity between [1mbred banque populaire banque populaire[0m and [1mcasden banque populaire[0m : 0.769800358919501

List of banks : ['bred banque populaire banque populaire', 'bred banque populaire']
----------------------------------------------------------------------------------------------------
text_folder/textfromid_99.txt
Similarity between [1mbanque société générale[0m and [

In [207]:
list_of_bank = [get_list_of_bank(orga) for orga in tqdm(list_of_orga)]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [12:45<00:00,  7.65s/it]


## Récupération des pdf_id

In [229]:
list_of_pdf_id = []
for doc in list_of_docs:
    with open(doc) as f:
        first_line = f.readline()
        pdf_id = re.findall('\w+.pdf', first_line)[0]
        list_of_pdf_id.append(pdf_id)

## Génération d'un lien vers le bucket S3

In [268]:
client = boto3.client("s3",
    aws_access_key_id = "AKIA3OMEZM6WKODIVOGG",
    aws_secret_access_key = "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",
    region_name='eu-west-3')

def get_url_to_s3(s3path):
    url = client.generate_presigned_url(
            ClientMethod='get_object', 
            Params={'Bucket': 'inpi-document', 'Key': s3path},
            ExpiresIn=3600*168) # lien valable 1 semaine (normalement)
    return url

## Rassemblement de l'ensemble des informations dans une table

In [267]:
df = pd.DataFrame({"text": list_of_docs, "pdf_id": list_of_pdf_id, "estimated_banks": list_of_bank})
df['s3_path'] = df['pdf_id'].apply(get_url_to_s3)
df.head()

Unnamed: 0,text,pdf_id,estimated_banks
0,text_folder/textfromid_1.txt,WnqmjtUDCrRW_C0022A1001L029433D20161121H171206...,[]
1,text_folder/textfromid_10.txt,mnFCe1s7nxBw_RACT2_330842709_7501_1996D02804_2...,[]
2,text_folder/textfromid_100.txt,1kfLTqLgknxi_RACT0400804187.pdf,[]
3,text_folder/textfromid_11.txt,NCck02WzUuqj_RACT0400910929.pdf,[]
4,text_folder/textfromid_12.txt,NMQZ3EapNHjr_RACT0800753139.pdf,"[société en nom collectif, société générale]"


In [257]:
df.to_csv("final_table.csv", index=False)

# Suppression des banques estimées trop similaires (bloqué)

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("final_table.csv")

In [3]:
df.tail()

Unnamed: 0,text,pdf_id,estimated_banks,s3_path
95,text_folder/textfromid_95.txt,QDCzqSoFoilJ_C0022A1001L849447D20161114H165457...,[],https://inpi-document.s3.amazonaws.com/QDCzqSo...
96,text_folder/textfromid_96.txt,BkrpPkdBawrG_RACT0401005103.pdf,['banque de france'],https://inpi-document.s3.amazonaws.com/BkrpPkd...
97,text_folder/textfromid_97.txt,58371707000000_C0022A1001L382154D20140809H0721...,"['la société générale, société anonyme']",https://inpi-document.s3.amazonaws.com/5837170...
98,text_folder/textfromid_98.txt,LQkAEYQbGi7i_C0023A1001M000515D20200806H053548...,"['bred banque populaire banque populaire', 'br...",https://inpi-document.s3.amazonaws.com/LQkAEYQ...
99,text_folder/textfromid_99.txt,59564273000000_C0022A1001L080279D20140812H2338...,"['banque cic est s.a.', 'crédit lyonnais', 'ba...",https://inpi-document.s3.amazonaws.com/5956427...


In [155]:
list_of_bank = ['deutsche leasing france operating s.a.s.',
                'abn amro. bank',
                'deutsche leasing france operating sas',
                'abn amro bank abn amro bank',
                'abn amro bank n.v',
                'deutsche leasing france operating']


vectorizer = CountVectorizer().fit_transform(list_of_bank)
vectors = vectorizer.toarray()
csim = cosine_similarity(vectors)
csim

array([[1.        , 0.        , 0.89442719, 0.        , 0.        ,
        1.        ],
       [0.        , 1.        , 0.        , 1.        , 1.        ,
        0.        ],
       [0.89442719, 0.        , 1.        , 0.        , 0.        ,
        0.89442719],
       [0.        , 1.        , 0.        , 1.        , 1.        ,
        0.        ],
       [0.        , 1.        , 0.        , 1.        , 1.        ,
        0.        ],
       [1.        , 0.        , 0.89442719, 0.        , 0.        ,
        1.        ]])

In [156]:
csim[:, 0]

array([1.        , 0.        , 0.89442719, 0.        , 0.        ,
       1.        ])

In [142]:
# csim[np.triu_indices(len(csim), k = 1)]

In [143]:
# np.argwhere((csim >= 0.75) & (csim < 0.99))

In [158]:
for bank in list_of_bank:
    print(bank)

deutsche leasing france operating s.a.s.
abn amro. bank
deutsche leasing france operating sas
abn amro bank abn amro bank
abn amro bank n.v
deutsche leasing france operating


In [157]:
result_list = []
removing_list = []

for i in range(len(list_of_bank)):
    for j in range(len(list_of_bank)):
        if i != j and cosine_sim_vectors(vectors[i], vectors[j]) >= 0.75:
            print(f"i={i}, j={j}")
            print(f"{list_of_bank[i]} VS {list_of_bank[j]} : {cosine_sim_vectors(vectors[i], vectors[j])}")
            print("-"*100)

i=0, j=2
deutsche leasing france operating s.a.s. VS deutsche leasing france operating sas : 0.8944271909999159
----------------------------------------------------------------------------------------------------
i=0, j=5
deutsche leasing france operating s.a.s. VS deutsche leasing france operating : 1.0
----------------------------------------------------------------------------------------------------
i=1, j=3
abn amro. bank VS abn amro bank abn amro bank : 1.0000000000000002
----------------------------------------------------------------------------------------------------
i=1, j=4
abn amro. bank VS abn amro bank n.v : 1.0000000000000002
----------------------------------------------------------------------------------------------------
i=2, j=0
deutsche leasing france operating sas VS deutsche leasing france operating s.a.s. : 0.8944271909999159
----------------------------------------------------------------------------------------------------
i=2, j=5
deutsche leasing france ope