In [1]:
import subprocess
from pathlib import Path
import glob, re

In [None]:
###### Install BioTex (run once, from a shell)
# git clone https://gitlab.irstea.fr/jacques.fize/biotex_python.git
# cd biotex_python
# sudo pip3 install .

In [None]:
# Term-extraction packages
from biotex import BiotexWrapper
#EDA Pkgs
import pandas as pd

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from nltk.corpus import stopwords

In [None]:
###extract text for BioTex input
def convert_pdf_to_txt(src_file_path):
    """
        Extract the raw text of a PDF through an external call to pdftotext.
        -q : no error messages in the output.
         - : send the output to the console instead of a text file.

        The text output of the process is captured and decoded as UTF-8.

        @type  src_file_path: String.
        @param src_file_path: Path of the source file.

        @rtype: String.
        @return: Raw text.
    """
    command = ["pdftotext", "-q", src_file_path, "-"]
    raw_bytes = subprocess.run(command, stdout=subprocess.PIPE).stdout
    return raw_bytes.decode('utf-8')

In [None]:
# clean docs
def cleanhtml(raw_text, remove_punc=False, lower=False):
    """
    Strip HTML tags, bracketed single-digit markers and URLs from a text,
    then normalise it for BioTex.

    raw_text : str
        text in its raw form (may contain HTML tags)
    remove_punc : bool
        if True, replace every run of characters outside A-Z, a-z, 0-9 by a
        single space
    lower : bool
        if True, lowercase the result

    Returns the cleaned text with leading/trailing whitespace stripped.
    """
    # Raw strings avoid invalid-escape warnings; the dots of the URL pattern
    # are escaped so they only match a literal '.', not any character.
    patterns = (
        re.compile(r'<.*?>'),          # HTML tags
        re.compile(r'\[\d\]'),         # markers such as [3]
        re.compile(r'www\.\S+\.com'),  # www.<something>.com URLs
    )

    clean_text = raw_text
    for pattern in patterns:
        clean_text = pattern.sub('', clean_text)

    # Special characters causing problems with BioTex.
    # They are literal characters, so delete them without going through the
    # regex engine (no risk of one of them being read as a metacharacter).
    to_remove = ['\n', '\t', '"', 'ã', '€', '\xa0']
    clean_text = clean_text.translate(str.maketrans('', '', ''.join(to_remove)))

    # Add whitespace after a dot that is directly followed by a non-space.
    clean_text = re.sub(r"\.(?=\S)", ". ", clean_text)

    if remove_punc:
        # NOTE: the class is ASCII-only, so accented letters are replaced too.
        clean_text = re.sub(r'[^A-Za-z0-9]+', ' ', clean_text)

    if lower:
        clean_text = clean_text.lower()

    return clean_text.strip()

In [None]:
# src_file_path = "/home/rodrique/Bureau/Jupyter-notebook/herelles/corpus_experts/Urbanisme/Etude-urbaine-et-paysagere-A9-deplacee.pdf"
# convert_pdf_to_txt(src_file_path)

In [None]:
urb = "./corpus_experts/Urbanisme/*.pdf"
# Include the slash before *.pdf or glob will search in the wrong directory.
# glob returns matches in arbitrary order; sorting keeps the corpus order
# (and therefore the generated corpus file) reproducible between runs.
urb_file_list = sorted(glob.glob(urb))

In [None]:
risq = "./corpus_experts/Risques naturels/*.pdf"
# Include the slash before *.pdf or glob will search in the wrong directory.
# glob returns matches in arbitrary order; sorting keeps the corpus order
# (and therefore the generated corpus file) reproducible between runs.
risq_file_list = sorted(glob.glob(risq))

In [None]:
# file_list

In [None]:
def _build_corpus(pdf_paths, corpus_path):
    """
    Convert every PDF once, write the raw texts to `corpus_path` and return
    a dict mapping each file stem to its cleaned text.

    The corpus file is rewritten from scratch so that re-running the cell
    does not append a second copy of every document. Nothing is written
    when `pdf_paths` is empty, so an existing corpus is not wiped by an
    empty glob result.
    """
    cleaned_docs = {}
    raw_texts = []
    for pdf_path in pdf_paths:
        fnamesrc = Path(pdf_path).stem
        print(fnamesrc)
        # pdftotext is an external process: run it once and reuse the text.
        raw_text = convert_pdf_to_txt(pdf_path)
        cleaned_docs[fnamesrc] = cleanhtml(raw_text)
        raw_texts.append(raw_text)

    if raw_texts:
        corpus_path = Path(corpus_path)
        corpus_path.parent.mkdir(parents=True, exist_ok=True)
        with open(corpus_path, 'w') as corpus_file:
            for raw_text in raw_texts:
                corpus_file.write("%s\n" % raw_text)
                corpus_file.write("\n##########END##########\n")

    return cleaned_docs


urb_doc = _build_corpus(urb_file_list, './corpus_experts/terms_urb/urb_docB.txt')
risq_doc = _build_corpus(risq_file_list, './corpus_experts/terms_risq/risq_docB.txt')

In [None]:
# File stems of the urbanism PDFs that were converted.
urb_doc.keys()

In [None]:
# File stems of the natural-risk PDFs that were converted.
risq_doc.keys()

In [None]:
#### Extract terms with BioTEX
def biotex_terms_extractor(corpus, language):
    """
    Run BioTex on a corpus file once per ranking measure and save each result.

    corpus : str
        path of the text file holding the corpus
    language : str
        language passed to BiotexWrapper (e.g. 'french')

    For each measure ('C_value', 'F-TFIDF-C_M') the result is written,
    tab-separated, to '<corpus stem>_<measure>.csv' in the directory of the
    corpus file, so each corpus keeps its outputs next to it.
    """
    corpus_path = Path(corpus)

    # Read the corpus once (it does not change between measures) and make
    # sure the file handle is closed.
    with open(corpus_path, 'r') as corpus_file:
        content = corpus_file.read()

    for measure in ('C_value', 'F-TFIDF-C_M'):
        wrapper = BiotexWrapper(language=language, score=measure)
        # The whole corpus is passed as a single document.
        data = wrapper.terminology([content])
        out_name = corpus_path.stem + '_' + measure + ".csv"
        data.to_csv(corpus_path.with_name(out_name), sep='\t')

In [None]:
# for k in urb_doc.keys():
#     content = urb_doc[k]
#     biotex_terms_extractor(content, 'french')

In [2]:
# Raw-text corpus files written by the PDF extraction cell (one per domain).
corpus_urb = "./corpus_experts/terms_urb/urb_docB.txt"
corpus_risq = "./corpus_experts/terms_risq/risq_docB.txt"

In [None]:
# Extract terms from the urbanism corpus (one CSV per ranking measure).
biotex_terms_extractor(corpus_urb, 'french')

In [None]:
# cv = "./corpus_experts/terms_risq/risq_docB_C_value.csv"  
# tf = "./corpus_experts/terms_risq/risq_docB_F-TFIDF-C_M.csv"

# Tab-separated term tables for the urbanism corpus, one per ranking measure.
cv = "./corpus_experts/terms_urb/urb_docB_C_value.csv"
tf = "./corpus_experts/terms_urb/urb_docB_F-TFIDF-C_M.csv"
df1, df2 = [pd.read_csv(csv_path, sep='\t') for csv_path in (cv, tf)]

In [None]:
# Keep only the term and its rank from each table, then gather both for merging.
df1, df2 = df1[['term', 'rank']], df2[['term', 'rank']]
DF = [df1, df2]

In [None]:
from functools import reduce
# Chain left joins on 'term': start from the first table and join every
# following table onto the running result.
df = DF[0]
for right_df in DF[1:]:
    df = pd.merge(df, right_df, how="left", on=['term'])


In [None]:
# Merged table: one row per term of the first table, one rank column per measure.
df

In [None]:
# Row-wise mean of the rank columns. Only numeric columns are averaged:
# 'term' is text, and pandas >= 2.0 raises a TypeError instead of silently
# skipping it. 'average' itself is excluded so re-running the cell gives the
# same result.
rank_values = df.select_dtypes('number').drop(columns=['average'], errors='ignore')
df['average'] = rank_values.mean(axis=1)

In [None]:
# Order the terms by their averaged value, highest first, and display the result.
final_df = df.sort_values(by='average', ascending=False)
final_df

In [None]:
# Persist the sorted table (comma-separated, without the index column).
final_df.to_csv('./corpus_experts/terms_urb/final_terms.csv', index=False)

In [None]:
# read kwds list
def read_kwd(txtfile):
    """
    Read a keyword file and return its lowercased lines as a list.

    txtfile : str
        path of a text file with one keyword per line

    The content is split on newline characters, so a file ending with a
    newline yields a final empty string in the returned list.
    """
    # Context manager so the file handle is always closed.
    with open(txtfile) as f:
        content = f.read()
    return content.lower().split('\n')

In [None]:
# terms semantic similarity evaluation
def eval_terms(corpus, queries, top_k):
    """
    Score each query by its mean cosine similarity to its closest corpus entries.

    corpus : list of str
        reference sentences/terms, embedded once
    queries : iterable of str
        terms to score
    top_k : int
        number of best-matching corpus entries averaged per query; capped at
        the number of available similarity scores

    Returns a dict {query: mean of the top similarity scores}, each score
    being rounded to 4 decimals before averaging.

    Raises ValueError when top_k < 1 or the corpus is empty.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    if len(corpus) == 0:
        raise ValueError("corpus must not be empty")

    embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

    mean_scores = {}
    for query in queries:
        query_embedding = embedder.encode(query, convert_to_tensor=True)
        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
        cos_scores = cos_scores.cpu()

        # argpartition rejects a k beyond the array length, so never ask for
        # more results than there are scores.
        k = min(top_k, len(cos_scores))

        # We use np.argpartition to only partially sort the top k results.
        top_results = np.argpartition(-cos_scores, range(k))[:k]

        top_scores = [float('%.4f' % (cos_scores[idx])) for idx in top_results]
        mean_scores[query] = sum(top_scores) / len(top_scores)

    return mean_scores

In [None]:
# main function for similarity measure
def core_biotT_and_expertT(t_file, kwd_file, n_first, max_terms=1000):
    """
    Score extracted terms against expert keywords and save the ranking.

    t_file : str
        CSV (tab- or comma-separated) with a 'term' column
    kwd_file : str
        text file of expert keywords, one per line
    n_first : int
        number of top similarity scores averaged per term (see eval_terms)
    max_terms : int
        only the first `max_terms` remaining terms are scored

    Writes '<t_file stem>_Bert.csv' in the directory of `t_file`, with
    columns 'term' and 'rank' (the similarity score), sorted by descending
    'rank'. Returns None.
    """
    terms_path = Path(t_file)

    # Drop blank entries explicitly rather than slicing off the last item,
    # which loses a real keyword when the file has no trailing newline.
    kw_list = [kw.strip() for kw in read_kwd(kwd_file) if kw.strip()]

    terms = pd.read_csv(t_file, sep='\t|,', engine='python')

    # Load the stopword list once; a set makes each membership test O(1).
    french_stopwords = set(stopwords.words('french'))
    candidates = [
        word for word in terms['term'].dropna()
        if word and word not in french_stopwords
    ]

    d_bert = eval_terms(kw_list, candidates[:max_terms], n_first)

    scores = pd.DataFrame(list(d_bert.items()), columns=['term', 'rank'])
    final_df = scores.sort_values(by=['rank'], ascending=False)

    # with_name keeps the output beside t_file, including when t_file has
    # no directory part.
    final_df.to_csv(terms_path.with_name(terms_path.stem + '_Bert.csv'), index=False)

In [None]:
# score_biotT_and_expertT(t_file, kwd_file, n_first)

In [None]:
# biotex_out = "./corpus_experts/terms_risq/final_terms.csv"
# expert_concept = './Herelles_ress/termes_graines_natural.dangers.txt'  

# Extracted-term table and expert seed terms for the urbanism domain.
biotex_out = "./corpus_experts/terms_urb/final_terms.csv"
expert_concept = "./Herelles_ress/termes_graines_urbanisme.txt"

# Number of top similarity scores averaged per term.
n_first = 10
core_biotT_and_expertT(biotex_out, expert_concept, n_first) # returns nothing; writes final_terms_Bert.csv beside biotex_out

In [None]:
####Check if terms already exist in the expert terms
# Expert seed-term files, one per domain.
expert_concept_risq = './Herelles_ress/termes_graines_natural.dangers.txt'  
expert_concept_urb = "./Herelles_ress/termes_graines_urbanisme.txt"

In [None]:
# Term tables ranked by the averaged BioTex measures (no similarity scoring).
no_sim_risq = "./corpus_experts/terms_risq/final_terms.csv"
no_sim_urb = "./corpus_experts/terms_urb/final_terms.csv"

In [None]:
# Term tables re-ranked by similarity to the expert terms ('_Bert' outputs).
sim_risq = "./corpus_experts/terms_risq/final_terms_Bert.csv"
sim_urb = "./corpus_experts/terms_urb/final_terms_Bert.csv"

In [None]:
# Lowercased expert terms per domain (read_kwd lowercases the file content).
expert_concept_risq_list = read_kwd(expert_concept_risq)
expert_concept_urb_list = read_kwd(expert_concept_urb)

In [None]:
# expert_concept_risq_list

In [None]:
# 'term' column of the natural-risk table, as a plain Python list.
no_sim_risq_list = pd.read_csv(no_sim_risq)['term'].to_list()

In [None]:
# 'term' column of the urbanism table, as a plain Python list.
no_sim_urb_list = pd.read_csv(no_sim_urb)['term'].to_list()

In [None]:

# 'term' column of the similarity-ranked urbanism table, as a plain Python list.
sim_urb_list = pd.read_csv(sim_urb)['term'].to_list()

In [None]:
# 'term' column of the similarity-ranked natural-risk table, as a plain Python list.
sim_risq_list = pd.read_csv(sim_risq)['term'].to_list()

In [None]:
def list_to_csv(some_list, fname):
    """
    Save a list as a one-column CSV file.

    some_list : list
        values to write, one per row
    fname : str
        destination path; the column header is "Termes" and no index
        column is written
    """
    pd.DataFrame(some_list, columns=["Termes"]).to_csv(fname, index=False)

In [None]:
###################
# Keep only the natural-risk terms the experts have not already listed.
_known_risq_terms = set(expert_concept_risq_list)
final_no_sim_risq_list = [term for term in no_sim_risq_list if term not in _known_risq_terms]

In [None]:
# Discard falsy entries (e.g. empty strings), then save the remaining terms.
final_no_sim_risq_list = list(filter(None, final_no_sim_risq_list))
list_to_csv(final_no_sim_risq_list, './corpus_experts/terms_risq/final_no_sim_risq_list.csv')

In [None]:
# Keep only the urbanism terms the experts have not already listed.
_known_urb_terms = set(expert_concept_urb_list)
final_no_sim_urb_list = [term for term in no_sim_urb_list if term not in _known_urb_terms]

In [None]:
# Save the new (non-expert) urbanism terms.
list_to_csv(final_no_sim_urb_list, './corpus_experts/terms_urb/final_no_sim_urb_list.csv')

In [None]:
###################"
# Similarity-ranked natural-risk terms not already in the expert list, saved to CSV.
_known_risq_terms = set(expert_concept_risq_list)
final_sim_risq_list = [term for term in sim_risq_list if term not in _known_risq_terms]
list_to_csv(final_sim_risq_list, './corpus_experts/terms_risq/final_sim_risq_list.csv')

In [None]:
# Similarity-ranked urbanism terms not already in the expert list, saved to CSV.
_known_urb_terms = set(expert_concept_urb_list)
final_sim_urb_list = [term for term in sim_urb_list if term not in _known_urb_terms]
list_to_csv(final_sim_urb_list, './corpus_experts/terms_urb/final_sim_urb_list.csv')