In [None]:
import tika
from tika import parser
import re
import glob
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk
import itertools

import warnings
warnings.filterwarnings("ignore")
from random import randint

import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
# The bundled seaborn styles were renamed to 'seaborn-v0_8*' in Matplotlib 3.6 and
# the old 'seaborn' name was later removed; fall back to it only on older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

#from gensim.models import KeyedVectors
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report, average_precision_score

In [None]:
# Collect every PDF in the scrape folder and parse each of them with Tika.
path = r'/Users/robinmasliah/Documents/BanffMining/biopsy_scrap/'
filenames = glob.glob(path + "/*.pdf")

# One parser result per file, in the same order as `filenames`.
pdfs = [parser.from_file(pdf_path) for pdf_path in filenames]

In [None]:
filenames

In [None]:
# To be completed
# fibrinoid necrosis => v3
# transmural inflammation => v3

# Lookup table: lesion code (key) -> phrases that are mapped to that code (value).
# get_banff_code() returns the FIRST key whose list contains a phrase, so for
# phrases listed under several keys ('vessels', 'arteries', 'arterioles' appear
# under "cv", "v" and "ah") only the first key in dict order can ever be returned.
# Order also matters to any consumer that substitutes phrases one after another in
# dict order: a short phrase that is a substring of a later one (e.g. 'glomeruli'
# vs 'glomerulitis', 'tubulointerstitial' vs 'tubulointerstitial fibrosis') is
# then replaced first and the longer phrase can no longer match.
# NOTE(review): several entries are misspelled ('trombosis', 'capilaritis',
# 'capilaries', 'doubles countours') — presumably to match typos found in the
# reports; confirm before "correcting" them.
medical_words = {
    "nb_glomeruli" : ['number of glomeruli', 'glomeruli'],
    "glom_scler" : ['globally sclerotic'],
    "g": ['glomerulitis', 'marginated cells', 'acute allograft glomerulopathy'],
    "i-ifta": ['inflammatory cell infiltrate'],
    "i": ['tubulointerstitial', 'infiltrate', 'inflammatory', 'unscarred cortical inflammation', 'inflammatory cells'],
    "t": ['tubulitis'],
    "ti": ['total inflammation', 'total cortical inflammation scarred and unscarred'],
    "ci": ['interstitial fibrosis', 'tubulointerstitial fibrosis', 'interstitium'],
    "ct": ['tubular atrophy', 'atrophic tubules', 'tubules'],
    "cv": ['fibrous intimal thickening', 'fibrous intimal arteriosclerosis', 
           'intimal fibrosis', 'arterial sclerosis', 'atherosclerosis', 'thickened intima',
          'vessels', 'arteries', 'arterioles'],
    # 'endarteritis' is listed twice below; the duplicate has no effect on lookups.
    "v": ['endothelialitis', 'trombosis', 'intimal arteritis', 'endarteritis', 'endovasculitis',
         'endotheliitis', 'endarteritis', 'fibrinoid necrosis', 'transmural inflammation',
         'vessels', 'arteries', 'arterioles'],
    "ah": ['arteriolar hyalinosis', 'vessels', 'arteries', 'arterioles', 'arteriolar hyaline'],
    # NOTE(review): the phrase below ends with a space, so it only matches when the
    # text has a character after "increase" — confirm this is intended.
    "mm": ['mesangial matrix increase '],
    "ptc": ['peritubular capilaritis', 'peripheral capilaries','peritubular capillary', 
            'peritubular capillaries', 'peritubular capillaritis'],
    "cg": ['transplant glomerulopathy', 'glomerular basement membranes', 'doubles countours', 
           'double contour'],
    'c4d' : ['c4d']
    
}

# Lookup table: grade written as a string (key) -> wording mapped to that grade.
# Several phrases carry leading/trailing spaces so that they only match whole
# words (' no ', ' not '); others ('mild', '5%') match anywhere in the text.
# NOTE(review): '5%' is a substring of '95%' (grade "3"); a consumer replacing
# phrases one after another in dict order rewrites the '5%' inside '95%' first.
# NOTE(review): "-1"/"+1" presumably encode a negative/positive staining result
# rather than a lesion grade — confirm.
degrees = {
    "-1": [' negativity ', ' negative '],
    "+1": [' positivity ', ' positive '],
    "0": [' no ', ' not ', ' non-', 'minimal', '5%', 'without', 'none'],
    "1": ['mild', ' is noted', 'focal', '15-20%'],
    "2": ['moderate', '30%'],
    "3": ['severe', '50%', '95%', 'dense']
}

# Lookup table: digit string (key) -> spelled-out number (value).
# Every word is padded with one space on each side, so it only matches when it is
# surrounded by spaces in the text (not at the very start/end, nor next to
# punctuation).
numbers = {
    "0": [' zero '],
    "1": [' one '],
    "2": [' two '],
    "3": [' three '],
    "4": [' four '],
    "5": [' five '],
    "6": [' six '],
    "7": [' seven '],
    "8": [' eight '],
    "9": [' nine '],
    "10": [' ten '],
}

# Codes of `medical_words` in the space-padded form returned by get_banff_code()
# (the first two entries are NOT padded, and 'i-ifta' is absent).
# NOTE(review): this list is not referenced anywhere else in the visible notebook.
banff_list = ['nb_glomeruli', 'glom_scler', ' g ', ' i ', ' t ', 
              ' ti ', ' ci ', ' ct ', ' cv ', ' v ', ' ah ',
             ' mm ', ' ptc ', ' cg ', ' c4d ']

In [None]:
# Levenshtein distance
def LD(s, t):
    """Return the Levenshtein (edit) distance between the strings ``s`` and ``t``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``s`` into ``t``.

    The previous implementation was a plain three-way recursion, which takes
    exponential time and never finishes on phrase-sized inputs. This version
    fills the classic dynamic-programming table row by row, in
    O(len(s) * len(t)) time and O(min(len(s), len(t))) memory.
    """
    # The distance is symmetric; keep the shorter string on the column axis.
    if len(s) < len(t):
        s, t = t, s

    # previous[j] = distance between the processed prefix of s and t[:j].
    previous = list(range(len(t) + 1))
    for row, s_char in enumerate(s, start=1):
        current = [row]
        for col, t_char in enumerate(t, start=1):
            cost = 0 if s_char == t_char else 1
            current.append(min(previous[col] + 1,          # deletion
                               current[col - 1] + 1,       # insertion
                               previous[col - 1] + cost))  # substitution / match
        previous = current
    return previous[-1]

# Text cleaning
def replace(m_string):
    """Return ``m_string`` lower-cased, without ':', ';', '(', ')' or zero-width spaces."""
    cleaned = m_string
    for unwanted in (":", ";", ")", "(", "\u200b"):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.lower()

# Remove stopwords
def remove_stopwords(word_list):
    """Return the words of ``word_list`` that are not NLTK English stop words.

    Input:  list of words.
    Output: filtered list of words, in the original order.
    """
    # Load the stop-word list once (it used to be reloaded for every word) and
    # keep it in a set for constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in word_list if word not in english_stopwords]

# regex
def regex_transformation(regex, str):
    """Return, as floats, every number found in the parts of ``str`` matched by ``regex``.

    Parameters
    ----------
    regex : compiled pattern without capture groups (``findall`` must return
        plain strings, which are concatenated before the number search).
    str : text to search. The name shadows the built-in but is kept so that
        keyword callers keep working.

    Returns
    -------
    list of float, in order of appearance; empty when nothing matches.
    """
    matched_text = ''.join(regex.findall(str))
    # Raw string: '\ ' and '\.' are regex escapes, not valid Python string escapes.
    match_number = re.compile(r'-?\ *[0-9]+\.?[0-9]*(?:[Ee]\ *-?\ *[0-9]+)?')
    # The pattern tolerates spaces after the sign and inside the exponent
    # ('- 3', '1e - 2'); float() does not, so strip them before converting.
    return [float(token.replace(' ', '')) for token in match_number.findall(matched_text)]

# -*- coding: utf-8 -*-
# Building blocks of the rule-based sentence splitter below. Raw strings are used
# so that regex escapes such as \s are not read as (invalid) Python string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


# Split a text into a list of sentences
def split_into_sentences(text):
    """Split ``text`` into a list of stripped sentences.

    Periods that do not end a sentence (titles, initials, acronyms, web
    domains, company suffixes) are temporarily replaced by ``<prd>``; every
    remaining '.', '?' or '!' then marks a sentence end.

    Returns an empty list for empty or ``None`` input. A final fragment that
    has no terminating punctuation is kept as the last sentence (it used to
    be dropped silently).
    """
    if not text:
        return []

    text = " " + text + "  "
    text = text.replace("\n", " ")
    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" +
                  alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets +
                  "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    # Move terminators outside closing quotes so the quote stays with its sentence.
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")
    # Mark the real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = [s.strip() for s in text.split("<stop>")]
    # Only discard the trailing piece when it is empty (text ended with a
    # terminator); otherwise it is an unterminated last sentence worth keeping.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences

# Work around the list-valued cells in the DataFrame
def clean_table(appended_data):
    """Replace, in place, every cell of ``appended_data`` by its first element.

    Each column is expected to hold sequences (lists); a cell holding an empty
    sequence becomes NaN. The same DataFrame object is returned.
    """
    for column in appended_data.columns:
        appended_data[column] = appended_data[column].str.get(0)
    return appended_data

# Pull the values that are written explicitly in the text
def search_direct_values(str):
    """Extract the scores that a report states literally (e.g. " ci 2").

    The text is first normalised with ``replace()``; then, for every column,
    all numbers that directly follow the column's label are collected with
    ``regex_transformation()``.

    Parameters
    ----------
    str : raw report text. The name shadows the built-in but is kept so that
        keyword callers keep working.

    Returns
    -------
    pandas.DataFrame with a single row (index 0) whose cells are LISTS of
    floats (empty list when the label is absent); see ``clean_table()``.
    """
    # Raw string: '\ ' and '\.' are regex escapes, not valid Python string escapes.
    number = r'-?\ *[0-9]+\.?[0-9]*(?:[Ee]\ *-?\ *[0-9]+)?'

    # Output column -> label that must precede the number in the cleaned text.
    # The dict order is the column order of the returned frame.
    labels = {
        'glomerulis': 'number of glomeruli ',
        'g': ' g ',
        'i': ' i ',
        't': ' t ',
        'v': ' v ',
        'ah': ' ah ',
        'cg': ' cg ',
        'ci': ' ci ',
        'ct': ' ct ',
        'ti': ' ti ',
        'cv': ' cv ',
        'mm': ' mm ',
        'ptc': ' ptc ',
        'glom_scler': 'number globally sclerotic ',
    }

    text = replace(str)
    row = {column: regex_transformation(re.compile(label + number), text)
           for column, label in labels.items()}

    return pd.DataFrame([row], columns=list(labels))

def add_ifta(df):
    """Add an 'IFTA' column derived from the 'ci' and 'ct' columns, row by row.

    Grading rule (unchanged, first matching line wins):
      * (ci, ct) in {(0, 1), (1, 0), (1, 1)}  -> 1
      * ci == 2 or ct == 2                    -> 2
      * ci == 3 or ct == 3                    -> 3
      * anything else (including missing)     -> 0

    The previous version only looked at the first row and wrote its grade
    into every row of the table. ``df`` is modified in place and returned.

    NOTE(review): because the "2" rule is tested before the "3" rule, a row
    with ci == 3 and ct == 2 is graded 2 — confirm this is the intended rule.
    """
    ci = df['ci']
    ct = df['ct']

    grade = pd.Series(0, index=df.index)
    # Apply the rules from lowest to highest priority so that the
    # higher-priority rule overwrites the lower one.
    grade[(ci == 3) | (ct == 3)] = 3
    grade[(ci == 2) | (ct == 2)] = 2
    grade[((ci == 0) & (ct == 1)) | ((ci == 1) & (ct == 0)) | ((ci == 1) & (ct == 1))] = 1

    df['IFTA'] = grade.to_numpy()
    return df

def get_banff_code(dictOfElements, valueToFind):
    """Return the space-padded key of ``dictOfElements`` whose list contains ``valueToFind``.

    The dictionary passed by the caller is searched (the argument used to be
    ignored in favour of the global ``medical_words``). When several keys list
    the phrase, the first one in dict order wins. Returns ``None`` when the
    phrase is unknown.
    """
    for code, phrases in dictOfElements.items():
        if valueToFind in phrases:
            return ' ' + code + ' '
    return None

def get_degree(dictOfElements, valueToFind):
    """Return the space-padded key of ``dictOfElements`` whose list contains ``valueToFind``.

    The dictionary passed by the caller is searched (the argument used to be
    ignored in favour of the global ``degrees``). Returns ``None`` when the
    wording is unknown.
    """
    for grade, wordings in dictOfElements.items():
        if valueToFind in wordings:
            return ' ' + grade + ' '
    return None
        
def get_numbers(dictOfElements, valueToFind):
    """Return the space-padded key of ``dictOfElements`` whose list contains ``valueToFind``.

    The dictionary passed by the caller is searched (the argument used to be
    ignored in favour of the global ``numbers``). Returns ``None`` when the
    word is unknown.
    """
    for digits, spellings in dictOfElements.items():
        if valueToFind in spellings:
            return ' ' + digits + ' '
    return None

In [None]:
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
print(get_banff_code(medical_words, 'glomeruli'))
print("--- %s seconds ---" % (time.perf_counter() - start_time))

In [None]:
# Extract the literally written scores of every parsed PDF and stack the one-row
# frames under a fresh 0..n-1 index (one row per document).
per_document_frames = [search_direct_values(document['content']) for document in pdfs]
appended_data = pd.concat(per_document_frames, ignore_index=True)

In [None]:
appended_data = clean_table(appended_data)
d = add_ifta(appended_data)

In [None]:
# Saved into database CSV
file_name = 'banff_table.csv'
appended_data.to_csv(file_name, sep=',', encoding='utf-8')
appended_data.head()

In [None]:
split_into_sentences(pdfs[0]['content'])

# Extract text

In [None]:
# Collect every PDF of the test folder and parse each of them with Tika.
path = r'/Users/robinmasliah/Documents/BanffMining/files_to_test'
filenames = glob.glob(path + "/*.pdf")

# One parser result per file, in the same order as `filenames`.
pdfs = [parser.from_file(pdf_path) for pdf_path in filenames]

In [None]:
for idx, file in enumerate(filenames):
    print(idx, file)

In [None]:
# Flatten each lookup table into one list of phrases, keeping the dict order.
medical_flat_list = list(itertools.chain.from_iterable(medical_words.values()))
degrees_flat_list = list(itertools.chain.from_iterable(degrees.values()))
numbers_flat_list = list(itertools.chain.from_iterable(numbers.values()))

In [None]:
# Raw text of every parsed PDF, in the same order as `pdfs`.
liste_pdf = [document['content'] for document in pdfs]

In [None]:
# Text detection

start = 'LIGHT MICROSCOPY'
end = 'IMMUNOFLUORESCEN'

def loop_text(start, end, liste_pdf):
    """Normalise the section between ``start`` and ``end`` of every report.

    For each text of ``liste_pdf`` the section after the first ``start`` marker
    and before the next ``end`` marker is lower-cased, medical phrases are
    replaced by their codes, spelled-out numbers by digits and degree wordings
    by grades, and the result is split into sentences.

    Returns a list with one entry per input text (a list of sentences). Texts
    that are empty/``None`` or lack the ``start`` marker yield an empty list so
    that positions stay aligned with ``liste_pdf`` (they used to raise).
    """
    def _substitute(text, terms, to_code):
        # One regex pass with the longest terms first: a short term can no
        # longer pre-empt a longer one that contains it ('glomeruli' vs
        # 'glomerulitis'), and codes that were just inserted are never
        # re-scanned (' nb_glomeruli ' itself contains 'glomeruli').
        ordered = sorted(dict.fromkeys(terms), key=len, reverse=True)
        if not ordered:
            return text
        pattern = re.compile('|'.join(re.escape(term) for term in ordered))
        return pattern.sub(lambda match: to_code(match.group(0)), text)

    list_text = []
    for text in liste_pdf:
        sections = text.split(start) if text else []
        if len(sections) < 2:
            list_text.append([])
            continue

        text = sections[1].split(end)[0].lower().strip()
        # NOTE: the former .replace('[^\w\s]', '') was a no-op (str.replace does
        # not take regexes) and is dropped; really stripping punctuation would
        # remove the periods the sentence splitter relies on.
        text = text.replace('\n', ' ').replace('  ', ' ')

        text = _substitute(text, medical_flat_list,
                           lambda term: get_banff_code(medical_words, term))
        text = _substitute(text, numbers_flat_list,
                           lambda term: get_numbers(numbers, term))
        text = _substitute(text, degrees_flat_list,
                           lambda term: get_degree(degrees, term))
        list_text.append(split_into_sentences(text))
    return list_text

In [None]:
list_text = loop_text(start, end, liste_pdf)

# Recherche par mots

In [None]:
# Initialise the data set: fourteen score slots, all starting at zero
var0 = var1 = var2 = var3 = var4 = var5 = var6 = 0
var7 = var8 = var9 = var10 = var11 = var12 = var13 = 0

df = pd.DataFrame(columns=['glomerulis', 'g', 'i', 't', 'v',
                           'ah', 'cg', 'ci', 'ct', 'ti', 'cv',
                           'mm', 'ptc', 'glom_scler'])

# Slot -> column mapping of the first (all-zero) row.
y = dict(glomerulis=var0, g=var2, i=var3,
         t=var4, v=var5, ah=var6, cg=var7,
         ci=var8, ct=var9, ti=var10, cv=var11,
         mm=var12, ptc=var13, glom_scler=var1)

# Insert the row under a temporary label, then renumber the index from 0.
df.loc['y'] = y
df = df.reset_index(drop=True)

In [None]:
# words to remove = ['do', 'or', 'on', 'is', ':']

def rand_int_text(max_index=20):
    """Return a random integer between 0 and ``max_index``, both included.

    ``max_index`` defaults to the former hard-coded bound of 20; pass
    ``len(texts) - 1`` to draw a valid position in a list of texts.
    """
    return randint(0, max_index)

# Draw ONE index, bounded by the number of extracted texts, and print that same
# index (the printed number used to be a different draw from the one used, and
# the fixed 0..20 range could exceed the list).
sample_index = randint(0, len(list_text) - 1)
print(sample_index)
sentences = list_text[sample_index]

In [None]:
# Find glomeruli and glom_scler

# First sentence that mentions glomeruli ('' when no sentence does, so the cell
# no longer fails on an undefined or stale `phrase`).
phrase = next((sentence for sentence in sentences if 'glomeruli' in sentence), '')
nb_glomeruli = re.findall(r'\d+', phrase)
print(nb_glomeruli)

# The first number is taken as the glomeruli count and the second as the number
# of sclerotic glomeruli; a missing number falls back to 0 instead of raising
# IndexError (the old try/except ran only after the failing lookups).
nb_glomeruli_1 = nb_glomeruli[0] if len(nb_glomeruli) > 0 else 0
glom_scler = nb_glomeruli[1] if len(nb_glomeruli) > 1 else 0
print('nb_glomeruli : ', nb_glomeruli_1)
print('glom scler : ', glom_scler)

var0 = nb_glomeruli_1
var1 = glom_scler

In [None]:
# Find the score of every Banff code in the sampled report.
# This single cell replaces twelve copy-pasted "Find <code>" cells.
def find_banff_score(sentences, code, max_token_len, stop_words=None):
    """Return the score written next to ``code`` in the first sentence mentioning it.

    Parameters
    ----------
    sentences : list of str, normalised sentences of one report.
    code : lesion code to look for, e.g. 'g' or 'ptc'.
    max_token_len : words longer than this are discarded before matching, so
        that only short tokens (codes and digits) remain next to each other.
    stop_words : iterable of words to ignore; defaults to NLTK's English list.

    Returns
    -------
    The digits found as a string ("<code> <n>" first, "<n> <code>" as a
    fallback), or the integer 0 when no sentence mentions the code or no
    number sits next to it.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # NLTK's English stop-word list contains single letters such as 'i' and
    # 't', which are also codes: never filter out the code being searched for.
    ignored = set(stop_words) - {code}

    marker = ' ' + code + ' '
    sentence = next((s for s in sentences if marker in s), None)
    if sentence is None:
        # The old cells kept scanning the last sentence character by character
        # here, which could produce spurious matches.
        return 0

    kept = [word for word in sentence.split()
            if word not in ignored and len(word) <= max_token_len]
    phrase = ' '.join(kept)

    token = re.escape(code)
    # \b keeps 'g' from matching the tail of 'cg', 'i' the tail of 'ci', etc.
    found = (re.findall(r'\b' + token + r' (\d+)', phrase)
             or re.findall(r'(\d+) ' + token + r'\b', phrase))
    return found[0] if found else 0


# Longest token kept for each code (same limits as the former per-code cells).
banff_token_limits = {
    'g': 2, 'i': 1, 't': 1, 'cv': 2, 'v': 1, 'ah': 2,
    'cg': 2, 'ci': 2, 'ct': 2, 'ti': 2, 'mm': 2, 'ptc': 3,
}

english_stop_words = set(stopwords.words('english'))
banff_scores = {
    code: find_banff_score(sentences, code, limit, english_stop_words)
    for code, limit in banff_token_limits.items()
}
for code, score in banff_scores.items():
    print(code, '=', score)

# Slot assignment expected by the cell that builds the DataFrame row:
# g, i, t, v, ah, cg, ci, ct, ti, cv, mm, ptc -> var2 .. var13.
# (The old cells stored cv in var5, v in var6, ... so most scores ended up in
# the wrong column.)
var2 = banff_scores['g']
var3 = banff_scores['i']
var4 = banff_scores['t']
var5 = banff_scores['v']
var6 = banff_scores['ah']
var7 = banff_scores['cg']
var8 = banff_scores['ci']
var9 = banff_scores['ct']
var10 = banff_scores['ti']
var11 = banff_scores['cv']
var12 = banff_scores['mm']
var13 = banff_scores['ptc']

In [None]:
# Add row to Dataframe
# DataFrame.append was removed in pandas 2.0: build a one-row frame and
# concatenate it instead.
new_row = pd.DataFrame([{'glomerulis': var0, 'g': var2, 'i': var3,
                         't': var4, 'v': var5, 'ah': var6, 'cg': var7,
                         'ci': var8, 'ct': var9, 'ti': var10, 'cv': var11,
                         'mm': var12, 'ptc': var13, 'glom_scler': var1}])
df = pd.concat([df, new_row], ignore_index=True)

In [None]:
df

# Correction des fautes d'orthographe

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
proba = fuzz.ratio("tubulointerstitial fibrosis", "tublointstitial fibris")

In [None]:
proba

In [None]:
if(proba >= 90):
    print('Same word')
else:
    print('Different word')

# Word2Vec

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings.

    Each string is turned into a word-count vector by ``get_vectors``.
    """
    count_vectors = list(get_vectors(*strs))
    return cosine_similarity(count_vectors)
    
def get_vectors(*strs):
    """Return the word-count vectors of the given strings as a 2-D array.

    One row per string, one column per word of the vocabulary learnt from all
    the strings together.
    """
    documents = list(strs)
    # The corpus must not be passed to the constructor: its first parameter is
    # the `input` option, not the data to vectorise.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(documents).toarray()

In [None]:
# Pick a random report (index bounded by the number of extracted texts, the
# fixed 0..20 range could exceed the list), join its sentences and drop
# English stop words.
english_stop_words = set(stopwords.words('english'))  # loaded once, not once per word
text_1 = ' '.join(list_text[randint(0, len(list_text) - 1)])
text_1 = ' '.join(word for word in text_1.split() if word not in english_stop_words)

In [None]:
# Pick a second random report (index bounded by the number of extracted texts),
# join its sentences and drop English stop words.
english_stop_words = set(stopwords.words('english'))  # loaded once, not once per word
text_2 = ' '.join(list_text[randint(0, len(list_text) - 1)])
text_2 = ' '.join(word for word in text_2.split() if word not in english_stop_words)

In [None]:
text_1

In [None]:
text_2

In [None]:
# `text` is not defined at notebook level (NameError on a fresh kernel).
# NOTE(review): vectorising the two sampled reports is presumably what was meant — confirm.
vect_text = get_vectors(text_1, text_2)

In [None]:
vect_text

In [None]:
nltk.download('punkt')

In [None]:
#using NLTK library, we can do lot of text preprocesing
import nltk
from nltk.tokenize import word_tokenize
#function to split text into word
tokens = word_tokenize(pdfs[0]['content'])
nltk.download('stopwords')
print(tokens)

In [None]:
from nltk.corpus import stopwords
# Keep only the tokens that are not English stop words.
stop_words = set(stopwords.words('english'))
tokens = list(filter(lambda token: token not in stop_words, tokens))
print(tokens)

In [None]:
# NLTK offers several stemmers (Porter, Lancaster, Snowball); the Porter stemmer
# is applied to every token here.
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
stems = [porter.stem(token) for token in tokens]
print(stems)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Concatenate the 'review' column of `df` into one space-separated string.
# NOTE(review): no cell of the visible notebook creates a 'review' column in
# `df`, so this line looks like a leftover from another project and would raise
# AttributeError — confirm which text was meant to be analysed.
reviews = df.review.str.cat(sep=' ')
#function to split text into word
tokens = word_tokenize(reviews)
# Number of distinct tokens.
vocabulary = set(tokens)
print(len(vocabulary))
# Token -> number of occurrences; the last line displays the 50 most frequent tokens.
frequency_dist = nltk.FreqDist(tokens)
sorted(frequency_dist,key=frequency_dist.__getitem__, reverse=True)[0:50]

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
wordcloud = WordCloud().generate_from_frequencies(frequency_dist)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Collect every PDF of the INSERM test folder and parse each of them with Tika.
path = r'/home/robin/Documents/INSERM/files_to_test'
filenames = glob.glob(path + "/*.pdf")

# One parser result per file, in the same order as `filenames`.
pdfs = [parser.from_file(pdf_path) for pdf_path in filenames]

In [None]:
filenames

In [None]:
banff_true = pd.read_csv('/home/robin/Documents/INSERM/valeurs.csv', sep=',', delimiter=None, header='infer')

In [None]:
banff_true = banff_true.T

In [None]:
# Print every column of `banff_true` as its own one-column DataFrame.
# Iterating over a DataFrame yields its column labels; `df_vrai` is overwritten
# on each pass, so after the loop it only holds the LAST column.
# NOTE(review): the enumerate index `i` is unused, and the initial empty frame
# is discarded as soon as the loop runs — confirm whether the columns were meant
# to be accumulated instead.
df_vrai = pd.DataFrame()
for i, line in enumerate(banff_true):
    df_vrai = banff_true.loc[:, line]
    df_vrai = pd.DataFrame(df_vrai)
    print(df_vrai)

In [None]:
df_vrai = pd.DataFrame(df_vrai)

In [None]:
# Display the frame built above (the name was misspelled `df_vra`, a NameError).
df_vrai

In [None]:
banff_true = pd.read_csv('/home/robin/Documents/INSERM/csv_trueValue_mayo/newcsv.csv', sep=',', delimiter=None)

In [None]:
banff_true.index = banff_true['banff']

In [None]:
del banff_true['banff']

In [None]:
banff = banff_true.copy()
banff.head(15)

In [None]:
class_names = np.array(['0', '1', '2', '3', '4'])

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array of counts, rows = true labels, columns = predicted labels.
    classes : sequence of tick labels, one per row/column of ``cm``.
    normalize : when True, every row is divided by its total, so cells show
        the fraction of each true class. Rows whose total is 0 (a class with
        no true sample) are shown as zeros instead of NaN.
    title : title of the plot.
    cmap : matplotlib colormap used for the cells.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Safe division: leave 0 where a row total is 0 rather than producing
        # NaN, which would also break the text-colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write each cell's value, in white on dark cells and black on light ones.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# Compute confusion matrix
# Rows are the 'true' column of `banff`, columns the 'mined' column.
# NOTE(review): no `labels=` is passed, so the matrix only covers the values that
# actually occur in the data, while `class_names` always has five entries; the
# tick labels can be misaligned if some grade never occurs — confirm.
cnf_matrix = confusion_matrix(banff['true'], banff['mined'])
# Print numpy arrays with two decimals.
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Normalized confusion matrix')

plt.show()

In [None]:
df = banff['true']==banff['mined']

In [None]:
# Number of rows where the mined value equals the true value.
cpt = sum(1 for is_match in df if is_match == True)
cpt

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_sim(*strs):
    """Return the pairwise cosine-similarity matrix of the given strings.

    Each string is turned into a word-count vector by ``get_vectors``.
    """
    count_vectors = list(get_vectors(*strs))
    return cosine_similarity(count_vectors)
    
def get_vectors(*strs):
    """Return the word-count vectors of the given strings as a 2-D array.

    One row per string, one column per word of the vocabulary learnt from all
    the strings together.
    """
    documents = list(strs)
    # The corpus must not be passed to the constructor: its first parameter is
    # the `input` option, not the data to vectorise.
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(documents).toarray()

In [None]:
get_cosine_sim()

In [None]:
get_vectors(string)

In [None]:
list_text[3]

# Test scrap variable

In [None]:
liste_pdf[12]

In [None]:
# Text detection

start = 'LIGHT MICROSCOPY'
end = 'IMMUNOFLUORESCEN'

def loop_text(start, end, liste_pdf):
    """Normalise the section between ``start`` and ``end`` of every report.

    Variant WITHOUT the spelled-out-number stage: medical phrases are replaced
    by their codes and degree wordings by grades, then the text is split into
    sentences. Defining it here replaces any ``loop_text`` defined earlier in
    the kernel.

    Returns a list with one entry per input text (a list of sentences). Texts
    that are empty/``None`` or lack the ``start`` marker yield an empty list so
    that positions stay aligned with ``liste_pdf`` (they used to raise).
    """
    def _substitute(text, terms, to_code):
        # One regex pass with the longest terms first: a short term can no
        # longer pre-empt a longer one that contains it, and codes that were
        # just inserted are never re-scanned.
        ordered = sorted(dict.fromkeys(terms), key=len, reverse=True)
        if not ordered:
            return text
        pattern = re.compile('|'.join(re.escape(term) for term in ordered))
        return pattern.sub(lambda match: to_code(match.group(0)), text)

    list_text = []
    for text in liste_pdf:
        sections = text.split(start) if text else []
        if len(sections) < 2:
            list_text.append([])
            continue

        text = sections[1].split(end)[0].lower().strip()
        # NOTE: the former .replace('[^\w\s]', '') was a no-op (str.replace does
        # not take regexes) and is dropped; really stripping punctuation would
        # remove the periods the sentence splitter relies on.
        text = text.replace('\n', ' ').replace('  ', ' ')

        text = _substitute(text, medical_flat_list,
                           lambda term: get_banff_code(medical_words, term))
        text = _substitute(text, degrees_flat_list,
                           lambda term: get_degree(degrees, term))
        list_text.append(split_into_sentences(text))
    return list_text

In [None]:
loop_text(start, end, liste_pdf)