In [1]:
import pandas as pd
import re
import numpy as np
from tqdm import tnrange

In [2]:
import urllib
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup

# ATC (АТХ) code table

In [3]:
# Load the ATC code lookup table; ath_processing() reads its 'ID' and 'Name' columns.
ath_codes = pd.read_csv('Support data/ath_codes_upd2.csv', index_col=0)
ath_codes.head()

Unnamed: 0,ID,Name
0,A01,Стоматологические препараты
1,A01A,Стоматологические препараты
2,A01AA,Препараты для профилактики кариеса
3,A01AA01,Sodium fluoride
4,A01AA02,Sodium monofluorophosphate


# Code

In [46]:
def clear_name(x):
    """Reduce a raw drug name to the bare text used as a search query.

    The value is converted with ``str`` and then cleaned in three steps:

    1. every parenthesised fragment (up to 100 characters long) is removed;
    2. if the text contains digits, the dosage unit ``мг`` and all digits are
       removed;
    3. the punctuation characters ``. , : ; +`` are removed.

    Runs of whitespace left behind by the removals are collapsed to a single
    space and the result is stripped.

    Parameters
    ----------
    x : any
        Raw name; non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned name.
    """
    text = str(x)

    # Remove bracket groups one at a time (innermost first) so that text
    # *between* two separate groups survives; a single greedy match would
    # swallow everything from the first "(" to the last ")".
    bracket_group = r'\([^()]{,100}\)'
    while re.search(bracket_group, text):
        text = re.sub(bracket_group, '', text)

    if re.search(r'\d', text):
        # Drop the unit only when it is a token of its own ("40 мг", "40мг"),
        # never when the letters are part of a name such as "Амгевита".
        text = re.sub(r'(?:(?<=\d)|\b)мг\b', '', text)
        text = re.sub(r'\d', '', text)

    text = re.sub(r'[.,:;+]', '', text)
    return ' '.join(text.split())

In [5]:
def normalize_units(x):
    """Map a free-text dosage form onto the token used for matching.

    Parameters
    ----------
    x : any
        Dosage-form text; it is inspected via ``str(x)``.

    Returns
    -------
    ``''`` when the text mentions a spray (``прыск``) or an instillation
    (``инстил``), ``'таб'`` when it mentions tablets (``таб``), otherwise
    ``x`` itself, unchanged and not converted to ``str``.
    """
    text = str(x)
    # Spray / instillation is checked first: it already took precedence over
    # tablets before, and keeping that order leaves every other result as is.
    if 'прыск' in text or 'инстил' in text:
        return ''
    # Previously this result was always overwritten by the trailing ``else``.
    if 'таб' in text:
        return 'таб'
    return x

In [6]:
def make_request(in_dr):
    """Run a drug search on vidal.ru and return the parsed result page.

    Parameters
    ----------
    in_dr : any
        Search text; it is converted with ``str`` and lower-cased.

    Returns
    -------
    BeautifulSoup
        The search result page parsed with ``html.parser``.

    Errors raised by ``urllib.request.urlopen`` propagate to the caller.
    """
    drug = urllib.parse.quote(str(in_dr).lower())  # percent-encode the query for the URL
    link = 'https://www.vidal.ru/drugs?t=all&q=' + drug  # build the search link
    # Parse inside the ``with`` block so the HTTP response is closed afterwards.
    with urllib.request.urlopen(link) as html_page:
        soup = BeautifulSoup(html_page, "html.parser")
    return soup

In [7]:
def find_names(soup):
    """Collect the result cells of a search page and the drug name in each.

    Parameters
    ----------
    soup : parsed page offering ``find_all``.

    Returns
    -------
    tuple
        ``(suggested_drugs, drug_names)``: the ``td`` elements of class
        ``products-table-name`` and, for each of them, the name text pulled
        out of ``str(cell.a)``.

    Raises ``IndexError`` when a cell's anchor does not match the name
    pattern.
    """
    suggested_drugs = soup.find_all('td', "products-table-name")
    name_pattern = '\">\n\s{,100}\w{,100}<?s?u?p?>?\W{,1}<?/?s?u?p?>?\s{,1}\w{,20}\S{,1}\w{,3}'
    # Clean-up substitutions, applied strictly in this order.
    cleanup = (
        ('<sup>', ' '),
        ('</sup>', ' '),
        ('sup', ''),
        ('\n', ''),
        ('<', ''),
        ('\n', ''),
        ('\">', ''),
        ('  ', ' '),
    )
    drug_names = []
    for cell in suggested_drugs:
        fragment = re.findall(name_pattern, str(cell.a))[0]
        for old, new in cleanup:
            fragment = fragment.replace(old, new)
        drug_names.append(fragment.strip())
    return suggested_drugs, drug_names

In [64]:
def find_forms(soup):
    """Return the text of every ``div`` of class ``hyphenate`` on the page.

    For each element the span from the first ``>`` to the last ``<`` on the
    same line of ``str(element)`` is taken and every ``<`` and ``>`` is
    removed from it. Raises ``IndexError`` when an element has no such span.
    """
    return [
        re.findall('>.{,10000}<', str(node))[0].replace('>', '').replace('<', '')
        for node in soup.find_all('div', 'hyphenate')
    ]

In [9]:
def detect_needed(fdr, forms):
    """Return the indices of the forms that contain the requested form text.

    Both sides are compared after removing spaces and the characters
    ``. , : ;`` and lower-casing. The requested text is first passed through
    ``normalize_units``.

    Parameters
    ----------
    fdr : any
        Requested dosage form; ``''`` means "no preference".
    forms : sequence
        Form descriptions, e.g. as returned by ``find_forms``.

    Returns
    -------
    list of int
        Indices of matching forms, or ``[0]`` when ``fdr`` is empty or
        nothing matches.
    """
    if fdr == '':
        return [0]

    def _squash(text):
        # Shared normalisation for the requested form and each candidate.
        text = str(text)
        for ch in (' ', '.', ',', ':', ';'):
            text = text.replace(ch, '')
        return text.lower()

    wanted = _squash(normalize_units(fdr))
    # Plain substring test: the requested text is data, not a regex, so
    # characters such as "(" or "+" must be matched literally.
    need_ids = [i for i, form in enumerate(forms) if wanted in _squash(form)]
    return need_ids or [0]

In [25]:
def get_soup2(dose, suggested_drugs, forms):
    """Open the product page of the first search result matching ``dose``.

    Parameters
    ----------
    dose : any
        Requested form text; passed to ``detect_needed`` as ``str(dose)``.
    suggested_drugs : sequence
        Result cells; each must expose the product link as ``.a``.
    forms : sequence
        Form descriptions aligned with the search results.

    Returns
    -------
    BeautifulSoup or None
        The parsed product page, or ``None`` when no index was selected or
        the anchor carries no ``href``.
    """
    need_ids = detect_needed(str(dose), forms)
    if not need_ids:
        return None

    anchor_html = str(suggested_drugs[need_ids[0]].a)
    # Capture exactly the quoted attribute value; a "non-space run" capture
    # would also pick up the closing quote or adjacent anchor text.
    href = re.search(r'href="([^"]*)"', anchor_html)
    if href is None:
        return None

    up_url = urllib.parse.urljoin('https://www.vidal.ru/', href.group(1))
    # Parse inside the ``with`` block so the HTTP response is closed afterwards.
    with urllib.request.urlopen(up_url) as html_page2:
        soup2 = BeautifulSoup(html_page2, "html.parser")
    return soup2

In [26]:
def ath_actmol(soup2):
    """Extract the active molecule and the ATC code from a product page.

    Looks at every ``a`` element of class ``no-underline``: the first one
    whose markup contains ``molecule`` supplies the molecule name, the first
    one whose markup contains ``atc`` supplies the code.

    Parameters
    ----------
    soup2 : parsed page offering ``find_all``, or ``None``.

    Returns
    -------
    tuple of str
        ``(act_mol, ath_code)``; each is ``'nan'`` when it cannot be found.
    """
    if soup2 is None:
        return 'nan', 'nan'

    # The former third positional argument ('href') landed in find_all's
    # ``recursive`` slot and was merely truthy, so it is dropped.
    links = [str(link) for link in soup2.find_all('a', "no-underline")]
    active = [link for link in links if 'molecule' in link]
    atc = [link for link in links if 'atc' in link]

    act_mol = 'nan'
    if active:
        texts = re.findall(r'>.{,1000}<', active[0])
        if texts:
            act_mol = texts[0].replace('>', '').replace('<', '').strip().capitalize()

    # Same 'nan' fallback as for the molecule instead of an IndexError when
    # the page has no ATC link.
    ath_code = 'nan'
    if atc:
        codes = re.findall(r'[A-Z]\d{1,2}\w{0,2}\d{0,2}', atc[0])
        if codes:
            ath_code = codes[0]

    return act_mol, ath_code

In [27]:
def recommendation(need_ids, suggested_drugs):
    """Return the text of the ``div`` inside the first selected result cell.

    ``str(cell.div)`` is stripped of ``div``/``ul``/``li`` opening and
    closing tags, trimmed and capitalised. A missing ``div`` (``None``)
    therefore yields the string ``'None'``.
    """
    sug_dr = str(suggested_drugs[need_ids[0]].div)
    # Tags are removed one after another in this fixed order.
    for tag in ('<div>', '<li>', '<ul>', '</div>', '</li>', '</ul>'):
        sug_dr = sug_dr.replace(tag, '')
    return sug_dr.strip().capitalize()

In [81]:
def get_info(dn, fname):
    """Look a drug up on vidal.ru and collect its key attributes.

    Parameters
    ----------
    dn : any
        Raw drug name; cleaned with ``clear_name`` before searching.
    fname : str
        Requested dosage form, used to choose among the search results;
        ``''`` selects the first result.

    Returns
    -------
    tuple of str
        ``(name, form, act_mol, ath_code, recom)``. All five are ``'nan'``
        when the search returns no drugs.
    """
    dname = clear_name(dn)

    soup = make_request(dname)
    suggested_drugs, drug_names = find_names(soup)
    if not drug_names:
        return 'nan', 'nan', 'nan', 'nan', 'nan'

    forms = find_forms(soup)
    need_ids = detect_needed(fname, forms)
    # When names and result cells do not line up, fall back to the first
    # entry for the displayed name and form (as the former else-branch did).
    idx = need_ids[0] if len(suggested_drugs) == len(drug_names) else 0

    soup2 = get_soup2(fname, suggested_drugs, forms)
    name = drug_names[idx].capitalize()
    form = forms[idx].capitalize() if idx < len(forms) else 'nan'
    act_mol, ath_code = ath_actmol(soup2)
    recom = recommendation(need_ids, suggested_drugs)
    # Single return for both cases: the fallback path used to fall off the
    # end of the function and hand ``None`` to callers that unpack five values.
    return name, form, act_mol, ath_code, recom

In [82]:
def ath_processing(ath_code, codes=None):
    """Resolve an ATC code and its parent levels in the lookup table.

    Rows are selected whose ``ID`` equals the full code, its first three
    characters, or (for codes longer than three characters) its first five
    characters. The selected rows are read in table order: first row is
    level 2, second is level 4, third is level 5.

    Parameters
    ----------
    ath_code : any
        ATC code; converted with ``str``.
    codes : pandas.DataFrame, optional
        Lookup table with ``ID`` and ``Name`` columns. Defaults to the
        notebook-level ``ath_codes`` frame.

    Returns
    -------
    tuple
        ``(name5, code5, name4, code4, name2, code2)``. Levels that are not
        present are ``'nan'``; a code with no match at all (for example the
        ``'nan'`` placeholder) yields six ``'nan'`` values.
    """
    if codes is None:
        codes = ath_codes
    ath_code = str(ath_code)

    res0 = codes['ID'] == ath_code
    res1 = codes['ID'] == ath_code[:3]
    res2 = (codes['ID'] == ath_code[:5]) if len(ath_code) > 3 else res0

    mdf = codes[res0 | res1 | res2].reset_index(drop=True)
    n_rows = len(mdf)

    # Unknown code: nothing to read, so report every level as missing
    # instead of failing on the first-row lookup.
    if n_rows == 0:
        return 'nan', 'nan', 'nan', 'nan', 'nan', 'nan'

    name2, code2 = mdf.loc[0, 'Name'], mdf.loc[0, 'ID']
    name4 = code4 = name5 = code5 = 'nan'
    # Only exactly two or three selected rows are read as deeper levels;
    # any other count reports level 2 alone, as before.
    if n_rows in (2, 3):
        name4, code4 = mdf.loc[1, 'Name'], mdf.loc[1, 'ID']
    if n_rows == 3:
        name5, code5 = mdf.loc[2, 'Name'], mdf.loc[2, 'ID']

    return name5, code5, name4, code4, name2, code2

In [91]:
# Try get_info() on one drug name with an empty dosage-form argument.
name, form, act_mol, ath_code, recom = get_info('аква марис', '')

In [92]:
# Show the five values returned by get_info().
print(name, form, act_mol, ath_code, recom)

Аква марис Капли назальные д/детей: фл.-капельница 10 мл nan R01AX10 None


In [93]:
# Resolve the scraped ATC code into its level-5 / level-4 / level-2 names and codes.
n5, c5, n4, c4, n2, c2 = ath_processing(ath_code)

In [94]:
# Show the six values returned by ath_processing().
print(n5, c5, n4, c4, n2, c2)

Прочие препараты R01AX10 Прочие препараты для местного применения при заболеваниях носа R01AX Препараты для лечения заболеваний носа R01


In [95]:
# Assemble the scraped fields and the ATC levels into a one-row DataFrame.
# `row` is first a dict of single-element lists, then rebound to the frame.
row = {'Название': [name], 'Форма': [form], 'Действующее вещество': [act_mol], 'АТХ': [ath_code], 'Показание к применению': [recom], 
       'АТХ_5':[c5], 'АТХ_5_имя':[n5], 'АТХ_4':[c4], 'АТХ_4_имя':[n4], 'АТХ_2':[c2], 'АТХ_2_имя':[n2]} 
row = pd.DataFrame.from_dict(row, orient='columns')
row

Unnamed: 0,Название,Форма,Действующее вещество,АТХ,Показание к применению,АТХ_5,АТХ_5_имя,АТХ_4,АТХ_4_имя,АТХ_2,АТХ_2_имя
0,Аква марис,Капли назальные д/детей: фл.-капельница 10 мл,,R01AX10,,R01AX10,Прочие препараты,R01AX,Прочие препараты для местного применения при з...,R01,Препараты для лечения заболеваний носа


# По действующим веществам

In [99]:
def for_compounds(dname):
    """Search vidal.ru by active substance and open its drug listing page.

    The search page is scanned for ``a`` elements of class ``no-underline``
    whose markup contains ``/molecule-in/``; the first such link is followed.

    Parameters
    ----------
    dname : any
        Substance name; cleaned with ``clear_name`` before searching.

    Returns
    -------
    BeautifulSoup or None
        The parsed listing page, or ``None`` when the search page has no
        such link or the link carries no ``href``.
    """
    name = clear_name(dname)
    soupp = make_request(name)

    # The former third positional argument ('href') landed in find_all's
    # ``recursive`` slot and was merely truthy, so it is dropped.
    molecule_links = [
        str(anchor)
        for anchor in soupp.find_all('a', "no-underline")
        if '/molecule-in/' in str(anchor)
    ]
    if not molecule_links:
        return None

    # Capture exactly the quoted attribute value; a greedy ``.{,1000}`` runs
    # on to the last quote in the tag and drags other attributes along.
    href = re.search(r'href="([^"]*)"', molecule_links[0])
    if href is None:
        return None

    link = urllib.parse.urljoin('https://www.vidal.ru', href.group(1))  # build the link
    # Parse inside the ``with`` block so the HTTP response is closed afterwards.
    with urllib.request.urlopen(link) as html_page:
        soup3 = BeautifulSoup(html_page, "html.parser")
    return soup3

In [100]:
def active_compounds(soup3):
    """Collect the attributes of the first drug on a substance listing page.

    Parameters
    ----------
    soup3 : parsed listing page, or ``None``
        Typically the result of ``for_compounds``, which returns ``None``
        when no substance link was found.

    Returns
    -------
    tuple of str
        ``(name, form, act_mol, ath_code, recom)``. All five are ``'nan'``
        when ``soup3`` is ``None`` or the page lists no drugs.
    """
    # for_compounds() yields None when nothing was found; report "no data"
    # rather than failing on the first attribute access.
    if soup3 is None:
        return 'nan', 'nan', 'nan', 'nan', 'nan'

    suggested_drugs, drug_names = find_names(soup3)
    if not drug_names:
        return 'nan', 'nan', 'nan', 'nan', 'nan'

    forms = find_forms(soup3)
    fname = ''  # no dosage-form preference: detect_needed() then selects index 0
    need_ids = detect_needed(fname, forms)
    idx = need_ids[0]

    soup2 = get_soup2(fname, suggested_drugs, forms)
    name = drug_names[idx].capitalize()
    form = forms[idx].capitalize() if idx < len(forms) else 'nan'
    act_mol, ath_code = ath_actmol(soup2)
    recom = recommendation(need_ids, suggested_drugs)
    return name, form, act_mol, ath_code, recom

In [103]:
# Keep only the rows whose 'АТХ' value is not the 'nan' placeholder string.
# NOTE(review): `proc_df` is not defined in any cell visible here — confirm the
# cell that builds it still exists, otherwise this fails on Restart & Run All.
proc_df[proc_df['АТХ'] !='nan']

Unnamed: 0,Табличное_имя,Название,Форма,Действующее вещество,АТХ,Показание к применению,АТХ_5,АТХ_5_имя,АТХ_4,АТХ_4_имя,АТХ_2,АТХ_2_имя
0,моксонидин,Моксонидин,"Таб., покр. пленочной оболочкой, 0.2 мг: 20 ил...",Моксонидин,C02AC05,,C02AC05,Moxonidine,C02AC,Агонисты имидазолиновых рецепторов,C02,Антигипертензивные препараты
