<a href="https://colab.research.google.com/github/sugerdonut/my-first-repo/blob/master/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from time import sleep
import requests
from bs4 import BeautifulSoup
# List of search terms (dbSNP rs IDs) to look up.
# NOTE(review): 'no_contents' presumably exists to exercise the "not found" path — confirm.
SEARCH_LIST = ['rs3781264', 'rs2274223', 'rs2294008', 'no_contents']
# Constants used to build request URLs.

# Scheme and host of the NCBI site; hrefs scraped from result pages are appended to it.
BASE_URL = 'https://www.ncbi.nlm.nih.gov'
# Path of the SNP section, relative to BASE_URL.
SNP = '/snp/'
# Query-string template; '{}' is filled with the search term via str.format.
SEARCH_QUERY = "?term={}"

In [18]:
def get_soup(url: str, timeout: float = 10.0) -> BeautifulSoup:
    """
    Fetch a URL and return its body parsed as HTML.

    @param url: target URL e.g.) 'https://www.ncbi.nlm.nih.gov/snp/?term=rs3781264'
    @param timeout: seconds to wait for the server (connect and read) before
        giving up; without it a stalled connection would block forever
    @return: BeautifulSoup built from the response text with 'html.parser'
    @raise requests.exceptions.HTTPError: the server answered with a 4xx/5xx status
    @raise requests.exceptions.Timeout: the server did not respond within `timeout`
    """
    response = requests.get(url, timeout=timeout)
    # Turn 4xx/5xx responses into exceptions instead of parsing an error page.
    response.raise_for_status()
    # Decode using the encoding detected from the body to avoid garbled text
    # when the declared charset is missing or wrong.
    response.encoding = response.apparent_encoding

    return BeautifulSoup(response.text, 'html.parser')

In [19]:
def get_first_title(soup: BeautifulSoup) -> str:
    """
    Return the title text of the top search result.

    Looks at the first <div class="rprt">, takes its second <span>, and
    returns the text of the first <span> nested inside that one.
    Raises AttributeError or IndexError when any of those elements is
    missing (e.g. the search produced no results).
    """
    top_result = soup.find('div', class_='rprt')
    spans = top_result.find_all('span')
    title_span = spans[1].find('span')
    return title_span.text
def get_first_link(soup: BeautifulSoup) -> str:
    """
    Return the absolute URL of the top search result.

    Reads the href of the first <a> inside <p class="title"> within the
    first <div class="rprt"> and prefixes it with BASE_URL.
    """
    top_result = soup.find('div', class_='rprt')
    title_paragraph = top_result.find('p', class_='title')
    anchor = title_paragraph.find('a')
    relative_path = anchor.get('href')
    return f"{BASE_URL}{relative_path}"

In [20]:

def get_variation_type(soup: BeautifulSoup) -> str:
    """
    Return the text of the fourth <dd> in the first
    <dl class="usa-width-one-half">, with surrounding whitespace stripped
    and every newline and space removed.

    NOTE(review): assumes that <dd> holds the "Variation Type" value on the
    detail page — confirm against the current page layout.
    """
    summary_list = soup.find('dl', class_='usa-width-one-half')
    raw_text = summary_list.find_all('dd')[3].text
    # Single pass that deletes both '\n' and ' ' after trimming the ends.
    return raw_text.strip().translate(str.maketrans('', '', '\n '))

In [21]:

def get_gene_consequence(soup: BeautifulSoup) -> str:
    """
    Return the text of the second <dd> in the second
    <dl class="usa-width-one-half">, with surrounding whitespace stripped
    and every newline and space removed.

    NOTE(review): assumes that <dd> holds the "Gene : Consequence" value on
    the detail page — confirm against the current page layout.
    """
    summary_lists = soup.find_all('dl', class_='usa-width-one-half')
    cell = summary_lists[1].find_all('dd')[1]
    cleaned = cell.text.strip()
    for unwanted in ('\n', ' '):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

In [22]:
def main():
    """
    Look up every term in SEARCH_LIST on the SNP search page and print the
    variation type and gene consequence taken from the top result's detail page.

    Terms whose search page has no parsable first result are reported as
    not existing and skipped. Network and HTTP errors from get_soup are not
    caught here and abort the run.
    """
    for target in SEARCH_LIST:
        print(target)
        # Build the search URL from the shared constants rather than a
        # duplicated '/snp/' literal.
        search_url = f'{BASE_URL}{SNP}{SEARCH_QUERY.format(target)}'
        soup = get_soup(search_url)
        # Pause between requests; presumably to avoid hammering the server.
        sleep(1)
        try:
            # Used purely as an existence probe; the title itself is discarded.
            get_first_title(soup)
        except (AttributeError, IndexError):
            # A missing element surfaces as one of these lookup errors; any
            # other exception is a real bug and should not be reported as
            # "not found".
            print(f'{target}は存在しません')
            continue
        detail_url = get_first_link(soup)
        detail_page_soup = get_soup(detail_url)
        variation_type = get_variation_type(detail_page_soup)
        print(f'Variation Type => {variation_type}')
        gene_consequence = get_gene_consequence(detail_page_soup)
        print(f'Gene : Consequence => {gene_consequence}')
        print('====================================================')

In [23]:
# Run the lookup only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

rs3781264
Variation Type => SNVSingleNucleotideVariation
Gene : Consequence => PLCE1:IntronVariant
rs2274223
Variation Type => SNVSingleNucleotideVariation
Gene : Consequence => PLCE1:MissenseVariant
rs2294008
Variation Type => SNVSingleNucleotideVariation
Gene : Consequence => PSCA:IntronVariant
no_contents
no_contentsは存在しません
