# Scraping Rosalind problems


This script takes the URL of a Rosalind problem
(such as https://rosalind.info/problems/lcsq/)
and generates a Python script in the puzzles/ folder
containing a skeleton of the problem solution.

Use the lewagon environment (it has BeautifulSoup installed).


In [1]:

import requests
import os
from bs4 import BeautifulSoup
from scrape_rosalind_problem import scrape_rosalind_problem

In [2]:
# Common prefix of every Rosalind problem page.
main_url = "https://rosalind.info/problems/"

# Short problem identifiers to scrape, one page each.
urls = [
    "ksim",
    "sims",
    "loca",
    "glob",
    "edta",
    "edit",
]

# Build each full address lazily, echo it, then generate its template files.
for url in (main_url + u for u in urls):
    print(url)
    scrape_rosalind_problem(url)

https://rosalind.info/problems/ksim
https://rosalind.info/problems/sims
https://rosalind.info/problems/loca
https://rosalind.info/problems/glob
https://rosalind.info/problems/edta
https://rosalind.info/problems/edit


In [68]:
def scrape_rosalind_problem(url):
    """
    Scrape a Rosalind problem page and create template files for solving it.

    The following files are written (paths are relative to the current
    working directory):

    ../puzzles/<problem_title>.py - skeleton script to write your code into
    ../puzzles/sample_data/<problem_title>.txt - sample input from the
        description (fasta)

    :param url: url address of the puzzle to be scraped
    :raises requests.HTTPError: if the server answers with an error status
    """

    # The last non-empty url segment is the problem's short id (e.g. "lcsq");
    # it names the dataset file the generated script will read.
    problem_suffix = [s for s in url.split('/') if s][-1]
    dataset_name = os.path.join('..', 'data', 'rosalind_' + problem_suffix + '.txt')

    # Get the html of the page; fail early instead of parsing an error page.
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Problem title and file names. Only the first line of the <h1> text is
    # kept as the title.
    title = soup.find('h1').text.split('\r\n')[0].strip()
    # The title becomes part of a function name, so drop every character that
    # is not valid in a Python identifier (apostrophes, hyphens, ...).
    puzzle = ''.join(
        ch for ch in title.lower().replace(' ', '_')
        if ch.isalnum() or ch == '_'
    )
    file_name = puzzle + '.py'
    sample_dir = os.path.join('..', 'puzzles', 'sample_data')
    sample_data_path = os.path.join(sample_dir, puzzle + '.txt')

    # Description text
    ps = soup.find_all("p")
    topics = soup.find('p', class_='topics')
    topics_text = topics.text if topics is not None else ''

    # Sample data and answer: the first two highlighted code blocks on the page
    sample_data = soup.find_all('div', class_='codehilite')
    sample = sample_data[0].text
    answer = sample_data[1].text

    # Assemble the skeleton script. Paths and the expected answer are embedded
    # with repr() so the generated file contains valid, properly escaped
    # string literals (also for multi-line answers).
    script_parts = [
        'from rosalind.utils import read_multifasta\n\n',
        # function definition to solve the puzzle
        f'def solve_{puzzle}(fasta_path):\n',
        # docstring built from the problem description
        '\t"""\n',
        f'\t{title}\n\n',
        topics_text,
        *('\t' + p.text for p in ps[2:8]),
        '\n\t"""\n\n',
        '\t# Import sample sequences\n',
        '\tsequences = read_multifasta(fasta_path)\n\n',
        '\treturn\n',
        '\n\n\n',
        'if __name__=="__main__":\n',
        f'\tassert solve_{puzzle}({sample_data_path!r}) == {answer.strip()!r}\n',
        # keep the prints inside the __main__ guard so importing the
        # generated module has no side effects
        '\tprint("------rosalind problem------")\n',
        f'\tprint(solve_{puzzle}({dataset_name!r}))\n',
    ]

    # Make sure the output folders exist (this also creates ../puzzles).
    os.makedirs(sample_dir, exist_ok=True)

    # Write everything to a file
    with open(os.path.join('..', 'puzzles', file_name), 'w', encoding='utf-8') as f:
        f.writelines(script_parts)

    # Write a sample input file
    with open(sample_data_path, 'w', encoding='utf-8') as t:
        t.write(sample)
    
    
# NOTE(review): `url` is not defined in this cell; presumably it is the value
# left over from the `for url in ...` loop in an earlier cell -- confirm before running.
scrape_rosalind_problem(url)