In [1]:
import os

from bs4 import BeautifulSoup
import requests
import json
import re
from time import sleep

In [32]:
def produceID(title, package):
    s = re.sub("[():,]", " ", title)
    s = "_".join([
        x for x in s.split(" ")
    ])
    
    file_name = (
        s
        .replace(".","_")
        .replace(" ", "_")
        .replace("__","_")
        .replace("-","")
        .strip("_")
        .lower()
    )
    
    
    
    full_name = f'{package}_{file_name}'
    
    return full_name
    

# NumPy

In [201]:
quickstart = 'https://numpy.org/doc/stable/user/quickstart.html'

In [202]:
req = requests.get(quickstart)

In [203]:
soup = BeautifulSoup(req.content)

In [209]:
# soup.find("div", {'class':"sphinxsidebarwrapper"})

In [211]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   Quickstart tutorial — NumPy v1.19 Manual
  </title>
  <link href="../_static/css/spc-bootstrap.css" rel="stylesheet" type="text/css"/>
  <link href="../_static/css/spc-extend.css" rel="stylesheet" type="text/css"/>
  <link href="../_static/scipy.css" rel="stylesheet" type="text/css"/>
  <link href="../_static/pygments.css" rel="stylesheet" type="text/css"/>
  <link href="../_static/graphviz.css" rel="stylesheet" type="text/css"/>
  <script type="text/javascript">
   var DOCUMENTATION_OPTIONS = {
        URL_ROOT:    '../',
        VERSION:     '1.19.0',
        COLLAPSE_INDEX: false,
        FILE_SUFFIX: '.html',
        HAS_SOURCE:  false
      };
  </script>
  <script src="../_static/jquery.js" type="text/javascript">
  </script>
  <script src="../_static/underscore.js" type="text/javascript">
  </script>
  <script src="../_static/doctools.js" type="text/javascript">
  </script>
  <script src="../_static/language_d

In [233]:
test = splitSections(soup, '<h3>')[6][1]

In [243]:
produceID(BeautifulSoup(test).find("h3").text, 'numpy')

'numpy_\n_____indexing_slicing_and_iterating\n_____\n______¶\n_____\n'

In [212]:
def getPage(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content)
    content_text = soup.find('div', {'class':"body"})
    
    return content_text

In [245]:
def splitSections(page, header):

    if isinstance(page, str):
        soup = BeautifulSoup(page)
    else:
        soup = page
        
    html_sections = []
    for idx, data in enumerate(soup.prettify().split(header)):
        if idx == 0:
            continue

        data_string = f'{header}{data}'
        section_name = (
            produceID(
                BeautifulSoup(data_string)
                .find(header.strip("<>"))
                .text
                .strip()
                .split('\n')[0],
            package='numpy'
            )
        )
        
        html_sections.append((section_name, data_string))
        
    return html_sections

In [246]:

def processhtml(section):
    
    if isinstance(section, str):
        soup = BeautifulSoup(section)
    else:
        soup = section
    
    for div in soup.find_all("span", {'class':'math notranslate nohighlight'}): 
        div.decompose()
        
    for div in soup.find_all("code", {'class':'docutils literal notranslate'}): 
        div.decompose()
        
    for div in soup.find_all("pre"): 
        div.decompose()
        
    rem_newlines = re.sub(r'[\n]{1,}', ' ', soup.text)
    x = re.sub(r'[\ ]{2,}', ' ', rem_newlines)
    x = x.replace('=', '')
    x = x.replace('>>>', '')
    x = x.replace('¶', '')

    return x

In [248]:

if not os.path.exists('numpy'):
    os.mkdir('numpy')


page = getPage(quickstart)
splits = splitSections(page, '<h3>')

for section_name, section_data in splits:
    link = BeautifulSoup(section_data).find("h3").find('a')['href']
    pageid = section_name
    text = processhtml(section_data)
    print(pageid, section_name)
    with open(f'numpy/{pageid}.txt', 'w') as f:
        f.write(f"{pageid}\n{title}\n{link}\n") 
        f.write(str(text))



numpy_an_example numpy_an_example
numpy_array_creation numpy_array_creation
numpy_printing_arrays numpy_printing_arrays
numpy_basic_operations numpy_basic_operations
numpy_universal_functions numpy_universal_functions
numpy_indexing_slicing_and_iterating numpy_indexing_slicing_and_iterating
numpy_changing_the_shape_of_an_array numpy_changing_the_shape_of_an_array
numpy_stacking_together_different_arrays numpy_stacking_together_different_arrays
numpy_splitting_one_array_into_several_smaller_ones numpy_splitting_one_array_into_several_smaller_ones
numpy_no_copy_at_all numpy_no_copy_at_all
numpy_view_or_shallow_copy numpy_view_or_shallow_copy
numpy_deep_copy numpy_deep_copy
numpy_functions_and_methods_overview numpy_functions_and_methods_overview
numpy_broadcasting_rules numpy_broadcasting_rules
numpy_indexing_with_arrays_of_indices numpy_indexing_with_arrays_of_indices
numpy_indexing_with_boolean_arrays numpy_indexing_with_boolean_arrays
numpy_the_ix__function numpy_the_ix__function
nump


# SciPy

In [173]:
base_scipy = "https://docs.scipy.org/doc/scipy/reference/tutorial/"

In [174]:
req = requests.get(base_scipy + "index.html")

In [175]:
soup = BeautifulSoup(req.content)

In [179]:
scipy_sections = []

for li in soup.find_all('li', {'class': 'toctree-l1'}):

    children = li.findChildren("a" , recursive=False)
    for child in children:
        if "#" in child['href']:
            continue
        print(child['href'])
        print(child.text)
        id_ = produceID(child.text, 'scipy')
        scipy_sections.append((id_, child.text, child['href']))
    

general.html
Introduction
basic.html
Basic functions
special.html
Special functions (scipy.special)
integrate.html
Integration (scipy.integrate)
optimize.html
Optimization (scipy.optimize)
interpolate.html
Interpolation (scipy.interpolate)
fft.html
Fourier Transforms (scipy.fft)
signal.html
Signal Processing (scipy.signal)
linalg.html
Linear Algebra (scipy.linalg)
arpack.html
Sparse eigenvalue problems with ARPACK
csgraph.html
Compressed Sparse Graph Routines (scipy.sparse.csgraph)
spatial.html
Spatial data structures and algorithms (scipy.spatial)
stats.html
Statistics (scipy.stats)
ndimage.html
Multidimensional image processing (scipy.ndimage)
io.html
File IO (scipy.io)


In [181]:
scipy_sections[:5]

[('scipy_introduction', 'Introduction', 'general.html'),
 ('scipy_basic_functions', 'Basic functions', 'basic.html'),
 ('scipy_special_functions_scipy_special',
  'Special functions (scipy.special)',
  'special.html'),
 ('scipy_integration_scipy_integrate',
  'Integration (scipy.integrate)',
  'integrate.html'),
 ('scipy_optimization_scipy_optimize',
  'Optimization (scipy.optimize)',
  'optimize.html')]

In [183]:
req = requests.get(base_scipy + scipy_sections[5][2])
soup = BeautifulSoup(req.content)

In [186]:
print(base_scipy + scipy_sections[5][2])

https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html


In [188]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   Interpolation (scipy.interpolate) — SciPy v1.5.2 Reference Guide
  </title>
  <link href="../_static/css/spc-bootstrap.css" rel="stylesheet" type="text/css"/>
  <link href="../_static/css/spc-extend.css" rel="stylesheet" type="text/css"/>
  <link href="../_static/scipy.css" rel="stylesheet" type="text/css"/>
  <link href="../_static/pygments.css" rel="stylesheet" type="text/css"/>
  <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js" type="text/javascript">
  </script>
  <script src="../_static/jquery.js" type="text/javascript">
  </script>
  <script src="../_static/underscore.js" type="text/javascript">
  </script>
  <script src="../_static/doctools.js" type="text/javascript">
  </script>
  <script src="../_static/language_data.js" type="text/javascript">
  </script>
  <script src="../_static/versioncheck.js" type="text/javascript">
  </script>
  <script src="../_static/s

In [192]:
def getPage(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content)
    content_text = soup.find('div', {'class':"body"})
    
    return content_text

In [193]:
def splitSections(page, header):

    if isinstance(page, str):
        soup = BeautifulSoup(page)
    else:
        soup = page
        
    html_sections = []
    for idx, data in enumerate(soup.prettify().split(header)):
        if idx == 0:
            continue

        data_string = f'{header}{data}'
        section_name = (
            re.sub( "_+", "_",produceID(
                '_'.join(BeautifulSoup(data_string)
                .find(header.strip("<>"))
                .text
                .strip()
                .split('\n')[:3]),
            package=''
            ))
        )
        
        html_sections.append((section_name, data_string))
        
    return html_sections

In [194]:

def processhtml(section):
    
    if isinstance(section, str):
        soup = BeautifulSoup(section)
    else:
        soup = section
    
    for div in soup.find_all("span", {'class':'math notranslate nohighlight'}): 
        div.decompose()
        
    for div in soup.find_all("code", {'class':'docutils literal notranslate'}): 
        div.decompose()
        
    for div in soup.find_all("pre"): 
        div.decompose()
        
    rem_newlines = re.sub(r'[\n]{1,}', ' ', soup.text)
    x = re.sub(r'[\ ]{2,}', ' ', rem_newlines)
    x = x.replace('=', '')
    x = x.replace('>>>', '')
    x = x.replace('¶', '')

    return x

In [200]:
for pageid, title, link in scipy_sections:
    print(title, link)
    
    if not os.path.exists('scipy'):
        os.mkdir('scipy')
    
    try:
        page = getPage(base_scipy + link)
        splits = splitSections(page, '<h2>')
        if not splits:
            splits = splitSections(page, '<h3>')

        for section_name, section_data in splits:
            text = processhtml(section_data)
            print(pageid, section_name)
            with open(f'scipy/{pageid}{section_name}.txt', 'w') as f:
                f.write(f"{pageid}\n{title}\n{link}\n") 
                f.write(str(text))
    except:
        print(f"Error retrieving page {title}")
        
    sleep(2)
    

Introduction general.html
scipy_introduction _scipy_organization
scipy_introduction _finding_documentation
Basic functions basic.html
scipy_basic_functions _interaction_with_numpy
Special functions (scipy.special) special.html
scipy_special_functions_scipy_special _bessel_functions_of_real_order
scipy_special_functions_scipy_special _cython_bindings_for_special_functions
scipy_special_functions_scipy_special _functions_not_in
Integration (scipy.integrate) integrate.html
scipy_integration_scipy_integrate _general_integration
scipy_integration_scipy_integrate _general_multiple_integration
scipy_integration_scipy_integrate _gaussian_quadrature_¶
scipy_integration_scipy_integrate _romberg_integration_¶
scipy_integration_scipy_integrate _integrating_using_samples_¶
scipy_integration_scipy_integrate _faster_integration_using_lowlevel_callback_functions_¶
scipy_integration_scipy_integrate _ordinary_differential_equations
Optimization (scipy.optimize) optimize.html
scipy_optimization_scipy_opt

# SymPy

# PyTorch

In [None]:
pytorch_case = ''

# Fast.ai

# Tensorflow

# Caret

In [3]:
caret_base = "https://topepo.github.io/caret/"

In [4]:
req = requests.get(caret_base)

In [12]:
soup = BeautifulSoup(req.content)

In [132]:
caret_sections = []

for li in soup.find_all('li', {'class': 'chapter'}):

    children = li.findChildren("a" , recursive=False)
    for child in children:
        if "#" in child['href']:
            continue
        print(child['href'])
        print(child.text)
        id_ = produceID(child.text, 'caret')
        caret_sections.append((id_, child.text, child['href']))
    

index.html
1 Introduction
visualizations.html
2 Visualizations
pre-processing.html
3 Pre-Processing
data-splitting.html
4 Data Splitting
model-training-and-tuning.html
5 Model Training and Tuning
available-models.html
6 Available Models
train-models-by-tag.html
7 train Models By Tag
models-clustered-by-tag-similarity.html
8 Models Clustered by Tag Similarity
parallel-processing.html
9 Parallel Processing
random-hyperparameter-search.html
10 Random Hyperparameter Search
subsampling-for-class-imbalances.html
11 Subsampling For Class Imbalances
using-recipes-with-train.html
12 Using Recipes with train
using-your-own-model-in-train.html
13 Using Your Own Model in train
adaptive-resampling.html
14 Adaptive Resampling
variable-importance.html
15 Variable Importance
miscellaneous-model-functions.html
16 Miscellaneous Model Functions
measuring-performance.html
17 Measuring Performance
feature-selection-overview.html
18 Feature Selection Overview
feature-selection-using-univariate-filters.html


In [159]:
test = getPage(caret_base + caret_sections[3][-1])

In [164]:
sections = splitSections(test, '<h2>')

In [165]:
processed = []
for name, text in sections:
    processed.append(processhtml(text))

In [166]:
processed

[' 4.1 Simple Splitting Based on the Outcome The function createDataPartition can be used to create balanced splits of the data. If the y argument to this function is a factor, the random sampling occurs within each class and should preserve the overall class distribution of the data. For example, to create a single 80/20% split of the iris data: The list  FALSE avoids returning the data as a list. This function also has an argument, times , that can create multiple splits at once; the data indices are returned in a list of integer vectors. Similarly, createResample can be used to make simple bootstrap samples and createFolds can be used to generate balanced cross–validation groupings from a set of data. ',
 ' 4.2 Splitting Based on the Predictors Also, the function maxDissim can be used to create sub–samples using a maximum dissimilarity approach ( Willett, 1999 ). Suppose there is a data set A with m samples and a larger data set B with n samples. We may want to create a sub–sample f

In [168]:
def getPage(url):
    req = requests.get(url)
    soup = BeautifulSoup(req.content)
    content_text = soup.find('div', {'role':"main"})
    
    return content_text

In [170]:
def splitSections(page, header):

    if isinstance(page, str):
        soup = BeautifulSoup(page)
    else:
        soup = page
        
    html_sections = []
    for idx, data in enumerate(soup.prettify().split(header)):
        if idx == 0:
            continue

        data_string = f'{header}{data}'
        section_name = (
            re.sub( "_+", "_",produceID(
                '_'.join(BeautifulSoup(data_string)
                .find(header.strip("<>"))
                .text
                .strip()
                .split('\n')[:3]),
            package=''
            ))
        )
        
        html_sections.append((section_name, data_string))
        
    return html_sections

In [171]:

def processhtml(section):
    
    if isinstance(section, str):
        soup = BeautifulSoup(section)
    else:
        soup = section
    
    for div in soup.find_all("span", {'class':'math notranslate nohighlight'}): 
        div.decompose()
        
    for div in soup.find_all("code", {'class':'docutils literal notranslate'}): 
        div.decompose()
        
    for div in soup.find_all("pre"): 
        div.decompose()
        
    rem_newlines = re.sub(r'[\n]{1,}', ' ', soup.text)
    x = re.sub(r'[\ ]{2,}', ' ', rem_newlines)
    x = x.replace('=', '')
    x = x.replace('>>>', '')
    x = x.replace('¶', '')

    return x

In [172]:
for pageid, title, link in caret_sections:
    print(title, link)
    
    if not os.path.exists('caret'):
        os.mkdir('caret')
    
    try:
        page = getPage(caret_base + link)
        splits = splitSections(page, '<h2>')
        if not splits:
            splits = splitSections(page, '<h3>')

        for section_name, section_data in splits:
            text = processhtml(section_data)
            print(pageid, section_name)
            with open(f'caret/{pageid}{section_name}.txt', 'w') as f:
                f.write(f"{pageid}\n{title}\n{link}\n") 
                f.write(str(text))
    except:
        print(f"Error retrieving page {title}")
        
    sleep(2)
    

1 Introduction index.html
2 Visualizations visualizations.html
3 Pre-Processing pre-processing.html
caret_3_preprocessing _3_1_creating_dummy_variables
caret_3_preprocessing _3_2_zero_and_near_zerovariance_predictors
caret_3_preprocessing _3_3_identifying_correlated_predictors
caret_3_preprocessing _3_4_linear_dependencies
caret_3_preprocessing _3_5_the
caret_3_preprocessing _3_6_centering_and_scaling
caret_3_preprocessing _3_7_imputation
caret_3_preprocessing _3_8_transforming_predictors
caret_3_preprocessing _3_9_putting_it_all_together
caret_3_preprocessing _3_10_class_distance_calculations
4 Data Splitting data-splitting.html
caret_4_data_splitting _4_1_simple_splitting_based_on_the_outcome
caret_4_data_splitting _4_2_splitting_based_on_the_predictors
caret_4_data_splitting _4_3_data_splitting_for_time_series
caret_4_data_splitting _4_4_simple_splitting_with_important_groups
5 Model Training and Tuning model-training-and-tuning.html
caret_5_model_training_and_tuning _5_1_model_trai

21 Feature Selection using Genetic Algorithms feature-selection-using-genetic-algorithms.html
caret_21_feature_selection_using_genetic_algorithms _21_1_genetic_algorithms
caret_21_feature_selection_using_genetic_algorithms _21_2_internal_and_external_performance_estimates
caret_21_feature_selection_using_genetic_algorithms _21_3_basic_syntax
caret_21_feature_selection_using_genetic_algorithms _21_4_genetic_algorithm_example
caret_21_feature_selection_using_genetic_algorithms _21_5_customizing_the_search
caret_21_feature_selection_using_genetic_algorithms _21_6_the_example_revisited
caret_21_feature_selection_using_genetic_algorithms _21_7_using_recipes
22 Feature Selection using Simulated Annealing feature-selection-using-simulated-annealing.html
caret_22_feature_selection_using_simulated_annealing _22_1_simulated_annealing
caret_22_feature_selection_using_simulated_annealing _22_2_internal_and_external_performance_estimates
caret_22_feature_selection_using_simulated_annealing _22_3_ba