In [2]:
import os

from bs4 import BeautifulSoup
import requests
import json
import re
from time import sleep

In [58]:
# Root URL of the scikit-learn "stable" documentation. The relative hrefs
# collected from the user-guide index are appended to it when pages are fetched.
base_link = "https://scikit-learn.org/stable/"

In [59]:
# Download the user-guide index page. The timeout keeps a stalled connection
# from hanging the kernel indefinitely, and raise_for_status() stops here on an
# HTTP error instead of letting later cells parse an error page.
req = requests.get("https://scikit-learn.org/stable/user_guide.html#", timeout=30)
req.raise_for_status()

In [60]:
# Parse the downloaded index page. The parser is named explicitly so the result
# does not depend on which optional parsers happen to be installed, and so
# Beautiful Soup does not emit its "no parser was explicitly specified" warning.
soup = BeautifulSoup(req.content, "html.parser")

In [61]:
# Sanity-check the download by showing only the start of the document;
# printing the full page floods the notebook output and bloats the saved file.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="scikit-learn: machine learning in Python" name="Description"/>
  <title>
   User guide: contents — scikit-learn 0.23.2 documentation
  </title>
  <link href="http://scikit-learn.org/stable/user_guide.html" rel="canonical"/>
  <link href="_static/favicon.ico" rel="shortcut icon"/>
  <link href="_static/css/vendor/bootstrap.min.css" rel="stylesheet" type="text/css"/>
  <link href="_static/gallery.css" rel="stylesheet" type="text/css"/>
  <link href="_static/gallery-binder.css" rel="stylesheet" type="text/css"/>
  <link href="_static/gallery-dataframe.css" rel="stylesheet" type="text/css"/>
  <link href="_static/css/theme.css" rel="stylesheet" type="text/css"/>
  <script data-url_root="./" id="documentation_options

In [62]:
# Collect (anchor text, href) for every <a class="reference internal"> element
# of the parsed user-guide page, in document order.
link_list = [
    (anchor.text, anchor['href'])
    for anchor in soup.find_all('a', {'class': 'reference internal'})
]

In [63]:
# Chapter-level entries: link text that starts with "<number>. " (e.g. "1. ...").
# The dot is escaped so it matches a literal period rather than any character,
# and "+" allows chapter numbers with more than one digit. A set comprehension
# removes duplicate (text, href) pairs before sorting.
top_sections = sorted({x for x in link_list if re.match(r'[0-9]+\. ', x[0])})

In [64]:
# Second-level entries: link text that starts with "<chapter>.<section>. "
# (e.g. "1.10. Decision Trees"). A raw string is used so "\." reaches the regex
# engine as written instead of being an invalid Python string escape, and "+"
# allows chapter numbers with more than one digit. Duplicates are removed via a
# set; the sort is lexicographic on the (text, href) tuples.
second_sections = sorted({x for x in link_list if re.match(r'[0-9]+\.[0-9]{1,2}\. ', x[0])})

In [123]:
def produceID(title, package):
    """Build a lowercase, underscore-separated identifier from a section title.

    Parentheses, colons and commas are treated as separators; spaces and dots
    become underscores; every run of separators collapses to one underscore;
    leading and trailing underscores are stripped from the title part, which is
    then lowercased and prefixed with ``{package}_``.

    Parameters
    ----------
    title : str
        Human-readable section title, e.g. ``"1.1. Linear Models"``.
    package : str
        Prefix for the identifier. With an empty string the result still
        starts with an underscore (e.g. ``"_1_1_3_lasso"``).

    Returns
    -------
    str
        The identifier, e.g. ``"sklearn_1_1_linear_models"``.
    """
    # Punctuation that should not end up in an identifier becomes a separator.
    cleaned = re.sub("[():,]", " ", title)

    # Collapse each run of spaces, dots and underscores in one pass. A single
    # str.replace("__", "_") pass leaves "__" behind whenever three or more
    # separators are adjacent (e.g. "Foo (bar), baz").
    file_name = re.sub(r"[ ._]+", "_", cleaned).strip("_").lower()

    return f'{package}_{file_name}'
    

In [124]:
# One (page id, title, relative link) triple per second-level section; the
# page id is derived from the title with the 'sklearn' prefix.
sections = [
    (produceID(title, 'sklearn'), title, href)
    for title, href in second_sections
]

In [125]:
# Preview the first five (page id, title, relative link) triples.
sections[:5]

[('sklearn_1_1_linear_models',
  '1.1. Linear Models',
  'modules/linear_model.html'),
 ('sklearn_1_10_decision_trees', '1.10. Decision Trees', 'modules/tree.html'),
 ('sklearn_1_11_ensemble_methods',
  '1.11. Ensemble methods',
  'modules/ensemble.html'),
 ('sklearn_1_12_multiclass_and_multilabel_algorithms',
  '1.12. Multiclass and multilabel algorithms',
  'modules/multiclass.html'),
 ('sklearn_1_13_feature_selection',
  '1.13. Feature selection',
  'modules/feature_selection.html')]

In [9]:
# Read one saved page from disk and parse it.
# Note: this rebinds the notebook-level name `soup`.
with open('sklearn_data/1_1_Linear_Models.txt') as saved_page:
    html_str = saved_page.read()

soup = BeautifulSoup(html_str)

In [22]:
# Parse the first entry of `html_sections` for interactive inspection.
# NOTE(review): `html_sections` is not assigned at notebook level in any visible
# cell (it only appears as a local inside splitSections), so this cell looks
# like it relies on leftover kernel state — confirm or remove.
soup2 = BeautifulSoup(html_sections[0])

In [91]:
def getPage(url, timeout=30):
    """Download a page and return its first ``<div class="section">`` element.

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The first ``div`` element whose class is ``section``, or ``None`` when the
    page contains no such element.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    req = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error rather than parsing the error page.
    req.raise_for_status()

    # Name the parser explicitly so parsing does not depend on which optional
    # parsers are installed.
    soup = BeautifulSoup(req.content, "html.parser")
    content_text = soup.find('div', {'class': "section"})

    return content_text

In [130]:
def splitSections(page):
    """Split a page into chunks, one per ``<h2>`` heading.

    ``page`` may be an HTML string (parsed here) or an already-parsed object
    exposing ``prettify()``. The prettified markup is cut at every literal
    ``<h2>``; whatever precedes the first heading is discarded.

    Returns a list of ``(section_name, html)`` tuples, where ``html`` starts
    with ``<h2>`` and ``section_name`` is ``produceID`` applied (with an empty
    package) to the first line of the heading's text.
    """
    soup = BeautifulSoup(page) if isinstance(page, str) else page

    pieces = soup.prettify().split('<h2>')

    html_sections = []
    # pieces[0] is the content before the first heading, so start at 1.
    for tail in pieces[1:]:
        data_string = f'<h2>{tail}'

        # Re-parse the chunk to pull the heading text out of its markup.
        heading_text = BeautifulSoup(data_string).find("h2").text
        first_line = heading_text.strip().split('\n')[0]
        section_name = produceID(first_line, package='')

        html_sections.append((section_name, data_string))

    return html_sections

In [132]:

def processhtml(section):
    """Reduce an HTML section to plain text.

    ``section`` may be an HTML string (parsed here) or an already-parsed
    object exposing ``find_all`` and ``text``. Math spans, inline code
    literals and ``<pre>`` blocks are removed from the tree, then the
    remaining text has newline runs turned into a space, runs of spaces
    collapsed, and the characters ``=``, ``>>>`` and the pilcrow dropped.

    Returns the cleaned text as a string.
    """
    soup = BeautifulSoup(section) if isinstance(section, str) else section

    # Elements removed before text extraction, as (tag name, attribute filter).
    unwanted = (
        ("span", {'class': 'math notranslate nohighlight'}),
        ("code", {'class': 'docutils literal notranslate'}),
        ("pre", None),
    )
    for tag_name, attrs in unwanted:
        if attrs is None:
            matches = soup.find_all(tag_name)
        else:
            matches = soup.find_all(tag_name, attrs)
        for node in matches:
            node.decompose()

    # Whitespace normalisation happens before the symbol removal below.
    text = re.sub(r'[\n]{1,}', ' ', soup.text)
    text = re.sub(r'[\ ]{2,}', ' ', text)
    for leftover in ('=', '>>>', '¶'):
        text = text.replace(leftover, '')

    return text

In [76]:
# Run the text clean-up on the third entry of `html_sections` as a spot check.
# NOTE(review): `html_sections` is not assigned at notebook level in any visible
# cell (it only appears as a local inside splitSections), so this cell looks
# like it relies on leftover kernel state — confirm or remove.
test = processhtml(BeautifulSoup(html_sections[2]))

In [79]:
# Parse the third entry of `html_sections` for interactive inspection.
# NOTE(review): `html_sections` is not assigned at notebook level in any visible
# cell (it only appears as a local inside splitSections), so this cell looks
# like it relies on leftover kernel state — confirm or remove.
test_soup = BeautifulSoup(html_sections[2])

In [135]:
# Create the output directory once, up front; exist_ok makes re-running the
# cell safe.
os.makedirs('sklearn', exist_ok=True)

# For every second-level section: fetch its page, split it at the <h2>
# headings, strip each chunk down to text and write one file per chunk.
# NOTE(review): the recorded output shows each 7.x entry producing the same
# five sub-sections, which suggests those links point into one shared page —
# confirm and de-duplicate if so.
for pageid, title, link in sections:
    print(title)

    try:
        page = getPage(base_link + link)
        splits = splitSections(page)
        for section_name, section_data in splits:
            text = processhtml(section_data)
            print(pageid, section_name)
            # Explicit UTF-8 so non-ASCII characters in the documentation are
            # written the same way on every platform.
            with open(f'sklearn/{pageid}{section_name}.txt', 'w', encoding='utf-8') as f:
                f.write(f"{pageid}\n{title}\n{link}\n")
                f.write(str(text))
    except Exception as err:
        # Best-effort scrape: report the failure and move on to the next page.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop
        # the loop, and the cause is shown instead of being discarded.
        print(f"Error retrieving page {title}: {err!r}")

    # Pause between requests to avoid hammering the documentation server.
    sleep(2)
    

1.1. Linear Models
sklearn_1_1_linear_models _1_1_1_ordinary_least_squares
sklearn_1_1_linear_models _1_1_2_ridge_regression_and_classification
sklearn_1_1_linear_models _1_1_3_lasso
sklearn_1_1_linear_models _1_1_4_multi-task_lasso
sklearn_1_1_linear_models _1_1_5_elastic-net
sklearn_1_1_linear_models _1_1_6_multi-task_elastic-net
sklearn_1_1_linear_models _1_1_7_least_angle_regression
sklearn_1_1_linear_models _1_1_8_lars_lasso
sklearn_1_1_linear_models _1_1_9_orthogonal_matching_pursuit_omp
sklearn_1_1_linear_models _1_1_10_bayesian_regression
sklearn_1_1_linear_models _1_1_11_logistic_regression
sklearn_1_1_linear_models _1_1_12_generalized_linear_regression
sklearn_1_1_linear_models _1_1_13_stochastic_gradient_descent_-_sgd
sklearn_1_1_linear_models _1_1_14_perceptron
sklearn_1_1_linear_models _1_1_15_passive_aggressive_algorithms
sklearn_1_1_linear_models _1_1_16_robustness_regression_outliers_and_modeling_errors
sklearn_1_1_linear_models _1_1_17_polynomial_regression_extending_l

sklearn_2_3_clustering _2_3_10_clustering_performance_evaluation
2.4. Biclustering
sklearn_2_4_biclustering _2_4_1_spectral_co-clustering
sklearn_2_4_biclustering _2_4_2_spectral_biclustering
sklearn_2_4_biclustering _2_4_3_biclustering_evaluation
2.5. Decomposing signals in components (matrix factorization problems)
sklearn_2_5_decomposing_signals_in_components_matrix_factorization_problems _2_5_1_principal_component_analysis_pca
sklearn_2_5_decomposing_signals_in_components_matrix_factorization_problems _2_5_2_truncated_singular_value_decomposition_and_latent_semantic_analysis
sklearn_2_5_decomposing_signals_in_components_matrix_factorization_problems _2_5_3_dictionary_learning
sklearn_2_5_decomposing_signals_in_components_matrix_factorization_problems _2_5_4_factor_analysis
sklearn_2_5_decomposing_signals_in_components_matrix_factorization_problems _2_5_5_independent_component_analysis_ica
sklearn_2_5_decomposing_signals_in_components_matrix_factorization_problems _2_5_6_non-negativ

sklearn_7_1_general_dataset_api _7_5_loading_other_datasets
7.2. Toy datasets
sklearn_7_2_toy_datasets _7_1_general_dataset_api
sklearn_7_2_toy_datasets _7_2_toy_datasets
sklearn_7_2_toy_datasets _7_3_real_world_datasets
sklearn_7_2_toy_datasets _7_4_generated_datasets
sklearn_7_2_toy_datasets _7_5_loading_other_datasets
7.3. Real world datasets
sklearn_7_3_real_world_datasets _7_1_general_dataset_api
sklearn_7_3_real_world_datasets _7_2_toy_datasets
sklearn_7_3_real_world_datasets _7_3_real_world_datasets
sklearn_7_3_real_world_datasets _7_4_generated_datasets
sklearn_7_3_real_world_datasets _7_5_loading_other_datasets
7.4. Generated datasets
sklearn_7_4_generated_datasets _7_1_general_dataset_api
sklearn_7_4_generated_datasets _7_2_toy_datasets
sklearn_7_4_generated_datasets _7_3_real_world_datasets
sklearn_7_4_generated_datasets _7_4_generated_datasets
sklearn_7_4_generated_datasets _7_5_loading_other_datasets
7.5. Loading other datasets
sklearn_7_5_loading_other_datasets _7_1_gener

In [45]:
# Show the entries of the sklearn_data directory in sorted order.
sorted(os.listdir('sklearn_data'))

['.ipynb_checkpoints',
 '1_10_Decision_Trees.txt',
 '1_11_Ensemble_methods.txt',
 '1_12_Multiclass_and_multilabel_algorithms.txt',
 '1_13_Feature_selection.txt',
 '1_14_Semi-Supervised.txt',
 '1_15_Isotonic_regression.txt',
 '1_16_Probability_calibration.txt',
 '1_17_Neural_network_models_(supervised).txt',
 '1_1_Linear_Models.txt',
 '1_2_Linear_and_Quadratic_Discriminant_Analysis.txt',
 '1_3_Kernel_ridge_regression.txt',
 '1_4_Support_Vector_Machines.txt',
 '1_5_Stochastic_Gradient_Descent.txt',
 '1_6_Nearest_Neighbors.txt',
 '1_7_Gaussian_Processes.txt',
 '1_8_Cross_decomposition.txt',
 '1_9_Naive_Bayes.txt',
 '2_1_Gaussian_mixture_models.txt',
 '2_2_Manifold_learning.txt',
 '2_3_Clustering.txt',
 '2_4_Biclustering.txt',
 '2_5_Decomposing_signals_in_components_(matrix_factorization_problems).txt',
 '2_6_Covariance_estimation.txt',
 '2_7_Novelty_and_Outlier_Detection.txt',
 '2_8_Density_Estimation.txt',
 '2_9_Neural_network_models_(unsupervised).txt',
 '3_1_Cross-validation:_evaluatin