In [13]:
import os

from bs4 import BeautifulSoup
import requests
import json
import re
from time import sleep

In [2]:
# Root URL of the scikit-learn "stable" documentation. The relative hrefs
# scraped from the user guide are appended to it when pages are downloaded.
base_link = "https://scikit-learn.org/stable/"

In [3]:
# Download the user-guide table of contents. The timeout keeps the cell from
# hanging indefinitely on a stalled connection, and raise_for_status() reports
# an HTTP error here instead of letting an error page be parsed further down.
req = requests.get("https://scikit-learn.org/stable/user_guide.html#", timeout=30)
req.raise_for_status()

In [4]:
# Parse the downloaded user guide. The parser is named explicitly so the
# resulting tree does not depend on which optional parsers happen to be
# installed, and so bs4 does not warn about having to guess one.
soup = BeautifulSoup(req.content, "html.parser")

In [48]:
# Preview only the beginning of the parsed document; printing the whole page
# floods the notebook output and bloats the saved file.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="scikit-learn: machine learning in Python" name="Description"/>
  <title>
   User guide: contents — scikit-learn 0.23.1 documentation
  </title>
  <link href="http://scikit-learn.org/stable/user_guide.html" rel="canonical"/>
  <link href="_static/favicon.ico" rel="shortcut icon"/>
  <link href="_static/css/vendor/bootstrap.min.css" rel="stylesheet" type="text/css"/>
  <link href="_static/gallery.css" rel="stylesheet" type="text/css"/>
  <link href="_static/gallery-binder.css" rel="stylesheet" type="text/css"/>
  <link href="_static/gallery-dataframe.css" rel="stylesheet" type="text/css"/>
  <link href="_static/css/theme.css" rel="stylesheet" type="text/css"/>
  <script data-url_root="./" id="documentation_options

In [5]:
# Collect a (link text, href) pair for every internal-reference anchor on the
# user-guide page.
link_list = [
    (anchor.text, anchor['href'])
    for anchor in soup.find_all('a', {'class': 'reference internal'})
]

In [6]:
# Keep the links whose text starts with "<number>. " (chapter-level entries),
# de-duplicated and sorted. The dot is escaped so it only matches a literal
# '.', and '+' lets chapter numbers have more than one digit.
top_sections = sorted({
    entry for entry in link_list
    if re.match(r'[0-9]+\.[ ]', entry[0])
})

In [7]:
# Keep the links titled "<chapter>.<section>. <name>" (e.g. "1.1. Linear Models"),
# de-duplicated and sorted. The pattern is a raw string so that "\." reaches
# the regex engine as written instead of being an invalid string escape.
second_sections = sorted({
    entry for entry in link_list
    if re.match(r'[0-9]\.[0-9]{1,2}\.[ ].*', entry[0])
})

In [37]:
def produceID(title, package):
    """Build a lowercase, underscore-separated identifier for a section title.

    Parentheses, colons and commas are treated as separators, whitespace-
    delimited tokens that start with a digit (such as the "1.1." section
    number) are dropped, and dots become underscores. Runs of underscores of
    any length are collapsed to a single one.

    Parameters
    ----------
    title : str
        Section title, e.g. "1.1. Linear Models".
    package : str
        Prefix placed in front of the identifier, e.g. "sklearn".

    Returns
    -------
    str
        ``"<package>_<slug>"``, e.g. ``"sklearn_linear_models"``.
    """
    cleaned = re.sub("[():,]", " ", title)

    # split() without an argument splits on any whitespace run, so tabs and
    # newlines in scraped link text never end up inside the identifier.
    words = [word for word in cleaned.split() if not re.match('[0-9]', word)]

    slug = "_".join(words).replace(".", "_")
    # A single str.replace("__", "_") pass leaves "__" behind when three or
    # more separators are adjacent; the regex collapses runs of any length.
    slug = re.sub("_+", "_", slug).strip("_").lower()

    return f'{package}_{slug}'
    

In [39]:
# One (page id, title, relative link) record per second-level section.
sections = [
    (produceID(title, 'sklearn'), title, href)
    for title, href in second_sections
]

In [8]:
def getPage(url, timeout=30):
    """Download a page and return its first ``<div class="section">`` element.

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the caller indefinitely.

    Returns
    -------
    bs4.element.Tag or None
        The first matching ``div``, or ``None`` when the page has none.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-success HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as content.
    response.raise_for_status()

    # Name the parser explicitly so the result is the same on every machine.
    page_soup = BeautifulSoup(response.content, "html.parser")
    return page_soup.find('div', {'class': "section"})

In [46]:
# Read a previously downloaded page from disk and parse it.
# NOTE(review): this filename follows the number-based naming seen in the
# directory listing, not the "sklearn_..." ids produceID generates now -
# confirm the file exists before relying on this cell.
with open('sklearn_data/1_1_Linear_Models.txt') as f:
    html_str = f.read()

# The parser is named explicitly so the tree is the same on every machine.
soup = BeautifulSoup(html_str, "html.parser")

In [48]:
# Display every <h2> heading found in the parsed page.
soup.find_all('h2')

[<h2>1.1.1. Ordinary Least Squares<a class="headerlink" href="#ordinary-least-squares" title="Permalink to this headline">¶</a></h2>,
 <h2>1.1.2. Ridge regression and classification<a class="headerlink" href="#ridge-regression-and-classification" title="Permalink to this headline">¶</a></h2>,
 <h2>1.1.3. Lasso<a class="headerlink" href="#lasso" title="Permalink to this headline">¶</a></h2>,
 <h2>1.1.4. Multi-task Lasso<a class="headerlink" href="#multi-task-lasso" title="Permalink to this headline">¶</a></h2>,
 <h2>1.1.5. Elastic-Net<a class="headerlink" href="#elastic-net" title="Permalink to this headline">¶</a></h2>,
 <h2>1.1.6. Multi-task Elastic-Net<a class="headerlink" href="#multi-task-elastic-net" title="Permalink to this headline">¶</a></h2>,
 <h2>1.1.7. Least Angle Regression<a class="headerlink" href="#least-angle-regression" title="Permalink to this headline">¶</a></h2>,
 <h2>1.1.8. LARS Lasso<a class="headerlink" href="#lars-lasso" title="Permalink to this headline">¶</a><

In [54]:
# Cut the prettified page at each "<h2>" marker. The piece before the first
# marker is discarded and the marker is put back in front of every other piece.
html_sections = [
    f'<h2>{chunk}'
    for chunk in soup.prettify().split('<h2>')[1:]
]

In [57]:
# Parse the first <h2> chunk on its own. The parser is named explicitly so the
# tree does not depend on which optional parsers are installed.
soup2 = BeautifulSoup(html_sections[0], "html.parser")

In [62]:
# Remove every inline math <span> from the parsed chunk. This mutates soup2
# in place.
math_spans = soup2.find_all("span", {'class': 'math notranslate nohighlight'})
for math_span in math_spans:
    math_span.decompose()

In [78]:
def processhtml(text=None):
    """Flatten text extracted from HTML onto a single line.

    Every run of newlines is replaced by one space, then every run of two or
    more spaces is collapsed to a single space.

    Parameters
    ----------
    text : str, optional
        Text to clean. When omitted, the text of the notebook-level ``soup2``
        object is used.

    Returns
    -------
    str
        The flattened text. Leading and trailing single spaces are kept.
    """
    if text is None:
        text = soup2.text
    rem_newlines = re.sub(r'[\n]{1,}', ' ', text)
    rem_whitespace = re.sub(r'[\ ]{2,}', ' ', rem_newlines)
    return rem_whitespace



In [79]:
# Compute the flattened text in this cell rather than relying on a variable
# left over from an earlier kernel session: newline runs become one space,
# then runs of spaces are collapsed.
rem_whitespace = re.sub(r'[\ ]{2,}', ' ', re.sub(r'[\n]{1,}', ' ', soup2.text))
rem_whitespace

' 1.1.1. Ordinary Least Squares ¶ LinearRegression fits a linear model with coefficients to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation. Mathematically it solves a problem of the form: \\[\\min_{w} || X w - y||_2^2\\] LinearRegression will take in its fit method arrays X, y and will store the coefficients of the linear model in its coef_ member: >>> from sklearn import linear_model >>> reg = linear_model.LinearRegression() >>> reg.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) LinearRegression() >>> reg.coef_ array([0.5, 0.5]) The coefficient estimates for Ordinary Least Squares rely on the independence of the features. When features are correlated and the columns of the design matrix have an approximate linear dependence, the design matrix becomes close to singular and as a result, the least-squares estimate becomes highly sensitive to random errors in the observed target, producing a large variance.

In [47]:
# Preview only the beginning of the parsed document; printing the whole page
# floods the notebook output and bloats the saved file.
print(soup.prettify()[:2000])

<div class="section" id="linear-models">
 <span id="linear-model">
 </span>
 <h1>
  1.1. Linear Models
  <a class="headerlink" href="#linear-models" title="Permalink to this headline">
   ¶
  </a>
 </h1>
 <p>
  The following are a set of methods intended for regression in which
the target value is expected to be a linear combination of the features.
In mathematical notation, if
  <span class="math notranslate nohighlight">
   \(\hat{y}\)
  </span>
  is the predicted
value.
 </p>
 <div class="math notranslate nohighlight">
  \[\hat{y}(w, x) = w_0 + w_1 x_1 + ... + w_p x_p\]
 </div>
 <p>
  Across the module, we designate the vector
  <span class="math notranslate nohighlight">
   \(w = (w_1,
..., w_p)\)
  </span>
  as
  <code class="docutils literal notranslate">
   <span class="pre">
    coef_
   </span>
  </code>
  and
  <span class="math notranslate nohighlight">
   \(w_0\)
  </span>
  as
  <code class="docutils literal notranslate">
   <span class="pre">
    intercept_
   </span>
  <

In [10]:
# Make sure the output directory exists once, before the loop starts.
os.makedirs('sklearn_data', exist_ok=True)

for pageid, title, link in sections:
    print(title)

    try:
        page = getPage(base_link + link)
        if page is None:
            # Writing str(None) would leave a file containing just "None".
            print(f"Error retrieving page {title}: no section content found")
        else:
            with open(f'sklearn_data/{pageid}.txt', 'w') as f:
                f.write(str(page))
    except Exception as exc:
        # Catch Exception rather than using a bare except so Ctrl-C
        # (KeyboardInterrupt) can still stop the loop, and report the cause.
        print(f"Error retrieving page {title}: {exc!r}")

    # Pause between requests to avoid hammering the documentation server.
    sleep(2)
    

1.1. Linear Models
1.10. Decision Trees
1.11. Ensemble methods
1.12. Multiclass and multilabel algorithms
1.13. Feature selection
1.14. Semi-Supervised
1.15. Isotonic regression
1.16. Probability calibration
1.17. Neural network models (supervised)
1.2. Linear and Quadratic Discriminant Analysis
1.3. Kernel ridge regression
1.4. Support Vector Machines
1.5. Stochastic Gradient Descent
1.6. Nearest Neighbors
1.7. Gaussian Processes
1.8. Cross decomposition
1.9. Naive Bayes
2.1. Gaussian mixture models
2.2. Manifold learning
2.3. Clustering
2.4. Biclustering
2.5. Decomposing signals in components (matrix factorization problems)
2.6. Covariance estimation
2.7. Novelty and Outlier Detection
2.8. Density Estimation
2.9. Neural network models (unsupervised)
3.1. Cross-validation: evaluating estimator performance
3.2. Tuning the hyper-parameters of an estimator
3.3. Metrics and scoring: quantifying the quality of predictions
3.4. Model persistence
3.5. Validation curves: plotting scores to ev

In [45]:
# Display the names of the entries in the download directory, sorted.
sorted(os.listdir('sklearn_data'))

['.ipynb_checkpoints',
 '1_10_Decision_Trees.txt',
 '1_11_Ensemble_methods.txt',
 '1_12_Multiclass_and_multilabel_algorithms.txt',
 '1_13_Feature_selection.txt',
 '1_14_Semi-Supervised.txt',
 '1_15_Isotonic_regression.txt',
 '1_16_Probability_calibration.txt',
 '1_17_Neural_network_models_(supervised).txt',
 '1_1_Linear_Models.txt',
 '1_2_Linear_and_Quadratic_Discriminant_Analysis.txt',
 '1_3_Kernel_ridge_regression.txt',
 '1_4_Support_Vector_Machines.txt',
 '1_5_Stochastic_Gradient_Descent.txt',
 '1_6_Nearest_Neighbors.txt',
 '1_7_Gaussian_Processes.txt',
 '1_8_Cross_decomposition.txt',
 '1_9_Naive_Bayes.txt',
 '2_1_Gaussian_mixture_models.txt',
 '2_2_Manifold_learning.txt',
 '2_3_Clustering.txt',
 '2_4_Biclustering.txt',
 '2_5_Decomposing_signals_in_components_(matrix_factorization_problems).txt',
 '2_6_Covariance_estimation.txt',
 '2_7_Novelty_and_Outlier_Detection.txt',
 '2_8_Density_Estimation.txt',
 '2_9_Neural_network_models_(unsupervised).txt',
 '3_1_Cross-validation:_evaluatin

In [None]:
# Fetch one documentation page directly and preview its parsed HTML.
# The stray, unterminated URL string that used to open this cell was removed
# because it made the whole cell a SyntaxError.
req = requests.get("https://scikit-learn.org/stable/modules/linear_model.html#lars-lasso", timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")
print(soup.prettify()[:2000])