## 1. Get webpage using *requests*

In [50]:
import requests

# Download the Wikipedia article used as the starting point of the crawl.
# The timeout keeps the cell from hanging forever on a stalled connection,
# and raise_for_status() fails right here on an HTTP error (4xx/5xx)
# instead of letting an error page flow into the parsing steps below.
req = requests.get('https://en.wikipedia.org/wiki/Deep_learning', timeout=30)
req.raise_for_status()

In [51]:
# Keep the response body as text; this is the HTML that is printed and parsed below.
webpage = req.text

In [52]:
# Preview only the beginning of the page: the full HTML is very long and
# printing all of it buries the rest of the notebook under output.
print(webpage[:1000])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Deep learning - Wikipedia</title>
<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled";(function(){var cookie=document.cookie.match(/

## 2. Get specific contents using BeautifulSoup

In [53]:
from bs4 import BeautifulSoup

# Build a parse tree from the downloaded HTML with the 'html.parser' backend.
soup = BeautifulSoup(markup=webpage, features='html.parser')

### 2.1 Prettify the webpage

In [54]:
# prettify() returns the whole document re-indented as one string; show only
# the beginning so the output stays readable.
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Deep learning - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-disabled vector-feature-page-tools-pinned-disabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled";(function(){var cookie=docum

### 2.2 Get the first paragraph with information

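The `attrs={"class": False}` filter is meant to keep only `<p>` tags that have no `class` attribute. Try removing `attrs` to see how the result changes.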

In [55]:
# Collect the <p> tags of the page, filtered on class=False.
# NOTE(review): the False filter is presumably meant to keep only paragraphs
# that carry no class attribute — confirm against the BeautifulSoup docs.
paragraphs = soup.find_all('p', attrs={"class":False})

In [56]:
# Take the second match (index 1).
# NOTE(review): presumably the first matching <p> is empty or not part of the
# article text — confirm. Raises IndexError if fewer than two were found.
paragraph = paragraphs[1]

### 2.3 Get all the links in the paragraph which point to other webpages

In [57]:
# Gather every <a> in the paragraph that has a title attribute, then split
# the matches into two parallel lists: the link titles and their hrefs.
anchors = paragraph.find_all('a', attrs={"title": True})
data = {
    "title": [anchor["title"] for anchor in anchors],
    "href": [anchor["href"] for anchor in anchors],
}

In [58]:
import pandas as pd

# Turn the two parallel lists into a table with one row per link.
df = pd.DataFrame.from_dict(data)

In [59]:
# Display the collected link titles and hrefs.
df

Unnamed: 0,title,href
0,Machine learning,/wiki/Machine_learning
1,Artificial neural network,/wiki/Artificial_neural_network
2,Representation learning,/wiki/Representation_learning
3,Supervised learning,/wiki/Supervised_learning
4,Semi-supervised learning,/wiki/Semi-supervised_learning
5,Unsupervised learning,/wiki/Unsupervised_learning


## 3. Get the contents from all the webpages

In [60]:
from urllib.parse import urljoin

# Download the page behind every link collected so far.
# NOTE(review): hrefs are taken from data["href"] as-is, so an href that
# appears more than once in the list is also downloaded more than once.
webpages = []
head = "https://en.wikipedia.org"
for href in data["href"]:
    # urljoin resolves site-relative hrefs ("/wiki/...") against `head` and
    # leaves absolute or protocol-relative URLs intact, where plain string
    # concatenation would produce an invalid address for them.
    link = urljoin(head, href)
    # Bound the wait per request and stop on an HTTP error rather than
    # storing an error page as if it were an article.
    req = requests.get(link, timeout=30)
    req.raise_for_status()
    webpage = req.text
    webpages.append(webpage)

In [61]:
# Number of pages downloaded (one per entry in data["href"]).
len(webpages)

6

## 4. Repeat the above process twice to get more data

In [62]:
# Parse every downloaded page, pick the same paragraph as in section 2.2, and
# append the titled links found in it to the running `data` lists.
# NOTE(review): `data` is extended in place, so re-running this cell appends
# the same links again.
for webpage in webpages:
    soup = BeautifulSoup(webpage, 'html.parser')
    paragraphs = soup.find_all('p', attrs={"class":False})
    if len(paragraphs) < 2:
        # A page with fewer than two matching <p> tags has no paragraph at
        # index 1; skip it instead of aborting the whole crawl with an
        # IndexError after `data` has already been partially extended.
        continue
    paragraph = paragraphs[1]
    for link in paragraph.find_all('a', attrs={"title":True}):
        data["title"].append(link["title"])
        data["href"].append(link["href"])

In [65]:
from urllib.parse import urljoin

# Download the page behind every link collected so far.
# NOTE(review): data["href"] still contains the hrefs that were already
# downloaded in section 3, so those pages are fetched again here, and any
# href that appears more than once is downloaded more than once — confirm
# whether that is intended.
webpages = []
head = "https://en.wikipedia.org"
for href in data["href"]:
    # urljoin resolves site-relative hrefs ("/wiki/...") against `head` and
    # leaves absolute or protocol-relative URLs intact, where plain string
    # concatenation would produce an invalid address for them.
    link = urljoin(head, href)
    # Bound the wait per request and stop on an HTTP error rather than
    # storing an error page as if it were an article.
    req = requests.get(link, timeout=30)
    req.raise_for_status()
    webpage = req.text
    webpages.append(webpage)

In [66]:
# Parse every downloaded page, pick the same paragraph as in section 2.2, and
# append the titled links found in it to the running `data` lists.
# NOTE(review): `webpages` was rebuilt from all of data["href"], which looks
# like it includes the pages already parsed in the previous round, so their
# links get appended a second time — confirm whether duplicates are intended.
for webpage in webpages:
    soup = BeautifulSoup(webpage, 'html.parser')
    paragraphs = soup.find_all('p', attrs={"class":False})
    if len(paragraphs) < 2:
        # A page with fewer than two matching <p> tags has no paragraph at
        # index 1; skip it instead of aborting the whole crawl with an
        # IndexError after `data` has already been partially extended.
        continue
    paragraph = paragraphs[1]
    for link in paragraph.find_all('a', attrs={"title":True}):
        data["title"].append(link["title"])
        data["href"].append(link["href"])

In [67]:
# Rebuild the table from the extended lists (one row per collected link).
df = pd.DataFrame.from_dict(data)

In [68]:
# Display the full set of collected link titles and hrefs.
df

Unnamed: 0,title,href
0,Machine learning,/wiki/Machine_learning
1,Artificial neural network,/wiki/Artificial_neural_network
2,Representation learning,/wiki/Representation_learning
3,Supervised learning,/wiki/Supervised_learning
4,Semi-supervised learning,/wiki/Semi-supervised_learning
...,...,...
187,Map (mathematics),/wiki/Map_(mathematics)
188,Training set,/wiki/Training_set
189,Inductive bias,/wiki/Inductive_bias
190,Generalization error,/wiki/Generalization_error


## 5. Save df to csv

In [69]:
# Write the table to disk; index=False leaves the row index out of the file,
# so it holds only the title and href columns.
df.to_csv(path_or_buf='Sample_data.csv', index=False)