# Web Scraping Mini Project

In [75]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Fetch the GitHub topics index page.
url = 'https://github.com/topics'
# requests applies no timeout by default, so without one this cell could hang
# forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page downstream.
response.raise_for_status()

In [3]:
# Show the HTTP status code returned for the request.
response.status_code

200

In [4]:
page_contents = response.text  # response body as text (the page's HTML)

In [5]:
# Preview the first 1000 characters of the downloaded HTML.
page_contents[:1000]

'\n\n<!DOCTYPE html>\n<html lang="en" data-color-mode="auto" data-light-theme="light" data-dark-theme="dark">\n  <head>\n    <meta charset="utf-8">\n  <link rel="dns-prefetch" href="https://github.githubassets.com">\n  <link rel="dns-prefetch" href="https://avatars.githubusercontent.com">\n  <link rel="dns-prefetch" href="https://github-cloud.s3.amazonaws.com">\n  <link rel="dns-prefetch" href="https://user-images.githubusercontent.com/">\n  <link rel="preconnect" href="https://github.githubassets.com" crossorigin>\n  <link rel="preconnect" href="https://avatars.githubusercontent.com">\n\n\n\n  <link crossorigin="anonymous" media="all" integrity="sha512-E9wnWjoxQmh5A1jiWVYDPKOvA8VPf0iKQYoc+9ycMJvtAi9gOSlaUci+W2smxFIlWkV8hkX+O27S8NIB59iIDw==" rel="stylesheet" href="https://github.githubassets.com/assets/light-13dc275a3a314268790358e25956033c.css" /><link crossorigin="anonymous" media="all" integrity="sha512-nYSv3KrFhMlGUpjkFQBLMEN6HvHhijcoubQLjV3DWlcABEi2yDYf6KGUjRubJ5R+dJnKXR7jA4wu5Dg2

In [7]:
# Save the downloaded HTML to disk, encoded as UTF-8.
with open('webpage.html', mode='w', encoding='utf-8') as html_file:
    html_file.write(page_contents)

<h3>Parsing the page with Beautiful Soup</h3>

In [12]:
# Parse the HTML with Python's built-in 'html.parser' backend.
doc = BeautifulSoup(page_contents, features='html.parser')

<h3>Extract the titles of all topics</h3>

In [29]:
# Topic titles are the <p> tags matched by this class string.
p_tag = doc.find_all('p', class_='f3 lh-condensed mb-0 mt-1 Link--primary')
topic_title = [tag.text for tag in p_tag]
print(topic_title)

['3D', 'Ajax', 'Algorithm', 'Amp', 'Android', 'Angular', 'Ansible', 'API', 'Arduino', 'ASP.NET', 'Atom', 'Awesome Lists', 'Amazon Web Services', 'Azure', 'Babel', 'Bash', 'Bitcoin', 'Bootstrap', 'Bot', 'C', 'Chrome', 'Chrome extension', 'Command line interface', 'Clojure', 'Code quality', 'Code review', 'Compiler', 'Continuous integration', 'COVID-19', 'C++']


<h3>Extract the descriptions of all topics</h3>

In [35]:
# Collect the raw text of every description <p>; surrounding whitespace is
# kept at this stage.
p_tag_desc = doc.find_all('p', class_='f5 color-fg-muted mb-0 mt-1')
topic_desc = [tag.text for tag in p_tag_desc]
topic_desc

['\n          3D modeling is the process of virtually developing the surface and structure of a 3D object.\n        ',
 '\n          Ajax is a technique for creating interactive web applications.\n        ',
 '\n          Algorithms are self-contained sequences that carry out a variety of tasks.\n        ',
 '\n          Amp is a non-blocking concurrency framework for PHP.\n        ',
 '\n          Android is an operating system built by Google designed for mobile devices.\n        ',
 '\n          Angular is an open source web application platform.\n        ',
 '\n          Ansible is a simple and powerful automation engine.\n        ',
 '\n          An API (Application Programming Interface) is a collection of protocols and subroutines for building software.\n        ',
 '\n          Arduino is an open source hardware and software company and maker community.\n        ',
 '\n          ASP.NET is a web framework for building modern web apps and services.\n        ',
 '\n          Atom

<h3>Extract the URLs of all topics</h3>

In [74]:
# The matched anchors carry site-relative hrefs, so prefix each one with the
# GitHub root to get an absolute URL.
topic_link_tag = doc.find_all('a', class_='no-underline flex-1 d-flex flex-column', href=True)
topic_url = ['https://github.com' + anchor['href'] for anchor in topic_link_tag]
print(topic_url)

['https://github.com/topics/3d', 'https://github.com/topics/ajax', 'https://github.com/topics/algorithm', 'https://github.com/topics/amphp', 'https://github.com/topics/android', 'https://github.com/topics/angular', 'https://github.com/topics/ansible', 'https://github.com/topics/api', 'https://github.com/topics/arduino', 'https://github.com/topics/aspnet', 'https://github.com/topics/atom', 'https://github.com/topics/awesome', 'https://github.com/topics/aws', 'https://github.com/topics/azure', 'https://github.com/topics/babel', 'https://github.com/topics/bash', 'https://github.com/topics/bitcoin', 'https://github.com/topics/bootstrap', 'https://github.com/topics/bot', 'https://github.com/topics/c', 'https://github.com/topics/chrome', 'https://github.com/topics/chrome-extension', 'https://github.com/topics/cli', 'https://github.com/topics/clojure', 'https://github.com/topics/code-quality', 'https://github.com/topics/code-review', 'https://github.com/topics/compiler', 'https://github.com/t

In [83]:
# Strip the leading/trailing whitespace from each scraped description.
final_desc = [raw.strip() for raw in topic_desc]
final_desc

['3D modeling is the process of virtually developing the surface and structure of a 3D object.',
 'Ajax is a technique for creating interactive web applications.',
 'Algorithms are self-contained sequences that carry out a variety of tasks.',
 'Amp is a non-blocking concurrency framework for PHP.',
 'Android is an operating system built by Google designed for mobile devices.',
 'Angular is an open source web application platform.',
 'Ansible is a simple and powerful automation engine.',
 'An API (Application Programming Interface) is a collection of protocols and subroutines for building software.',
 'Arduino is an open source hardware and software company and maker community.',
 'ASP.NET is a web framework for building modern web apps and services.',
 'Atom is a open source text editor built with web technologies.',
 'An awesome list is a list of awesome things curated by the community.',
 'Amazon Web Services provides on-demand cloud computing platforms on a subscription basis.',
 'A

<h3>Creating a dataset from the extracted info</h3>

In [84]:
# Assemble the three parallel lists into one table, one row per topic.
df = pd.DataFrame(dict(
    title=topic_title,
    description=final_desc,
    url=topic_url,
))

In [85]:
# Display the assembled DataFrame.
df

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency framework fo...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


<img src="topics_github.png" width='600px'>

<h3>Convert the data into a CSV file</h3>

In [87]:
# Write the table to topics.csv without the DataFrame's index column.
df.to_csv(path_or_buf='topics.csv', index=False)

<h2 style="text-align:center">This is a mini web scraping project</h2>
<PRE><h3 style="text-align:center">------*---------*---------*-----------</h3></PRE>