forked from rahulbanerjee26/githubScraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
69 lines (59 loc) · 2.1 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from bs4 import BeautifulSoup
import requests
import pandas as pd
def getData(userName):
    """Scrape a GitHub user's public profile and repository list.

    Parameters
    ----------
    userName : str
        GitHub login whose ``?tab=repositories`` page is fetched.

    Returns
    -------
    tuple[dict, pandas.DataFrame]
        ``info`` maps profile fields (``name``, ``image_url``,
        ``followers``, ``following``, ``location``, ``url``) to strings
        (missing optional fields default to ``''``); the DataFrame has one
        row per non-forked ("source") repository with columns ``name``,
        ``link``, ``updated``, ``language``, ``description``.

    Raises
    ------
    requests.HTTPError
        If the profile page cannot be fetched (unknown user, rate limit).

    NOTE(review): selectors match GitHub's server-rendered markup at the
    time this was written — GitHub changes its HTML periodically; verify
    the class names / itemprop attributes still exist before relying on
    this in production.
    """
    url = "https://github.com/{}?tab=repositories".format(userName)
    page = requests.get(url)
    # Fail loudly on 404/429 instead of silently scraping an error page,
    # which previously surfaced as an AttributeError on 'vcard-fullname'.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    def _text(node):
        # Stripped text of a bs4 node; '' when the selector matched nothing.
        # Replaces the original's bare `except:` fallbacks with an explicit
        # None check so real errors are no longer swallowed.
        return node.get_text().strip() if node is not None else ''

    # --- profile info ---
    info = {}
    info['name'] = _text(soup.find(class_='vcard-fullname'))
    avatar = soup.find(class_='avatar-user')
    info['image_url'] = avatar['src'] if avatar is not None else ''
    # The follower/following count is the first line of the anchor's text.
    info['followers'] = _text(soup.select_one("a[href*=followers]")).split('\n')[0]
    info['following'] = _text(soup.select_one("a[href*=following]")).split('\n')[0]
    # Location and website are optional profile fields; default to ''.
    info['location'] = _text(soup.select_one('li[itemprop*=home]'))
    info['url'] = _text(soup.select_one('li[itemprop*=url]'))

    # --- repositories: class 'source' excludes forks ---
    repo_info = []
    for repo in soup.find_all(class_='source'):
        name = _text(repo.select_one('a[itemprop*=codeRepository]'))
        # Only build a link when a name was actually found; the original
        # left link = '' too, but via a bare except.
        link = 'https://github.com/{}/{}'.format(userName, name) if name else ''
        updated = _text(repo.find('relative-time'))
        language = _text(repo.select_one('span[itemprop*=programmingLanguage]'))
        description = _text(repo.select_one('p[itemprop*=description]'))
        # Fix: the original key was 'updated ' (trailing space), which
        # produced a DataFrame column literally named "updated ".
        repo_info.append({'name': name,
                          'link': link,
                          'updated': updated,
                          'language': language,
                          'description': description})
    return info, pd.DataFrame(repo_info)