# Let's Begin "How to Learn Scraping" 

In [148]:
import re
import socket
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import xml.etree.ElementTree as ET
import json

### A quick revision of regular expressions

In [2]:
# Sample text for the regex demos below; rstrip() removes any trailing whitespace
base = "From: and To: are important constructs"
line = base.rstrip()

In [8]:
# re.search looks for the pattern anywhere in the string and returns a Match (or None);
# str.find returns the index of the first occurrence of a substring (or -1).

regex_hit = re.search('From:', line)
substring_index = line.find('rom:')
print("Regular:", regex_hit)
print("String: ", substring_index)

Regular: <re.Match object; span=(0, 5), match='From:'>
String:  1


In [9]:
# Anchoring the pattern with ^ is the regex counterpart of str.startswith

anchored_hit = re.search('^From:', line)
has_prefix = line.startswith('From:')
print("Regular: ", anchored_hit)
print("String: ", has_prefix)

Regular:  <re.Match object; span=(0, 5), match='From:'>
String:  True


In [26]:
# Raw string so that \. reaches the regex engine as an escaped dot; in a normal
# string literal '\.' is an invalid escape sequence (warned about by recent Pythons).
EMAIL_PATTERN = r'^.*@(gmail|google)\.(com)$'

if re.match(EMAIL_PATTERN, 'shivam13juna@gmail.com'):
    print("Found Match")
else:
    print("Didn't find match")

Found Match


In [36]:
line = 'X-plane is behind schedule:'
line1 = 'X-Sieve:'

# [^\s]+ (one or more non-whitespace characters) is the long-hand spelling of \S+.
# A bare ^ outside a character class is a start-of-string anchor, so a pattern
# like 'X-^\s+:' can never match; the class brackets are required.
negated_class_match = re.match(r'X-[^\s]+:', line1)

# Last expression of the cell, so the notebook displays this match
re.match(r'X-\S+:', line1)

<re.Match object; span=(0, 8), match='X-Sieve:'>

In [2]:
# Demonstrating findall: every non-overlapping run of digits comes back as a string

x = 'My 2 favorite numbers are 19 and 42'
digit_runs = re.compile('[0-9]+')
y = digit_runs.findall(x)
print(y)


'''
^F.+?: is non greedy matching
^F.+: is greedy matching

'''

['2', '19', '42']


'\n^F.+?: is non greedy matching\n^F.+: is greedy matching\n\n'

In [42]:
# Raw string: \S (non-whitespace) must reach the regex engine untouched instead of
# being treated as an invalid Python string escape.
EMAIL_IN_TEXT = r'\S+?@\S+'
re.findall(EMAIL_IN_TEXT, 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008')

['stephen.marquard@uct.ac.za']

In [43]:
# Practice text with numbers scattered through it. Adjacent string literals are
# joined with no separator, so some numbers sit directly against the next word.
store = (
    'Why should you learn to write programs? 7746 '
    '12 1929 8827'
    'Writing programs (or programming) is a very creative '
    '7 and rewarding activity.  You can write programs for '
    'many reasons, ranging from making your living to solving'
    '8837 a difficult data analysis problem to having fun to helping 128'
    'someone else solve a problem.  This book assumes that '
    'everyone needs to know how to program ...'
)

In [54]:
# Add up every run of digits found in `store`.
# The raw string keeps \d as a regex escape rather than an invalid Python string escape.
total_sum = sum(int(number) for number in re.findall(r'\d+', store))

print("Total sum was: ", total_sum)

Total sum was:  27486


In [1]:
import re

# Read the whole file; the with-block closes the handle as soon as the text is loaded
with open('data.txt', 'r') as handle:
    file = handle.read()

# Raw string keeps \d as a regex escape; int is applied directly to each digit run
print("Total sum is: ", sum(map(int, re.findall(r'\d+', file))))

Total sum is:  475263


## Starting with Sockets 

In [18]:
# Open an IPv4 stream (TCP) socket and connect it to the web server on port 80
HOST, PORT = 'www.data.pr4e.org', 80
mysock = socket.socket(family=socket.AF_INET, type=socket.SOCK_STREAM)
mysock.connect((HOST, PORT))

In [19]:
# Minimal HTTP/1.0 request: the request line followed by the blank line that ends the headers
request_text = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'
cmd = request_text.encode()
# send() returns the number of bytes written, which the notebook shows as the cell output
mysock.send(cmd)

47

In [20]:
# Read the response until the server closes the connection (recv returns b'').
# The raw chunks are collected and decoded once at the end: decoding each
# 1024-byte chunk separately could cut a multi-byte character in half, and
# printing per chunk inserts newlines at arbitrary chunk boundaries.
chunks = []
try:
    while True:
        data = mysock.recv(1024)
        if not data:
            break
        chunks.append(data)
finally:
    # Release the socket even if recv() fails part-way through
    mysock.close()

print(b''.join(chunks).decode())

HTTP/1.1 200 OK
Date: Tue, 21 Jul 2020 12:19:36 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "a7-54f6609245537"
Accept-Ranges: bytes
Content-Length: 167
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief



## Starting with URLLIB

In [24]:
# urlopen returns a file-like response object; the with-block closes the
# connection when the loop finishes, or if it fails part-way through.
with urllib.request.urlopen('http://data.pr4e.org/romeo.txt') as fhand:
    # Iterating yields one line at a time as bytes, so decode before stripping
    for line in fhand:
        print(line.decode().strip())

In [31]:
# Call .read() to pull the whole body in one go; the result is a bytes object

response = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')
fhand = response.read()

fhand

b'But soft what light through yonder window breaks\nIt is the east and Juliet is the sun\nArise fair sun and kill the envious moon\nWho is already sick and pale with grief\n'

## Starting with Beautiful Soup now 

In [34]:
url = 'http://data.pr4e.org/romeo.txt'
html = urllib.request.urlopen(url).read()

# Hand the downloaded bytes to Beautiful Soup, using the 'html.parser' backend
soup = BeautifulSoup(html, 'html.parser')

# Collect every anchor (<a>) tag in the document
tags = soup.find_all('a')

# Print each link target; .get() gives None for an anchor that has no href
for tag in tags:
    print(tag.get('href'))


# Other parts of a tag that can be inspected the same way:
#
# tags = soup('a')
# for tag in tags:
#     print('TAG:', tag)
#     print('URL:', tag.get('href', None))
#     print('Contents:', tag.contents[0])
#     print('Attrs:', tag.attrs)

### Trick for ignoring SSL errors (HTTPS)

In [36]:
# Build a TLS context that performs no certificate or hostname verification.
# WARNING: with verification off, the identity of the server is not checked,
# so use this context only for throwaway exercises like the ones below.
ctx = ssl.create_default_context()
# Order matters: hostname checking has to be switched off before verify_mode
# can be set to CERT_NONE.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

In [53]:
url = 'http://dr-chuck.com'
page = urllib.request.urlopen(url, context=ctx)
html = page.read()

soup = BeautifulSoup(html, 'html.parser')

# Every anchor (<a>) tag on the page
tags = soup.find_all('a')

# Print each link target; .get() gives None when the anchor has no href
for tag in tags:
    print(tag.get('href'))
    
    

https://www.learnerprivacy.org/
https://www.si.umich.edu/
https://www.ratemyprofessors.com/ShowRatings.jsp?tid=1159280
https://www.learnerprivacy.org
https://www.dr-chuck.com/csev-blog/
https://www.twitter.com/drchuck/
https://www.dr-chuck.com/dr-chuck/resume/speaking.htm
https://www.slideshare.net/csev
/dr-chuck/resume/index.htm
https://amzn.to/1K5Q81K
https://www.coursera.org/instructor/drchuck
http://afs.dr-chuck.com/papers/
https://itunes.apple.com/us/podcast/computing-conversations/id731495760
https://www.youtube.com/playlist?list=PLHJB2bhmgB7dFuY7HmrXLj5BmHGKTD-3R
https://developers.imsglobal.org/
https://www.youtube.com/user/csev
https://vimeo.com/drchuck/videos
https://backpack.openbadges.org/share/4f76699ddb399d162a00b89a452074b3/
https://www.linkedin.com/in/charlesseverance/
https://www.researchgate.net/profile/Charles_Severance/
https://www.learnerprivacy.org/
https://www.py4e.com/
https://www.dj4e.com/
https://www.wa4e.com/
https://www.coursera.org/course/insidetheinternet


In [63]:
# Graded assignment: add up the integer held in every <span> on the page

url = 'http://py4e-data.dr-chuck.net/comments_821796.html'
html = urllib.request.urlopen(url, context=ctx).read()

soup = BeautifulSoup(html, 'html.parser')

tags = soup.find_all('span')

# contents[0] is the first child of the span, converted to int before summing
span_total = sum(int(span.contents[0]) for span in tags)
print("Total sum is: ", span_total)

Total sum is:  2395


In [66]:
# Graded Assignment: fetch the starting page and collect its anchor tags

url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
page = urllib.request.urlopen(url, context=ctx)
html = page.read()

soup = BeautifulSoup(html, 'html.parser')

tags = soup.find_all('a')


In [69]:
# href attribute of every anchor collected in `tags`, in page order
name = [anchor.get('href') for anchor in tags]

In [71]:
# Third link in the list (indices start at 0)
name[2]

'http://py4e-data.dr-chuck.net/known_by_Montgomery.html'

In [86]:
count = 3
nex = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
while count >= 0:
    # The name is the part of the URL after the last '_' and before the following '.'
    print(nex.split('_')[-1].split('.')[0])

    # Fetch the current page and move on to the link at index 2 (the third anchor)
    hop_page = urllib.request.urlopen(nex, context=ctx).read()
    hop_anchors = BeautifulSoup(hop_page, 'html.parser')('a')
    hop_links = [anchor.get('href') for anchor in hop_anchors]
    nex = hop_links[2]
    count -= 1

Fikret
Montgomery
Mhairade
Butchi


In [91]:
count = 7
nex = 'http://py4e-data.dr-chuck.net/known_by_Majid.html'
while count >= 0:
    # The name is the part of the URL after the last '_' and before the following '.'
    print(nex.split('_')[-1].split('.')[0])

    # Fetch the current page and move on to the link at index 17 (the 18th anchor)
    hop_page = urllib.request.urlopen(nex, context=ctx).read()
    hop_anchors = BeautifulSoup(hop_page, 'html.parser')('a')
    hop_links = [anchor.get('href') for anchor in hop_anchors]
    nex = hop_links[17]
    count -= 1

Majid
Alba
Armaan
Temilade
Reno
Muriel
Lilliarna
Adana


In [84]:
# Name portion of the current URL: the text after the last '_', up to the next '.'
nex.rsplit('_', 1)[-1].split('.', 1)[0]

'Anayah'

In [81]:
# re.match only tests at the very start of the string, so it can never find
# '.html' inside a URL; re.search scans the whole string. The raw string keeps
# \. as an escaped dot for the regex engine.
re.search(r'\.html', nex)

## Parsing XML Structure 

In [118]:
# Example with a single <person> element: read one child's text and one attribute

data = '''
<person>
    <name>Chuck</name>
    <phone type='int1'>
        +1 734 303 4456
    </phone>
    <email hide="yes"/>
</person>
'''

tree = ET.fromstring(data)

# find() returns the first direct child with the given tag
name_element = tree.find('name')
email_element = tree.find('email')

print('Name: ', name_element.text)
print('Attr: ', email_element.get('hide'))

Name:  Chuck
Attr:  yes


In [121]:
# Example with repeated elements: list every <user> under <users>

# Named users_xml rather than `input`, which would shadow the built-in input()
# for the rest of the notebook session.
users_xml = '''
<stuff>
    <users>
        <user x='2'>
            <id>001</id>
            <name>Chuck</name>
        </user>
        <user x='7'>
            <id>009</id>
            <name>Brent</name>
        </user>
    </users>
</stuff>
    
'''

stuff = ET.fromstring(users_xml)
# The path is relative to the root element <stuff>
lst = stuff.findall('users/user')
print('User Count: ', len(lst))

for item in lst:
    print('Name', item.find('name').text)
    print('Id', item.find('id').text)
    print('Attribute', item.get('x'))



User Count:  2
Name Chuck
Id 001
Attribute 2
Name Brent
Id 009
Attribute 7


In [144]:
# Graded assignment: total the <count> value of every comment in the XML document

url = 'http://py4e-data.dr-chuck.net/comments_821798.xml'
response = urllib.request.urlopen(url, context=ctx)
xml = response.read().decode()

stuff = ET.fromstring(xml)
lst = stuff.findall('comments/comment')
print('User Count: ', len(lst))

# Each comment's <count> text is converted to int before summing
total_sum = sum(int(comment.find('count').text) for comment in lst)

print("Total sum was: ", total_sum)

User Count:  50
Total sum was:  2723


In [159]:
# Graded Assignment: total the 'count' field of every entry under 'comments'

response = urllib.request.urlopen('http://py4e-data.dr-chuck.net/comments_821799.json')
content = response.read().decode()

data = json.loads(content)

total_sum = sum(entry['count'] for entry in data['comments'])

print("Total sum is: ", total_sum)

Total sum is:  2286


In [175]:
# Graded Assignment: look up an address through a geocoding web service and
# print the place_id of the first result.

import urllib.request, urllib.parse, urllib.error
import json
import ssl

# Leave as False to use the course proxy endpoint (with the fixed key 42);
# set it to a key to query the Google Maps endpoint instead.
api_key = False

if api_key is False:
    api_key = 42
    serviceurl = 'http://py4e-data.dr-chuck.net/json?'
else:
    serviceurl = 'https://maps.googleapis.com/maps/api/geocode/json?'

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

address = 'Warsaw University'

# urlencode takes care of escaping the address for use in a query string
parms = {'address': address}
if api_key is not False:
    parms['key'] = api_key
url = serviceurl + urllib.parse.urlencode(parms)

print('Retrieving', url)
# The with-block closes the HTTP response once the body has been read
with urllib.request.urlopen(url, context=ctx) as uh:
    data = uh.read().decode()
print('Retrieved', len(data), 'characters')

# Only a malformed body is treated as "no data"; other errors still surface
try:
    js = json.loads(data)
except json.JSONDecodeError:
    js = None

# Guard against an unparseable or empty reply instead of failing with a
# TypeError/IndexError on the lookup below.
if not isinstance(js, dict) or not js.get('results'):
    print('==== Failure To Retrieve ====')
    print(data)
else:
    print("Place id is: ", js['results'][0]['place_id'])

Retrieving http://py4e-data.dr-chuck.net/json?address=Warsaw+University&key=42
Retrieved 2331 characters
Place id is:  ChIJCWP6Nl7MHkcRf_xabKDNoaQ


# Scraping Brahma Kumaris site for downloading all murlis 

In [99]:
# Build the request with an explicit User-Agent header, then download the page body
browser_headers = {'User-Agent': 'Chrome/84.0.4147.89'}
req = urllib.request.Request('https://madhubanmurli.org', headers=browser_headers)
html = urllib.request.urlopen(req).read()

In [109]:

site_url = 'https://madhubanmurli.org'
req = urllib.request.Request(site_url, headers={'User-Agent': 'Chrome/84.0.4147.89'})
html = urllib.request.urlopen(req).read()

soup = BeautifulSoup(html, 'html.parser')

# Every <div> on the page; a narrower alternative would be
# soup.findAll("div", {"class": "murli-body"})
tags = soup.find_all('div')

In [113]:
# Print only the <div> tags whose class attribute includes 'murli-body'.
# `'murli-body' in tag` tests the tag's children rather than its attributes,
# so the class attribute is looked up explicitly; a tag with no class gives
# None, which is replaced by an empty list.
for i in tags:
    if 'murli-body' in (i.get('class') or []):
        print(i)

In [None]:
# find_all is the current name for the legacy findAll alias (and matches the
# spelling used elsewhere in this notebook); class_ filters on the CSS class.
soup.find_all("div", class_="stylelistrow")

In [None]:
from bs4 import BeautifulSoup
import sys 

# For each <div class="foo">, print the text of its first nested <div class="bar">
for foo in soup.find_all('div', attrs={'class': 'foo'}):
    bar = foo.find('div', attrs={'class': 'bar'})
    # find() returns None when nothing matches; skip that div rather than
    # failing with AttributeError on .text
    if bar is not None:
        print(bar.text)