# Web scraping using Python

The network protocol that powers the web is actually quite simple, and Python's
built-in socket module makes it very easy to make network connections and
retrieve data over those sockets in a Python program.

In [2]:
import socket

# The World’s Simplest Web Browser

In [3]:
# Fetch romeo.txt with a hand-written HTTP/1.0 request over a raw TCP socket
# and print the raw response (status line, headers and body) as it arrives.
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    mysock.connect(('www.py4inf.com', 80))
    # HTTP lines are terminated by CRLF and an empty line ends the request.
    # sendall() keeps writing until the whole request has been handed to the
    # network, whereas send() may stop after only part of it.
    mysock.sendall('GET http://www.py4inf.com/code/romeo.txt HTTP/1.0\r\n\r\n')
    while True:
        data = mysock.recv(512)
        if len(data) < 1:
            # An empty read means the server has closed the connection.
            break
        print(data)
finally:
    # Always release the socket, even if connecting or reading fails.
    mysock.close()

HTTP/1.0 200 OK
Content-Type: text/plain
Content-Length: 167
Date: Fri, 23 Feb 2018 05:48:19 GMT
Server: Apache
Last-Modified: Fri, 04 Dec 2015 19:05:04 GMT
ETag: "a7-526172f5b5d89"
Accept-Ranges: bytes
Cache-Control: max-age=604800, public
Access-Control-Allow-Origin: *
Access-Control-Allow-Headers: origin, x-requested-with, content-type
Access-Control-Allow-Methods: GET
X-Cache: MISS from localhost
X-Cache-Lookup: MISS from localhost:8080
Via: 1.0 localhost (squid/3.1.19)
Connection: close


But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief



# Retrieving an image over HTTP

In [4]:
import socket
import time
# Download cover.jpg over a raw TCP socket, printing the size of every chunk
# and the running total, then print the HTTP header and save the image body
# to stuff.jpg in the current directory.
mysock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    mysock.connect(('www.py4inf.com', 80))
    # CRLF line endings plus an empty line terminate the request; sendall()
    # guarantees the whole request is written.
    mysock.sendall('GET http://www.py4inf.com/cover.jpg HTTP/1.0\r\n\r\n')
    count = 0
    chunks = []
    while True:
        data = mysock.recv(5120)
        if len(data) < 1:
            # An empty read means the server has closed the connection.
            break
        # time.sleep(0.25)  # uncomment to pause between reads
        count = count + len(data)
        print('%d %d' % (len(data), count))
        # Collect the pieces and join once at the end instead of rebuilding
        # an ever-growing string on every iteration.
        chunks.append(data)
finally:
    # Always release the socket, even if connecting or reading fails.
    mysock.close()
picture = "".join(chunks)

# Look for the end of the header (2 CRLF)
pos = picture.find("\r\n\r\n")
if pos == -1:
    # Without this check picture[pos+4:] would silently become picture[3:]
    # and a corrupt file would be written.
    raise ValueError('No end-of-header marker (CRLF CRLF) found in response')
print('Header length %d' % pos)
print(picture[:pos])

# Skip past the header (and its 4-byte terminator) and save the picture data
picture = picture[pos+4:]
with open("stuff.jpg", "wb") as fhand:
    fhand.write(picture)

1416 1416
1338 2754
1338 4092
2676 6768
1338 8106
1338 9444
1350 10794
2664 13458
1338 14796
2676 17472
1338 18810
1338 20148
1338 21486
1338 22824
2676 25500
1338 26838
1338 28176
1338 29514
1338 30852
2676 33528
1338 34866
2676 37542
1338 38880
1338 40218
1338 41556
1338 42894
1338 44232
1338 45570
1338 46908
2676 49584
2676 52260
1338 53598
2676 56274
1338 57612
2676 60288
2676 62964
2688 65652
1326 66978
2676 69654
747 70401
Header length 340
HTTP/1.0 200 OK
Content-Type: image/jpeg
Content-Length: 70057
Date: Fri, 23 Feb 2018 05:54:25 GMT
Server: Apache
Last-Modified: Fri, 04 Dec 2015 19:05:04 GMT
ETag: "111a9-526172f5b7cc9"
Accept-Ranges: bytes
X-Cache: MISS from localhost
X-Cache-Lookup: MISS from localhost:8080
Via: 1.0 localhost (squid/3.1.19)
Connection: close


# Retrieving web pages with urllib

As an example, we can write a program to retrieve the data for romeo.txt and
compute the frequency of each word in the file as follows:

In [5]:
import urllib
# Count how often each whitespace-separated word occurs in romeo.txt,
# reading the file line by line straight from the web server.
counts = dict()
fhand = urllib.urlopen('http://www.py4inf.com/code/romeo.txt')
try:
    for line in fhand:
        for word in line.split():
            # get() supplies 0 the first time a word is seen.
            counts[word] = counts.get(word, 0) + 1
finally:
    # Release the connection even if reading fails part-way through.
    fhand.close()
print(counts)

{'and': 3, 'envious': 1, 'already': 1, 'fair': 1, 'is': 3, 'through': 1, 'pale': 1, 'yonder': 1, 'what': 1, 'sun': 2, 'Who': 1, 'But': 1, 'moon': 1, 'window': 1, 'sick': 1, 'east': 1, 'breaks': 1, 'grief': 1, 'with': 1, 'light': 1, 'It': 1, 'Arise': 1, 'kill': 1, 'the': 3, 'soft': 1, 'Juliet': 1}


# Parsing HTML and scraping the web

Web scraping is when we write a program that pretends to be a web browser and
retrieves pages, then examines the data in those pages looking for patterns. As an example, a search engine such as Google will look at the source of one web
page and extract the links to other pages and retrieve those pages, extracting links,
and so on. Using this technique, Google spiders its way through nearly all of the
pages on the web.
Google also uses the frequency of links from pages it finds to a particular page as
one measure of how “important” a page is and how high the page should appear
in its search results.

In [7]:
import urllib
import re
# Ask for a page URL, download the page, and list every absolute http://
# link that appears inside an href="..." attribute.
url = raw_input('Enter - ')
page = urllib.urlopen(url)
html = page.read()
# Non-greedy capture: stop at the first closing quote after the URL.
link_pattern = re.compile('href="(http://.*?)"')
links = [match.group(1) for match in link_pattern.finditer(html)]
for href in links:
    print(href)

Enter - http://www.py4inf.com/book.htm
http://amzn.to/1KkULF3
http://www.py4e.com/book
http://amzn.to/1KkULF3
http://amzn.to/1hLcoBy
http://amzn.to/1KkV42z
http://amzn.to/1fNOnbd
http://amzn.to/1N74xLt
http://do1.dr-chuck.net/py4inf/EN-us/book.pdf
http://do1.dr-chuck.net/py4inf/ES-es/book.pdf
http://do1.dr-chuck.net/py4inf/PT-br/book.pdf
http://www.xwmooc.net/python/
http://fanwscu.gitbooks.io/py4inf-zh-cn/
http://itunes.apple.com/us/book/python-for-informatics/id554638579?mt=13
http://www-personal.umich.edu/~csev/books/py4inf/ibooks//python_for_informatics.ibooks
http://www.py4inf.com/code
http://www.greenteapress.com/thinkpython/thinkCSpy/
http://allendowney.com/


# Parsing HTML using BeautifulSoup

In [11]:
import BeautifulSoup

In [12]:
import urllib
from BeautifulSoup import *

In [15]:
# Prompt for a URL, parse the downloaded page with BeautifulSoup, and print
# the href of each <a> tag (None is printed for anchors without an href).
url = raw_input('Enter - ')
page_source = urllib.urlopen(url).read()
html = page_source
soup = BeautifulSoup(html)
# Collect every anchor tag in the document
tags = soup.findAll('a')
for anchor in tags:
    print(anchor.get('href'))

Enter - http://mail-archives.apache.org/mod_mbox/maven-users/
http://mail-archives.apache.org/mod_mbox/
?format=atom
201802.mbox/thread
201802.mbox/date
201802.mbox/author
201801.mbox/thread
201801.mbox/date
201801.mbox/author
201712.mbox/thread
201712.mbox/date
201712.mbox/author
201711.mbox/thread
201711.mbox/date
201711.mbox/author
201710.mbox/thread
201710.mbox/date
201710.mbox/author
201709.mbox/thread
201709.mbox/date
201709.mbox/author
201708.mbox/thread
201708.mbox/date
201708.mbox/author
201707.mbox/thread
201707.mbox/date
201707.mbox/author
201706.mbox/thread
201706.mbox/date
201706.mbox/author
201705.mbox/thread
201705.mbox/date
201705.mbox/author
201704.mbox/thread
201704.mbox/date
201704.mbox/author
201703.mbox/thread
201703.mbox/date
201703.mbox/author
201702.mbox/thread
201702.mbox/date
201702.mbox/author
201701.mbox/thread
201701.mbox/date
201701.mbox/author
201612.mbox/thread
201612.mbox/date
201612.mbox/author
201611.mbox/thread
201611.mbox/date
201611.mbox/author
201

In [14]:
import urllib
from BeautifulSoup import *
# Prompt for a URL and, for every <a> tag on that page, print the tag itself,
# its href, its first child node and its attribute list.
url = raw_input('Enter - ')
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)
# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # Look at the parts of a tag
    print('TAG: %s' % (tag,))
    print('URL: %s' % (tag.get('href', None),))
    # An empty anchor (<a href="..."></a>) has no children; indexing
    # contents[0] unconditionally would raise IndexError.
    first_child = tag.contents[0] if tag.contents else None
    print('Content: %s' % (first_child,))
    print('Attrs: %s' % (tag.attrs,))

Enter - http://www.py4inf.com/book.htm
TAG: <a href="http://amzn.to/1KkULF3" target="_blank">
<img width="175" height="245" style="padding:5px; border-color: black" src="BookCoverPreviewFront.jpg" border="2" alt="A picture of the book cover art" />
</a>
URL: http://amzn.to/1KkULF3
Content: 

Attrs: [(u'href', u'http://amzn.to/1KkULF3'), (u'target', u'_blank')]
TAG: <a href="http://www.py4e.com/book" target="_blank">http://www.py4e.com/book</a>
URL: http://www.py4e.com/book
Content: http://www.py4e.com/book
Attrs: [(u'href', u'http://www.py4e.com/book'), (u'target', u'_blank')]
TAG: <a href="http://amzn.to/1KkULF3" target="_blank">Amazon</a>
URL: http://amzn.to/1KkULF3
Content: Amazon
Attrs: [(u'href', u'http://amzn.to/1KkULF3'), (u'target', u'_blank')]
TAG: <a href="http://amzn.to/1hLcoBy" target="_blank">Kindle Store</a>
URL: http://amzn.to/1hLcoBy
Content: Kindle Store
Attrs: [(u'href', u'http://amzn.to/1hLcoBy'), (u'target', u'_blank')]
TAG: <a href="http://amzn.to/1KkV42z" target="

In [33]:
import requests
import os.path

# Download a single archived mail and save it as python.txt under save_path.
link = "http://mail-archives.apache.org/mod_mbox/maven-users/201802.mbox/%3C1517492902255-0.post@n5.nabble.com%3E"
f = requests.get(link)
# Fail loudly on a 4xx/5xx reply instead of saving an error page as the mail.
f.raise_for_status()

# NOTE: machine-specific location; point this at a writable directory.
save_path = 'C:/Users/sravyay/WritableFiles'
if not os.path.isdir(save_path):
    # open() below would fail if the target directory did not exist.
    os.makedirs(save_path)
completeName = os.path.join(save_path, "python.txt")

with open(completeName, "wb") as out_file:
    # Write the response body to disk one 1 KiB chunk at a time.
    for chunk in f.iter_content(chunk_size=1024):
        if chunk:
            out_file.write(chunk)

# Web Crawler in Python

Write a simple crawler to crawl and download all mails for year 2014 from http://mail-archives.apache.org/mod_mbox/maven-users/

Note: At any point, the project must have a build system. The build system should create a jar with a defined application entry point. So that anyone can run the project using the command,

java -jar crawler.jar [opts]

assuming crawler.jar was the jar created by the build system.

Iterations
Time: 1 week Deliverable: working crawler which can download all mails
Time: 1 week Deliverable: Robust crawler which can survive internet connection loss and can resume from last run
Time: 1 week Deliverable: Performance and memory optimization for the crawler
Goals
Write good code which follows best practices (naming conventions, formatting, optimal use)
Write unit tests for relevant sections
Write production quality code (options to set the log level at runtime, decent logging which can be used to troubleshoot)

In [54]:
import urllib
from BeautifulSoup import *
import requests
import os
import io
import shutil

# Crawl a mod_mbox archive index and save every mail from 2014 as a text file.
#
# Three passes:
#   1. index page   -> monthly "<yyyymm>.mbox/thread" pages whose URL
#                      contains "2014"
#   2. thread page  -> individual message links (their hrefs contain
#                      percent-escapes such as %3C...%3E)
#   3. message page -> text of the first cell of each <tr class="contents">,
#                      written to "<yyyymm>(<n>).txt" under save_path

#url = "http://mail-archives.apache.org/mod_mbox/maven-users/"
url = raw_input('Enter - ')
if not url.endswith('/'):
    # Relative hrefs are appended directly to url, so it must end in "/".
    url += '/'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)

# Pass 1: monthly thread pages for 2014
tags = soup('a')
useful_urls1 = []
for tag in tags:
    href = tag.get('href', None)
    if href is None:
        # Anchors without an href cannot be followed (and url + None
        # would raise TypeError).
        continue
    new_url = url + href
    if "2014" in new_url and ".mbox/thread" in new_url:
        useful_urls1.append(new_url)

# Pass 2: message links found on each monthly thread page
useful_urls2 = []
for every_url in useful_urls1:
    new_html = urllib.urlopen(every_url).read()
    new_soup = BeautifulSoup(new_html)
    # Drop the trailing "thread" so message hrefs resolve against
    # ".../<yyyymm>.mbox/".
    month_base = every_url[:-len("thread")]
    for new_tag in new_soup('a'):
        some_tag = new_tag.get('href', None)
        if some_tag is not None and "%" in some_tag:
            useful_urls2.append(month_base + some_tag)

# Pass 3: download each message and write its contents to disk
# NOTE: machine-specific location; point this at a writable directory.
save_path = 'C:/Users/sravyay/WritableFiles'
if not os.path.isdir(save_path):
    os.makedirs(save_path)

k = 0  # running counter that keeps file names unique
for link in useful_urls2:
    # The six characters after the base URL are the "<yyyymm>" of the month
    # (derived from len(url) rather than a hard-coded offset).
    month = link[len(url):len(url) + 6]
    s = BeautifulSoup(urllib.urlopen(link).read())
    for small_t in s('tr'):
        if small_t.get('class', None) != "contents":
            continue
        if not small_t.contents:
            # Nothing inside the row to save.
            continue
        content = small_t.contents[0].text
        file_name = month + "(" + str(k) + ")"
        k += 1
        complete_name = os.path.join(save_path, file_name + ".txt")
        # The with-block closes the file; no explicit close() is needed.
        with io.open(complete_name, "wb") as mail_file:
            mail_file.write(content.encode('utf-8').strip())
print("done succesfully...check ur save path for downloaded mails")

Enter - http://mail-archives.apache.org/mod_mbox/maven-users/


KeyboardInterrupt: 