# Using Python Packages
---

- Install requests and beautifulsoup4

- Using Virtualenv

```shell

$ pip install requests beautifulsoup4

```

- Using Anaconda/Miniconda

```shell

$ conda install requests beautifulsoup4

```

In [1]:
# import library
import requests

In [2]:
# now fetch google page with it and store to resp object
resp = requests.get('https://www.google.com')

# the same call can also be written via the requests.api module
resp = requests.api.get('https://www.google.com')

In [3]:
# show which methods/attributes the response object has
dir(resp)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [4]:
# show status code
resp.status_code

200

In [5]:
resp.headers

{'X-XSS-Protection': '1; mode=block', 'Expires': '-1', 'Set-Cookie': 'NID=76=geX2zzvBuCVn4_YqLj5j5SKnL357v3c8nkeVIflx9wb86B8w-UdPAhZ3o2oxQiwYIIIxuyqm4wqkT5kLCRjQ9S72t5a--P_KEPRr2GxxT2nfRevx3RIS9U_dCK1iSDEd; expires=Thu, 11-Aug-2016 06:58:06 GMT; path=/; domain=.google.com.np; HttpOnly', 'Date': 'Wed, 10 Feb 2016 06:58:06 GMT', 'Server': 'gws', 'Content-Encoding': 'gzip', 'P3P': 'CP="This is not a P3P policy! See https://www.google.com/support/accounts/answer/151657?hl=en for more info."', 'Cache-Control': 'private, max-age=0', 'X-Frame-Options': 'SAMEORIGIN', 'Transfer-Encoding': 'chunked', 'Content-Type': 'text/html; charset=UTF-8'}

In [6]:
resp.text

'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ne"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script>(function(){window.google={kEI:\'ft-6VvOiBImNuATF2ICoBg\',kEXPI:\'8370,1350255,3700288,3700385,4026241,4028790,4028875,4029815,4031109,4032677,4033307,4036509,4036527,4038012,4039268,4040135,4042785,4042793,4043492,4044606,4045821,4045841,4046304,4047780,4048909,4049549,4049557,4050886,4050891,4050908,4051241,4051559,4051596,4051714,4052304,4054284,4055276,4055553,4056038,4056589,4057170,4057586,4057836,4058004,4058033,4058117,4058337,4058382,4059274,4059316,4059327,4059635,8300273,8300287,8300310,8300318,8502315,8502347,8502451,8502690,8502986,8503039,8503109,8503132,8503212,8503303,8503306,10200083,10201630\',authuser:0,kscs:\'c9c918f0_24\'};google.kHL=\'ne\';})();(function(){google.lc=[];google.li=0;google.get

In [7]:
# import can be done this way
from bs4 import BeautifulSoup

In [8]:
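# now let's parse the fetched HTML (the 'lxml' parser needs the lxml package
# installed; Python's built-in 'html.parser' can be used instead)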
soup = BeautifulSoup(resp.text, 'lxml')

In [9]:
# find first a tag
soup.find('a')

<a class="gb1" href="https://www.google.com.np/imghp?hl=ne&amp;tab=wi">तस्बिर</a>

In [10]:
# find all span
soup.find_all('span')

[<span class="gbi" id="gbn"></span>,
 <span class="gbf" id="gbf"></span>,
 <span id="gbe"></span>,
 <span class="ds"><span class="lsbb"><input class="lsb" name="btnG" type="submit" value="Google खोजी"/></span></span>,
 <span class="lsbb"><input class="lsb" name="btnG" type="submit" value="Google खोजी"/></span>,
 <span class="ds"><span class="lsbb"><input class="lsb" name="btnI" onclick="if(this.form.q.value)this.checked=1; else top.location='/doodles/'" type="submit" value="म भाग्यमानी अनुभूति गरिरहेछु"/></span></span>,
 <span class="lsbb"><input class="lsb" name="btnI" onclick="if(this.form.q.value)this.checked=1; else top.location='/doodles/'" type="submit" value="म भाग्यमानी अनुभूति गरिरहेछु"/></span>,
 <span id="footer"><div style="font-size:10pt"><div id="fll" style="margin:19px auto;text-align:center"><a href="/intl/ne/about.html">Googleको बारेमा सम्पूर्ण</a><a href="https://www.google.com.np/setprefdomain?prefdom=US&amp;sig=__px6FLBMrtt2uxPIBz-EEpaLs7_E%3D" id="fehl">Google.com<

In [15]:
for each in resp.history:
    print(each.url)

https://www.google.com/


**H/W: crawl some site using requests and beautifulsoup**

## Example
---

**The code below uses the mechanize package to crawl through songs.pk; use beautifulsoup and requests instead.**

```python
#!/usr/bin/env python

import os
import shutil
import subprocess
import mechanize
from collections import defaultdict


# Resolve the folder that contains this script: __file__ is the path of the
# file currently running, dirname() drops the file name and abspath() turns
# the result into an absolute path.
basepath = os.path.abspath(os.path.dirname(__file__))
songsdir = os.path.join(basepath, 'Songs')
tempdir = os.path.join(basepath, 'temp')

# Create the two working directories if they are missing (Songs, then temp).
for directory in (songsdir, tempdir):
    if not os.path.exists(directory):
        os.mkdir(directory)


url = 'http://www.songspk.pk'

# mechanize is a browser implementation library: it does not require any
# external browser in order to work, because it acts as the browser itself.
browser = mechanize.Browser()
browser.open(url)

# Follow a link whose text matches "Bollywood Songs" (optional whitespace).
# NOTE(review): nr=1 presumably selects the second matching link (0-based);
# confirm against the mechanize documentation.
browser.follow_link(text_regex=r'Bollywood\s*Songs', nr=1)


# Read the movie titles, one per line, ignoring blank lines.  The ``with``
# block guarantees the file is closed as soon as the titles are read instead
# of leaving the handle open for the rest of the run.
with open('pkmovies.txt', 'r') as movies_file:
    movies_list = [line.strip() for line in movies_file if line.strip()]

# Group the titles by their first character: titles starting with a digit go
# under 'numeric', titles starting with a letter go under that letter in
# lowercase.
indexed_movies = defaultdict(list)
for movie in movies_list:
    char = movie[0]
    if char.isdigit():
        indexed_movies['numeric'].append(movie)
    elif char.isalpha():
        indexed_movies[char.lower()].append(movie)
    # Titles starting with any other character are deliberately left out.


def open_movie_page(title):
    """Open a movie's page, download all of its songs, then navigate back.

    title -- link text of the movie on the currently open page; it is also
             used as the name of the movie's folder inside ``songsdir``.
    """
    import re  # local import: only needed to escape the title below

    print('Loading page for ... {}'.format(title))
    # text_regex is interpreted as a regular expression, so the title is
    # escaped to make characters such as '(', '?' or '+' match literally
    # instead of being treated as regex syntax.
    browser.follow_link(text_regex=re.escape(title))
    # Song download links are picked out by 'songid=' appearing in their URL.
    links = browser.links(url_regex=r'songid=')
    moviepath = os.path.join(songsdir, title)
    if not os.path.exists(moviepath):
        os.mkdir(moviepath)
    download_all(moviepath, links)
    # Go back to the previous page so the next title can be followed from it.
    browser.back()


def download_all(movie_path, links):
    """Download every song in ``links`` into ``movie_path`` using wget.

    Each song is fetched into ``tempdir`` first and moved into ``movie_path``
    only when wget exits successfully, so a failed download is not mistaken
    for a finished one (and skipped as "File exists") on a later run.  Songs
    already present in ``movie_path`` are skipped.

    movie_path -- directory that receives the finished ``.mp3`` files
    links      -- iterable of link objects exposing ``text`` and ``url``
    """
    for link in links:
        songname = link.text + '.mp3'
        print('Downloading... {}'.format(songname))
        if os.path.exists(os.path.join(movie_path, songname)):
            print('File exists ... skipping')
            continue
        # Run wget inside tempdir through ``cwd`` rather than os.chdir(), so
        # the working directory of this script itself is left untouched.
        status = subprocess.call(["wget", "-O", songname, link.url],
                                 cwd=tempdir)
        temp_file = os.path.join(tempdir, songname)
        if status != 0:
            # A failed wget run may leave an empty or partial output file
            # behind; remove it instead of moving it into the movie folder.
            print('Download failed ... skipping')
            if os.path.exists(temp_file):
                os.remove(temp_file)
            continue
        shutil.move(temp_file, movie_path)


def open_list_page(index):
    """Follow the first link whose URL matches ``<index>_list.html``.

    index -- prefix of the list page's file name, e.g. a lowercase letter or
             ``'numeric'``.
    """
    links = browser.links(url_regex='{}_list.html'.format(index))
    # Use the built-in next(): it works on both Python 2.6+ and Python 3,
    # whereas the iterator's own .next() method exists only on Python 2.
    browser.follow_link(next(links))


# For every index: open its list page, process each movie filed under that
# index, then step the browser back before moving on to the next index.
for list_key, titles in indexed_movies.items():
    open_list_page(list_key)
    for movie_title in titles:
        open_movie_page(movie_title)
    browser.back()
```

**Note: to use the above code, run `pip install mechanize`. It also requires `wget`, so it may not work on Windows.**

**It is very old code, so it may not work at all.**

**The above code reads a file that looks something like this:**

- pkmovies.txt

```

7 Khoon Maaf - 2011
3 Idiots
1942 - A Love Story
```

In [None]:
class SomeClass:
    """Minimal example class; its instances carry no attributes."""

    def __init__(self):
        """Create the instance without setting any state."""