# Starting with Scrapy 

BeautifulSoup and lxml are libraries for parsing HTML and XML. Scrapy is an application framework for writing web spiders that crawl web sites and extract data from them.

In [1]:
import os
import scrapy
import requests
from scrapy.http import TextResponse, FormRequest, Request
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, Join
from grand.grand.items import GrandItem
import urllib3.request, urllib.parse, urllib.error
from collections import deque
from datetime import timedelta, date
from ebooklib import epub
import numpy as np
import string
import time
import random
import calendar
import pickle


### XPath is a language



<h5>How to use commands in console</h5>
<img src="x_console.png" alt="Drawing" style="width: 400px;"/>


* Using //p will select all the p elements and //a for all the links
* To find all the links under div we'll use $x('//div//a')
* You can also select just the text by using the text()
* You can also select just the text by using the text()
* There are tens of XPath functions like not(), contains(), and startswith() that you can find in the online documentation (http://www.w3schools.com/xsl/xpath_functions.asp),

In [4]:
conv = '''accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-encoding: gzip, deflate, br
accept-language: en-US,en;q=0.9
cache-control: max-age=0
cookie: prov=6ee1b1c0-a975-57c1-b806-d6f02acdc078; _ga=GA1.2.1770530859.1579667732; __qca=P0-709205387-1579667731930; __gads=ID=7abeba23853bfab5:T=1579667732:S=ALNI_MZQpLsyZugJqQyXRBUJPOkgNjsDbw; sgt=id=57291405-92ae-4989-a741-f989cf2c6903; _gid=GA1.2.1065622312.1595226703; arp_scroll_position=136; acct=t=JE1j0YL6mlLl7%2bJDAZ0zaTJQ%2bUidPKXE&s=xcKiH08vYJFwXKLlol2HcBhRPB70gUZR
dnt: 1
referer: https://www.google.com/
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: cross-site
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'''

conv = conv.split('\n')

headers = dict()

for i in range(len(conv)):
    headers[conv[i].split(': ')[0]] = conv[i].split(': ')[1]

del headers['accept-encoding']

In [91]:
headers

{'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
 'accept-language': 'en-US,en;q=0.9',
 'cache-control': 'max-age=0',
 'cookie': 'prov=6ee1b1c0-a975-57c1-b806-d6f02acdc078; _ga=GA1.2.1770530859.1579667732; __qca=P0-709205387-1579667731930; __gads=ID=7abeba23853bfab5:T=1579667732:S=ALNI_MZQpLsyZugJqQyXRBUJPOkgNjsDbw; sgt=id=57291405-92ae-4989-a741-f989cf2c6903; _gid=GA1.2.1065622312.1595226703; arp_scroll_position=136; acct=t=JE1j0YL6mlLl7%2bJDAZ0zaTJQ%2bUidPKXE&s=xcKiH08vYJFwXKLlol2HcBhRPB70gUZR',
 'dnt': '1',
 'referer': 'https://www.google.com/',
 'sec-fetch-dest': 'document',
 'sec-fetch-mode': 'navigate',
 'sec-fetch-site': 'cross-site',
 'sec-fetch-user': '?1',
 'upgrade-insecure-requests': '1',
 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

# Note visual tags can't be used with attribute tags
XPath is practically used for locating XML nodes.

General Syntax:

xpath=//tag[@attribute='value']

// : Select current node.

tag: Tagname of the particular node. Also, "*" is for searching any tag in the xml structure

@: Select attribute.

attribute: Attribute name of the node.

value: Value of the attribute.

# Starting with MagicBricks 

In [114]:
res = requests.get('https://www.magicbricks.com/propertyDetails/2-BHK-1030-Sq-ft-Multistorey-Apartment-FOR-Sale-Thanisandra-Main-Road-in-Bangalore&id=4d423231313036303433?sem=Y', headers=headers)

# res = requests.get('https://madhubanmurli.org/#')#, headers = headers)
response = TextResponse(res.url, body=res.text, encoding='utf-8')

Another way of reading webpages

req = urllib.request.Request('https://www.magicbricks.com/propertyDetails/2-BHK-1404-Sq-ft-Multistorey-Apartment-FOR-Sale-Electronic-City-Phase-2-in-Bangalore&id=4d423439383237313532', headers={'User-Agent': 'Chrome/84.0.4147.89'})
response = urllib.request.urlopen(req).read()

In [134]:
response.xpath('//*[@id="coveredAreaDisplay"]//text()').extract()

['1030']

In [181]:


item = GrandItem()
iloader = ItemLoader(item=GrandItem(), response=response)


# self.log("developer: %s" % response.xpath('((//*[@id="thirdFoldDisplay"]//*[@target="_blank"])//text())[1]').extract()[0].strip())
# self.log("area: %s" % response.xpath('//*[@id="coveredAreaDisplay"]//text()').extract()[0] + ' sqft')
# self.log("no_bedroom: %s" % int(''.join(x.strip() for x in response.xpath('//*[@class="seeBedRoomDimen"]//text()').extract())))
# self.log("no_bathroom: %s" % int(response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Bathrooms")]/*[@class="p_value"]/text()').extract()[0]))
# self.log("status: %s" % response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Status")]/*[@class="p_value"]/text()').extract()[0])
# self.log("property_type: %s" % response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Transaction type")]/*[@class="p_value"]/text()').extract()[0])
# self.log("car_parking: %s" % response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Car parking")]/*[@class="p_value"]/text()').extract()[0])
# self.log("furnished_status: %s" % response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Furnished status")]/*[@class="p_value"]/text()').extract()[0])


iloader.add_xpath("developer", '((//*[@id="thirdFoldDisplay"]//*[@target="_blank"])//text())[1]', MapCompose(str.strip))
iloader.add_xpath("area", '//*[@id="coveredAreaDisplay"]//text()', MapCompose(lambda i: i + ' sqft'))
iloader.add_xpath("no_bedroom", '//*[@class="seeBedRoomDimen"]//text()', lambda x: ''.join(i.strip() for i in x))
iloader.add_xpath("no_bathroom", '//div[contains(@class, "p_infoColumn") and contains(.//div, "Bathrooms")]/*[@class="p_value"]/text()', MapCompose(int))
iloader.add_xpath("status", '//div[contains(@class, "p_infoColumn") and contains(.//div, "Status")]/*[@class="p_value"]/text()')
iloader.add_xpath("property_type", '//div[contains(@class, "p_infoColumn") and contains(.//div, "Transaction type")]/*[@class="p_value"]/text()')
iloader.add_xpath("car_parking", '//div[contains(@class, "p_infoColumn") and contains(.//div, "Car parking")]/*[@class="p_value"]/text()')
iloader.add_xpath("furnished_status", '//div[contains(@class, "p_infoColumn") and contains(.//div, "Furnished status")]/*[@class="p_value"]/text()')


In [182]:
iloader.load_item()

{'area': ['1030 sqft'],
 'car_parking': ['None'],
 'developer': ['Sai Kalyan Builders & Developers Pvt. Ltd.'],
 'furnished_status': ['Unfurnished'],
 'no_bathroom': [2],
 'no_bedroom': ['2'],
 'property_type': ['New Property'],
 'status': ['Ready to Move']}

#### response.xpath('//*[@id="thirdFoldDisplay"]/div[1]/div[2]/span/a//text()')

# That's how you nest elements for selecting nth instance of mth entity and combine regular expression
' '.join(response.xpath('(//*[@id="thirdFoldDisplay"]//*[@target="_blank"])//text()').re('[\.a-zA-Z]+'))

In [285]:
# //div[div/@class='item_promo']/div[@class='item_price']
# div[contains(@class, 'measure-tab') and contains(.//span, 'someText')]

# response.xpath('//*[@class="p_infoColumn"]/*[@class="p_value"]').extract()


In [303]:
# Name of Developer
developer = response.xpath('((//*[@id="thirdFoldDisplay"]//*[@target="_blank"])//text())[1]').extract()[0].strip()

# Size of Area
area = response.xpath('//*[@id="coveredAreaDisplay"]//text()').extract()[0] + ' sqft'

# Total number of bedrooms
no_bedroom = int(''.join(x.strip() for x in response.xpath('//*[@class="seeBedRoomDimen"]//text()').extract()))
# or use this XPath ''.join(x.strip() for x in response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Bedrooms")]//*[@class="seeBedRoomDimen"]/text()').extract())

# Total number of bathrooms
no_bathroom = int(response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Bathrooms")]/*[@class="p_value"]/text()').extract()[0])

# Status of property
status = response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Status")]/*[@class="p_value"]/text()').extract()[0]

# Type of Property
property_type = response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Transaction type")]/*[@class="p_value"]/text()').extract()[0]

# If Car Parking Present
car_parking = response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Car parking")]/*[@class="p_value"]/text()').extract()[0]

# If apartment is unfurnished
furnished_status = response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Furnished status")]/*[@class="p_value"]/text()').extract()[0]

You make a new project by typing "scrappy startproject properties"

In [305]:
res = requests.get('https://www.magicbricks.com/propertyDetails/2-BHK-1404-Sq-ft-Multistorey-Apartment-FOR-Sale-Electronic-City-Phase-2-in-Bangalore&id=4d423439383237313532', headers=headers)

# res = requests.get('https://madhubanmurli.org/#')#, headers = headers)
response = TextResponse(res.url, body=res.text, encoding='utf-8')

In [306]:
# response.xpath('//*[@id="thirdFoldDisplay"]/div[1]/div[2]/span/a//text()')

# That's how you nest elements for selecting nth instance of mth entity and combine regular expression
' '.join(response.xpath('(//*[@id="thirdFoldDisplay"]//*[@target="_blank"])//text()').re('[\.a-zA-Z]+'))

''

In [285]:
# //div[div/@class='item_promo']/div[@class='item_price']
# div[contains(@class, 'measure-tab') and contains(.//span, 'someText')]

# response.xpath('//*[@class="p_infoColumn"]/*[@class="p_value"]').extract()


In [315]:
# Name of Developer
# developer = response.xpath('((//*[@id="thirdFoldDisplay"]//*[@target="_blank"])//text())[1]').extract()[0].strip()

# Size of Area
area = response.xpath('//*[@id="coveredAreaDisplay"]//text()').extract()[0] + ' sqft'

# Total number of bedrooms
# no_bedroom = int(''.join(x.strip() for x in response.xpath('//*[@class="seeBedRoomDimen"]//text()').extract()))
# or use this XPath ''.join(x.strip() for x in response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Bedrooms")]//*[@class="seeBedRoomDimen"]/text()').extract())

# Total number of bathrooms
no_bathroom = int(response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Bathrooms")]/*[@class="p_value"]/text()').extract()[0])

# # Status of property
# status = response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Status")]/*[@class="p_value"]/text()').extract()[0]

# # Type of Property
# property_type = response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Transaction type")]/*[@class="p_value"]/text()').extract()[0]

# If Car Parking Present
car_parking = response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Car parking")]/*[@class="p_value"]/text()').extract()[0]

# If apartment is unfurnished
furnished_status = response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Furnished status")]/*[@class="p_value"]/text()').extract()[0]

In [317]:
furnished_status

'Semi-Furnished'

# Starting on IndiaMART

In [419]:
res = requests.get('https://dir.indiamart.com/search.mp?ss=disposable+face+mask&src=as-popular%3Akwd%3Dfacemask%3Apos%3D2%3Acat%3D-2%3Amcat%3D-2&cq=patna', headers=headers)

# res = requests.get('https://madhubanmurli.org/#')#, headers = headers)
response = TextResponse(res.url, body=res.text, encoding='utf-8')

In [442]:
response.xpath('//*[@class="lg elps"]//@href').extract()

['https://www.indiamart.com/proddetail/disposable-masks-20784584255.html',
 'https://www.indiamart.com/proddetail/three-layer-surgical-mask-22276389012.html',
 'https://www.indiamart.com/proddetail/disposable-face-mask-22259998088.html',
 'https://www.indiamart.com/proddetail/disposable-face-mask-22268474355.html',
 'https://www.indiamart.com/proddetail/disposable-face-mask-22289554988.html',
 'https://www.indiamart.com/proddetail/disposable-face-mask-22176765730.html',
 'https://www.indiamart.com/meghdootschooldresses/surgical-gloves-masks.html#reusable-hospital-face-mask-number-of-layers-1',
 'https://www.indiamart.com/proddetail/3-ply-disposable-face-mask-22396328288.html',
 'https://www.indiamart.com/proddetail/n95-face-mask-22300335348.html',
 'https://www.indiamart.com/proddetail/n95-face-mask-22318462655.html',
 'https://www.indiamart.com/proddetail/kn95-face-mask-22298106062.html',
 'https://www.indiamart.com/flytenfashionbrands/surgical-masks-3ply.html#3-ply-surgical-masks',
 

In [441]:
response.xpath('//*[@class="lg elps"]//text()').extract()

["Non-Woven Disposable Masks, For Hospitals, Model Name/Number: Premium's",
 'Ear Loop Mount Three Layer Surgical Mask',
 'Disposable Face Mask, Number of Layers: 3',
 'Ear Loop Mount PP Non Woven Disposable Face Mask',
 'Ear loop Disposable Face Mask',
 'General Purpose Ear Loop Disposable Face Mask',
 'Reusable hospital face mask, Number of Layers: 1',
 '3 Ply Disposable Face Mask',
 'Ear loop Disposable N95 FACE MASK',
 'Reusable N95 Face Mask',
 'Indian Kn95 Face Mask, Number of Layers: H',
 '3 Ply Surgical Masks',
 'Ear Loop Mount PP Non Woven 3 Ply Surgical Mask',
 'Disposable Surgical 3 Ply Face Mask',
 'Non Woven Disposable Face Mask, For Hospital, Clinic, 2']

# Starting with Amazon

proxies = {
    "http": 'http://109.50.52.142:80', 
    "https": 'http://109.50.52.142:80'
}


In [33]:
headers

{'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
 'accept-language': 'en-US,en;q=0.9',
 'cache-control': 'max-age=0',
 'cookie': 'prov=6ee1b1c0-a975-57c1-b806-d6f02acdc078; _ga=GA1.2.1770530859.1579667732; __qca=P0-709205387-1579667731930; __gads=ID=7abeba23853bfab5:T=1579667732:S=ALNI_MZQpLsyZugJqQyXRBUJPOkgNjsDbw; sgt=id=57291405-92ae-4989-a741-f989cf2c6903; _gid=GA1.2.1065622312.1595226703; arp_scroll_position=136; acct=t=JE1j0YL6mlLl7%2bJDAZ0zaTJQ%2bUidPKXE&s=xcKiH08vYJFwXKLlol2HcBhRPB70gUZR',
 'dnt': '1',
 'referer': 'https://www.google.com/',
 'sec-fetch-dest': 'document',
 'sec-fetch-mode': 'navigate',
 'sec-fetch-site': 'cross-site',
 'sec-fetch-user': '?1',
 'upgrade-insecure-requests': '1',
 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'}

In [34]:
res = requests.get('https://www.amazon.in/s?k=bicycle', headers=headers)#, proxies=proxies)

# res = requests.get('https://madhubanmurli.org/#')#, headers = headers)
response = TextResponse(res.url, body=res.text, encoding='utf-8')

In [37]:
next_selector = response.xpath('//*[@class="a-normal"]//@href')
for url in next_selector.extract():
    time.sleep(random.uniform(0, 2))
    print(scrapy.http.Request(urllib.parse.urljoin(response.url, url), headers = headers))

<GET https://www.amazon.in/s?k=bicycle&page=2&qid=1595676336&ref=sr_pg_2>
<GET https://www.amazon.in/s?k=bicycle&page=3&qid=1595676336&ref=sr_pg_3>


In [30]:
res = requests.get('https://www.amazon.in/Hero-Kyoto-Single-Speed-Cycle/dp/B073QW5F63/ref=sr_1_1?dchild=1&keywords=bicycle&qid=1595670387&sr=8-1', headers=headers)#, proxies=proxies)

# res = requests.get('https://madhubanmurli.org/#')#, headers = headers)
response = TextResponse(res.url, body=res.text, encoding='utf-8')

In [32]:
title = response.xpath('//span[@id="productTitle"]/text()').extract()[0].strip()

no_ratings = response.xpath('//span[@id="acrCustomerReviewText"]//text()').extract()[0]

rating = response.xpath('//span[@data-hook="rating-out-of-text"]//text()').extract()[0]

price = response.xpath('//span[@id="priceblock_dealprice"]//text()').extract()[0].strip().replace(u'\xa0', u' ')

print("Title: ", title)

Title:  Hero  Kyoto 26T Single Speed  Mountain Bike (Black, Ideal For : 12+ Years )


In [25]:


item = GrandItem()
iloader = ItemLoader(item=GrandItem(), response=response)


# self.log("developer: %s" % response.xpath('((//*[@id="thirdFoldDisplay"]//*[@target="_blank"])//text())[1]').extract()[0].strip())
# self.log("area: %s" % response.xpath('//*[@id="coveredAreaDisplay"]//text()').extract()[0] + ' sqft')
# self.log("no_bedroom: %s" % int(''.join(x.strip() for x in response.xpath('//*[@class="seeBedRoomDimen"]//text()').extract())))
# self.log("no_bathroom: %s" % int(response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Bathrooms")]/*[@class="p_value"]/text()').extract()[0]))
# self.log("status: %s" % response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Status")]/*[@class="p_value"]/text()').extract()[0])
# self.log("property_type: %s" % response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Transaction type")]/*[@class="p_value"]/text()').extract()[0])
# self.log("car_parking: %s" % response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Car parking")]/*[@class="p_value"]/text()').extract()[0])
# self.log("furnished_status: %s" % response.xpath('//div[contains(@class, "p_infoColumn") and contains(.//div, "Furnished status")]/*[@class="p_value"]/text()').extract()[0])


iloader.add_xpath("title", '//span[@id="productTitle"]/text()', MapCompose(str.strip))
iloader.add_xpath("no_ratings", '//span[@id="acrCustomerReviewText"]//text()')
iloader.add_xpath("rating", '//span[@data-hook="rating-out-of-text"]//text()')
iloader.add_xpath("price", '//span[@id="priceblock_dealprice"]//text()', lambda x: x[0].strip().replace(u'\xa0', u' '))

In [27]:
iloader.load_item()

{'no_ratings': ['925 ratings'],
 'price': ['₹ 4,799.00'],
 'rating': ['3.5 out of 5'],
 'title': ['Hero  Kyoto 26T Single Speed  Mountain Bike (Black, Ideal For : '
           '12+ Years )']}

In [101]:
# Read URL from command line
# url = 'https://www.amazon.in/s?k=bicycle&ref=nb_sb_noss'
url = 'https://www.amazon.com/s?k=radio&ref=nb_sb_noss'

# Create queue
queue = deque([])

# Maintains list of visited pages
visited_list = []
websites = []

start = True
count = 2

# Crawl the page and populate the queue with newly found URLs
def crawl(url):    
    global visited_list
    
    time.sleep(random.uniform(0, 4))
    
#     print("About to append: ",type(url), url[-13], url.split('&'), url.split(';')[-1])
    if url.split('&')[-1] in visited_list:
        return
    else:
        visited_list.append(url.split('&')[-1])
        websites.append(url)
        
    if len(queue) > 99:
        return
    
    print("Currently scraping: ", url)
#     print("Current queue is: ", queue)
    res = requests.get(url, headers=headers)
    response = TextResponse(res.url, body=res.text, encoding='utf-8')

    new = response.xpath('//*[@class="a-normal"]//@href').extract()
    urls = [urllib.parse.urljoin(response.url, new[i]) for i in range(len(new))]

    for i in urls:
        flag = 0
        # Complete relative URLs and strip trailing slash
        complete_url = i.rstrip('/')

        # Check if the URL already exists in the queue
        for j in queue:
            if j == complete_url.split(';')[-1]:
                flag = 1
                break

        # If not found in queue
#         print(visited_list)
        if flag == 0:
            if len(queue) > 99:
                return
            if (visited_list.count(complete_url.split('&')[-1])) == 0:
                queue.append(complete_url)

    # Pop one URL from the queue from the left side so that it can be crawled
    current = queue.popleft()

    while True:
        try:
            if current.split('&')[-1] in visited_list:
                current = queue.popleft()
            else:
                break
        except:
            print("Queue is Empty!")
            return
        # Recursive call to crawl until the queue is populated with 100 URLs
    crawl(current)

crawl(url)


# Print queue
# print("\n\nThis is queue!")
# for i in queue:
#     print(i)

# Print list of visited pages
# print("\n\nThese are lists of visited pages")
# for i in visited_list:
#     print(i)

Currently scraping:  https://www.amazon.com/s?k=radio&ref=nb_sb_noss
Currently scraping:  https://www.amazon.com/s?k=radio&page=2&qid=1595587073&ref=sr_pg_2
Currently scraping:  https://www.amazon.com/s?k=radio&page=3&qid=1595587073&ref=sr_pg_3
Currently scraping:  https://www.amazon.com/s?k=radio&qid=1595587076&ref=sr_pg_1
Currently scraping:  https://www.amazon.com/s?k=radio&page=4&qid=1595587080&ref=sr_pg_4
Currently scraping:  https://www.amazon.com/s?k=radio&page=5&qid=1595587087&ref=sr_pg_5
Currently scraping:  https://www.amazon.com/s?k=radio&page=6&qid=1595587091&ref=sr_pg_6
Currently scraping:  https://www.amazon.com/s?k=radio&page=7&qid=1595587095&ref=sr_pg_7
Currently scraping:  https://www.amazon.com/s?k=radio&page=8&qid=1595587097&ref=sr_pg_8
Currently scraping:  https://www.amazon.com/s?k=radio&page=9&qid=1595587100&ref=sr_pg_9
Currently scraping:  https://www.amazon.com/s?k=radio&page=10&qid=1595587102&ref=sr_pg_10
Currently scraping:  https://www.amazon.com/s?k=radio&pa

# Coming back to GUMTREE

In [87]:
from scrapy.http import Request

In [106]:
url = 'https://www.gumtree.com/property-to-rent/uk/studio/page7'

res = requests.get(url, headers=headers)
response = TextResponse(res.url, body=res.text, encoding='utf-8')

In [108]:
response.xpath('//a')

[]

In [None]:
# Read URL from command line
# url = 'https://www.amazon.in/s?k=bicycle&ref=nb_sb_noss'
url = 'https://www.gumtree.com/property-to-rent/uk/studio'
websites = []

# Create queue
queue = deque([])

# Maintains list of visited pages
visited_list = []

# Crawl the page and populate the queue with newly found URLs
def crawl(url):    
    global visited_list, websites
    
    time.sleep(random.uniform(0, 5))
    
#     print("About to append: ",type(url), url[-13], url.split('&'), url.split(';')[-1])
    if url.split('&')[-1] in visited_list:
        return
    else:
        visited_list.append(url.split('&')[-1])
        websites.append(url)
        
    if len(queue) > 99:
        return
    
    print("Currently scraping: ", url)
#     print("Current queue is: ", queue)
    res = requests.get(url, headers=headers)
    response = TextResponse(res.url, body=res.text, encoding='utf-8')

    new = response.xpath('//*[@data-analytics="gaEvent:PaginationPage"]/@href').extract()
    urls = [urllib.parse.urljoin(response.url, new[i]) for i in range(len(new))]

    for i in urls:
        flag = 0
        # Complete relative URLs and strip trailing slash
        complete_url = i.rstrip('/')

        # Check if the URL already exists in the queue
        for j in queue:
            if j == complete_url.split(';')[-1]:
                flag = 1
                break

        # If not found in queue
#         print(visited_list)
        if flag == 0:
            if len(queue) > 99:
                return
            if (visited_list.count(complete_url.split('&')[-1])) == 0:
                queue.append(complete_url)

    # Pop one URL from the queue from the left side so that it can be crawled
#     try:
#         current = queue.popleft()
#     except:
#         pass
    current = queue.popleft()

    while True:
        try:
            if current.split('&')[-1] in visited_list:
                current = queue.popleft()
            else:
                break
        except:
            print("Queue is Empty!")
            return
        # Recursive call to crawl until the queue is populated with 100 URLs
    crawl(current)

crawl(url)


# Print queue
# print("\n\nThis is queue!")
# for i in queue:
#     print(i)

# Print list of visited pages
print("\n\nThese are lists of visited pages")
for i in visited_list:
    print(i)

In [99]:
visited_list

['ref=nb_sb_noss',
 'ref=sr_pg_2',
 'ref=sr_pg_3',
 'ref=sr_pg_1',
 'ref=sr_pg_4',
 'ref=sr_pg_5',
 'ref=sr_pg_6',
 'ref=sr_pg_7',
 'ref=sr_pg_8',
 'ref=sr_pg_9',
 'ref=sr_pg_10',
 'ref=sr_pg_11',
 'ref=sr_pg_12',
 'ref=sr_pg_13',
 'ref=sr_pg_14',
 'ref=sr_pg_15',
 'ref=sr_pg_16',
 'ref=sr_pg_17',
 'ref=sr_pg_18',
 'ref=sr_pg_19',
 'ref=sr_pg_20']

### Starting with HackerEarth (couldn't do, blocked by ROBOTS.txt)

form = FormRequest(url="https://www.hackerearth.com/AJAX/login/", formdata={"login":"shivam13juna@gmail.com", "password":""})

# Starting with Murli 

In [70]:
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

In [141]:
"Murli of " + calendar.month_abbr[int(names[i].split('-')[1])] + ' '+ names[i].split('-')[2] + ', ' + names[i].split('-')[0]

'Murli of Jul 24, 2020'

In [149]:
book = epub.EpubBook()

# set metadata
book.set_identifier('Murli')
book.set_title('Book Of Murlis')
book.set_language('en')

# book.add_author('Brahma Kumaris')
book.add_author('Brahma Kumaris', file_as='Love and Spread Love', role='ill')

# Adding Description
book.add_metadata('DC', 'description', 'Adding all the Murlis from Dec 1, 2016 to July 25, 2020 ')


#Iterating over Dates
start_date = date(2016, 12, 3)
# start_date = date(2020, 7, 20)
end_date = date(2020, 7, 25)
names = [i.strftime("%Y-%m-%d") for i in list(daterange(start_date, end_date))]

chapters = [0 for _ in range(len(names))]
for i in range(len(names)):
    res = requests.get('https://madhubanmurli.org/murlis/en/html/murli-'+ names[i] +'.html', headers=headers)
    response = TextResponse(res.url, body=res.text, encoding='utf-8')
    print(res.url)
    
    # create chapter
    chapters[i] = epub.EpubHtml(title="Murli of " + names[i], file_name=names[i]+'.xhtml', lang='en')
    chapters[i].content=response.body

    # add chapter
    book.add_item(chapters[i])

linkage = list(
                epub.Link(
                        names[i]+'.xhtml',
                         "Murli of " + calendar.month_abbr[int(names[i].split('-')[1])] + ' '+ names[i].split('-')[2] + ", "+ names[i].split('-')[0],
                         "Murli of " + calendar.month_abbr[int(names[i].split('-')[1])] + ' '+ names[i].split('-')[2] + ", "+ names[i].split('-')[0]
                         )
               for i in range(len(names))
              )

# define Table Of Contents
book.toc = (linkage)
            


# add default NCX and Nav file
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())

# define CSS style
style = 'BODY {color: white;}'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

# add CSS file
book.add_item(nav_css)

# basic spine
chapters.insert(0, 'nav')
book.spine = chapters

# write to the file
epub.write_epub('test.epub', book, {})

https://madhubanmurli.org/murlis/en/html/murli-2016-12-03.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-04.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-05.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-06.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-07.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-08.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-09.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-10.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-11.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-12.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-13.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-14.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-15.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-16.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-17.html
https://madhubanmurli.org/murlis/en/html/murli-2016-12-

In [150]:
para = "In spite of their critical nature, employers complain about the lack of writing and presentation skills amongst their employees. New hires need to present their ideas well to be effective in a business environment. In order to be totally and completely satisfied with their employees employer’s should possibly hire employees who have some of these skills in their repertoire. These people may be very intelligent but without these skills they are not hireable so my course will prepare these students to have these skills."

In [151]:
len(para.split(' '))

84

# Starting With Membian (Making Pdf and CSV)

In [311]:
conv = '''authority: membean.com
method: GET
path: /trainers/shivam_prasad_manya
scheme: https
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-encoding: gzip, deflate, br
accept-language: en-US,en;q=0.9
cache-control: max-age=0
cookie: __cfduid=d38b9844740a8912b06c6dd46142ab0a61595766190; _ga=GA1.2.785234034.1595766192; _gid=GA1.2.1128878110.1595766192; __utma=77087369.785234034.1595766192.1595766193.1595766193.1; __utmc=77087369; __utmz=77087369.1595766193.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); wcsid=QbPut5yEoRR2zvP68B29g0JabbEotFXa; hblid=4yM6hdMKYqu7Ke9T8B29g0JGMoX4FEtb; _oklv=1595766193055%2CQbPut5yEoRR2zvP68B29g0JabbEotFXa; _okdetect=%7B%22token%22%3A%2215957661933060%22%2C%22proto%22%3A%22https%3A%22%2C%22host%22%3A%22membean.com%22%7D; olfsk=olfsk9343500095506629; _okbk=cd4%3Dtrue%2Cwa1%3Dfalse%2Cvi5%3D0%2Cvi4%3D1595766193598%2Cvi3%3Dactive%2Cvi2%3Dfalse%2Cvi1%3Dfalse%2Ccd8%3Dchat%2Ccd6%3D0%2Ccd5%3Daway%2Ccd3%3Dfalse%2Ccd2%3D0%2Ccd1%3D0%2C; _ok=3780-427-10-2738; remember_user_token=W1sxMDg3MzczXSwiZ3doUlotSmRsUnRlX01oYTZFZUpVZyIsIjE1OTU3NjYyMTQuMDcyNDg4NSJd--2a408d1d5821892d693c2da2aa237a1a975c3f0f; auth_token=gwhRZ-JdlRte_Mha6EeJUg; _new_membean_session_id=YjRoOVNCVUV0YjcvSUhmSVd6VTJKUGpQcGRXcys0eDBDNHBXMnMrT2VBM3dQKytITTZ0WVZMaEtndVVGMkh0elpQVDkzN2ZVZmhqNDdZdTFuNUN3YWp5cXlsdmVjSkxmanBXR1g5TllJZmI4YkpZVThIR0ZjbTZIeGhMVHNjeEZoVEkxbEIxQ25EYW1SQllYL0ttMjhsUThXWjdUNlMrNUl4dXJic1ZWeWEycHgwcGRtSHB0czV6VWo1WEZLMUJia25VQnZGSkxsZWRndktIY0t0T2Nhdm9WK2ZZcm0rVkNVUE9tN0RMRHByUHVsY3BlWTdBbk5xLzFlL08yZHJ4Yy0tc0FWQktiOXpCUXI5Nys3TG5kaGV1Zz09--b897bbfedd06224e6e69151cf43431d6033b2560; arp_scroll_position=5
dnt: 1
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'''

conv = conv.split('\n')

headers = dict()

for i in range(len(conv)):
    headers[conv[i].split(': ')[0]] = conv[i].split(': ')[1]

del headers['accept-encoding']

In [312]:
res = requests.get('https://membean.com/trainers/shivam_prasad_manya', headers=headers)#, proxies=proxies)

# res = requests.get('https://madhubanmurli.org/#')#, headers = headers)
response = TextResponse(res.url, body=res.text, encoding='utf-8')

In [314]:
new = response.xpath('//div[@id="poor" or @id="fair" or @id="good" or @id="strong"]//div[@class="content"]//a/@href').extract()

In [316]:
new

['/mywords/bagatelle',
 '/mywords/blandishment',
 '/mywords/bumptious',
 '/mywords/chimerical',
 '/mywords/crepuscular',
 '/mywords/desiderata',
 '/mywords/desultory',
 '/mywords/dialectic',
 '/mywords/diffident',
 '/mywords/encomium',
 '/mywords/enervate',
 '/mywords/expatiate',
 '/mywords/farrago',
 '/mywords/feckless',
 '/mywords/fervid',
 '/mywords/fulminate',
 '/mywords/gambol',
 '/mywords/hermetic',
 '/mywords/histrionic',
 '/mywords/ignominy',
 '/mywords/inchoate',
 '/mywords/intransigent',
 '/mywords/inveigle',
 '/mywords/limpid',
 '/mywords/littoral',
 '/mywords/miasma',
 '/mywords/opprobrium',
 '/mywords/paean',
 '/mywords/pertinacious',
 '/mywords/plangent',
 '/mywords/pluck',
 '/mywords/prolix',
 '/mywords/prosaic',
 '/mywords/proselytize',
 '/mywords/saturnine',
 '/mywords/solecism',
 '/mywords/stentorian',
 '/mywords/stygian',
 '/mywords/toothsome',
 '/mywords/turpitude',
 '/mywords/zeitgeist',
 '/mywords/ablation',
 '/mywords/adjure',
 '/mywords/adumbrate',
 '/mywords/ap

In [292]:
links = [urllib.parse.urljoin(response.url, new[i]) for i in range(len(new))]


In [304]:
links = sorted(links, key = lambda x: x.split('/')[-1].lower())

In [268]:
res = requests.get('https://membean.com/mywords/malfeasance', headers=headers)#, proxies=proxies)

# res = requests.get('https://madhubanmurli.org/#')#, headers = headers)
response = TextResponse(res.url, body=res.text, encoding='utf-8')

In [276]:
context_para = ' '.join(i.strip() for i in response.xpath('//p[@id="context-paragraph"]//text()').extract())
definition = ' '.join(i.strip() for i in response.xpath('//li[@class="choice answer "]//text()').extract()).strip()
mem_hook = ' '.join(i.strip() for i in response.xpath('//div[@class="hook"]//text()').extract()).strip()
synonyms = ', '.join(i.strip().strip('·').strip() for i in response.xpath('//ul[@class="related-syns"]//text()').extract()).replace(' ,', '').rstrip(' ').rstrip(',')
antonyms = ', '.join(i.strip().strip('·').strip() for i in response.xpath('//ul[@class="related-ants"]//text()').extract()).replace(' ,', '').rstrip(' ').rstrip(',')
word_ingred_def = ''.join(response.xpath('//*[@id="word-structure"]/div//p/descendant::text()').extract())
video_url = "https://cdn0.membean.com/video/examplevids/"+response.url.split('/')[-1]+'.mp4'
word_theater_title = ''.join(i.strip() for i in response.xpath('//*[@id="word-theater"]//div[@class="caption"]//descendant::text()').extract())
word = response.url.split('/')[-1]
# print("\nContext Para: ", context_para)
# print("\nDefinition:", definition)
# print("\nmem_hook: ", mem_hook)
# print("\nSynonym: ", synonyms)
# print("\nAntonym: ", antonyms)
# print("\nWord Ingredient: ", word_ingred_def)

In [306]:
import csv  
    
# field names  
fields = ['Word', 'Brief Definition', 'Long Definition', 'Memory Hook', 'Synonyms', 'Antonyms', 'Video Url', 'Video Title', 'Context Paragraph']  
    
# data rows of csv file  
    
# name of csv file  
filename = "membean_words.csv"
    
# writing to csv file  
with open(filename, 'w+') as csvfile:  
    # creating a csv writer object  
    csvwriter = csv.writer(csvfile)  
        
    # writing the fields  
    csvwriter.writerow(fields)  
    
    for link in links:

        # writing the data rows  
        res = requests.get(link, headers=headers)#, proxies=proxies)
        response = TextResponse(res.url, body=res.text, encoding='utf-8')

        context_para = ' '.join(i.strip() for i in response.xpath('//p[@id="context-paragraph"]//text()').extract())
        definition = ' '.join(i.strip() for i in response.xpath('//li[@class="choice answer "]//text()').extract()).strip()
        mem_hook = ' '.join(i.strip() for i in response.xpath('//div[@class="hook"]//text()').extract()).strip()
        synonyms = ', '.join(i.strip().strip('·').strip() for i in response.xpath('//ul[@class="related-syns"]//text()').extract()).replace(' ,', '').rstrip(' ').rstrip(',')
        antonyms = ', '.join(i.strip().strip('·').strip() for i in response.xpath('//ul[@class="related-ants"]//text()').extract()).replace(' ,', '').rstrip(' ').rstrip(',')
        word_ingred_def = ''.join(response.xpath('//*[@id="word-structure"]/div//p/descendant::text()').extract())
        video_url = "https://cdn0.membean.com/video/examplevids/"+response.url.split('/')[-1]+'.mp4'
        word_theater_title = ''.join(i.strip() for i in response.xpath('//*[@id="word-theater"]//div[@class="caption"]//descendant::text()').extract())
        word = response.url.split('/')[-1].capitalize()

        rows = [word, definition, word_ingred_def, mem_hook, synonyms, antonyms, video_url, word_theater_title, context_para]
        print("Currently scraping: ", word)

        csvwriter.writerows([rows]) 

    

Currently scraping:  Abandon
Currently scraping:  Abase
Currently scraping:  Abash
Currently scraping:  Abate
Currently scraping:  Abdicate
Currently scraping:  Aberrant
Currently scraping:  Abet
Currently scraping:  Abeyance
Currently scraping:  Abhor
Currently scraping:  Abject
Currently scraping:  Abjure
Currently scraping:  Ablation
Currently scraping:  Ablution
Currently scraping:  Abnegation
Currently scraping:  Abolish
Currently scraping:  Abominate
Currently scraping:  Abortive
Currently scraping:  Abrasive
Currently scraping:  Abridge
Currently scraping:  Abrogate
Currently scraping:  Abscond
Currently scraping:  Absolve
Currently scraping:  Abstemious
Currently scraping:  Abstinence
Currently scraping:  Abstruse
Currently scraping:  Abysmal
Currently scraping:  Accede
Currently scraping:  Accentuate
Currently scraping:  Acclimate
Currently scraping:  Accost
Currently scraping:  Accoutrement
Currently scraping:  Accretion
Currently scraping:  Acerbic
Currently scraping:  Acme


In [308]:
words = [i.split('/')[-1] for i in links]

In [310]:
book = epub.EpubBook()

# set metadata
book.set_identifier('Membean')
book.set_title('Book Of Words')
book.set_language('en')

# book.add_author('Brahma Kumaris')
book.add_author('Shivam Prasad', file_as='Love and Spread Love', role='ill')

# Adding Description
book.add_metadata('DC', 'description', 'Adding all the words from Membean')



chapters = [0 for _ in range(len(words))]
for i in range(len(links)):
    res = requests.get(links[i], headers=headers)
    response = TextResponse(res.url, body=res.text, encoding='utf-8')
    print(res.url)
    
    # create chapter
    chapters[i] = epub.EpubHtml(title=words[i], file_name=words[i]+'.xhtml', lang='en')
    chapters[i].content=response.body

    # add chapter
    book.add_item(chapters[i])

# linkage = list(
#                 epub.Link(
#                         words[i]+'.xhtml',
#                          "Murli of " + calendar.month_abbr[int(names[i].split('-')[1])] + ' '+ names[i].split('-')[2] + ", "+ names[i].split('-')[0],
#                          "Murli of " + calendar.month_abbr[int(names[i].split('-')[1])] + ' '+ names[i].split('-')[2] + ", "+ names[i].split('-')[0]
#                          )
#                for i in range(len(names))
#               )

# # define Table Of Contents
# book.toc = (linkage)
            


# add default NCX and Nav file
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())

# define CSS style
style = 'BODY {color: white;}'
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)

# add CSS file
book.add_item(nav_css)

# basic spine
chapters.insert(0, 'nav')
book.spine = chapters

# write to the file
epub.write_epub('words.epub', book, {})

https://membean.com/mywords/abandon
https://membean.com/mywords/abase
https://membean.com/mywords/abash
https://membean.com/mywords/abate
https://membean.com/mywords/abdicate
https://membean.com/mywords/aberrant
https://membean.com/mywords/abet
https://membean.com/mywords/abeyance
https://membean.com/mywords/abhor
https://membean.com/mywords/abject
https://membean.com/mywords/abjure
https://membean.com/mywords/ablation
https://membean.com/mywords/ablution
https://membean.com/mywords/abnegation
https://membean.com/mywords/abolish
https://membean.com/mywords/abominate
https://membean.com/mywords/abortive
https://membean.com/mywords/abrasive
https://membean.com/mywords/abridge
https://membean.com/mywords/abrogate
https://membean.com/mywords/abscond
https://membean.com/mywords/absolve
https://membean.com/mywords/abstemious
https://membean.com/mywords/abstinence
https://membean.com/mywords/abstruse
https://membean.com/mywords/abysmal
https://membean.com/mywords/accede
https://membean.com/my

# OZEE SCRAPER

In [2]:
conv = '''authority: gwapi.zee5.com
method: GET
path: /content/tvshow/?season_id=0-2-BhabijiGharParHain1&type=episode&translation=en&country=IN&on_air=true&asset_subtype=tvshow&page=6&limit=25
scheme: https
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-encoding: gzip, deflate, br
accept-language: en-US,en;q=0.9
cache-control: no-cache
cookie: _gcl_au=1.1.981860243.1601232750; _qg_fts=1601232752; QGUserId=7863690288753388; _qg_cm=1; _gid=GA1.2.839926967.1601392273; hcsmlc=google-organic; _ga_LB5CP1CVNF=GS1.1.1601395857.3.1.1601397496.57; _ga=GA1.2.443061410.1601232751; _uetsid=da74bf22dfc08a2f4a9101a500dbe4ea; _uetvid=9ad524e1b53b4c00b335543a3aa06eff; arp_scroll_position=8
dnt: 1
pragma: no-cache
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: none
sec-fetch-user: ?1
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'''


conv = '''authority: faststrip.apester.com
path: /?token=5ec3a02e3532852f0bdfafbb&itemShape=roundSquare&itemSize=large&itemTextColor=black&headerFontFamily=NotoSansSemiBold,sans-serif&headerText=Trending%2520Stories&headerFontSize=22&headerFontWeight=600&headerFontColor=%23ffffff&headerLtr=true&headerProvider=google&thumbnailsStrokeColor=none&stripBackground=transparent&paddingTop=0&paddingBottom=0&bottomBorderWidth=0&bottomBorderColor=black&topBorderWidth=0&topBorderColor=black&instanceId=e04fd368-a361-465a-9f9d-27d27ead80b0&sessionId=3060200a-bba7-42fe-8deb-a9ff045d04d7&canonicalUrl=https%3A%2F%2Fwww.zee5.com%2Ftvshows%2Fdetails%2Fbhabi-ji-ghar-par-hai%2F0-6-199%2Fbhabi-ji-ghar-par-hai-september-30-2020%2F0-1-manual_780ej7kahb20&horizontalHeaderPadding=8
scheme: https
accept: */*
accept-encoding: gzip, deflate, br
accept-language: en-US,en;q=0.9
cache-control: no-cache
cookie: userSession=c43f7573-2b64-4775-add0-7a7fefb49342
dnt: 1
pragma: no-cache
referer: https://www.zee5.com/
sec-fetch-dest: empty
sec-fetch-mode: no-cors
sec-fetch-site: same-origin
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'''

conv = conv.split('\n')

headers = dict()

for i in range(len(conv)):
    headers[conv[i].split(': ')[0]] = conv[i].split(': ')[1]

# del headers['accept-encoding']

In [207]:
res = requests.get('https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/episodes', headers=headers)#, proxies=proxies)
# https://gwapi.zee5.com/content/tvshow/?season_id=0-2-BhabijiGharParHain1&type=episode&translation=en&country=IN&on_air=true&asset_subtype=tvshow&page=12&limit=25
# res = requests.get('https://madhubanmurli.org/#')#, headers = headers)
response = TextResponse(res.url, body=res.text, encoding='utf-8')

In [61]:
response_sin.xpath('//div[@class="clickWrapper"]//div[@class="showDuration"]/span[1]/text()')

[<Selector xpath='//div[@class="clickWrapper"]//div[@class="showDuration"]/span[1]/text()' data='E1388'>,
 <Selector xpath='//div[@class="clickWrapper"]//div[@class="showDuration"]/span[1]/text()' data='E1387'>,
 <Selector xpath='//div[@class="clickWrapper"]//div[@class="showDuration"]/span[1]/text()' data='E1386'>,
 <Selector xpath='//div[@class="clickWrapper"]//div[@class="showDuration"]/span[1]/text()' data='E1385'>,
 <Selector xpath='//div[@class="clickWrapper"]//div[@class="showDuration"]/span[1]/text()' data='E1384'>]

In [60]:
response_sin.xpath('//div[@class="clickWrapper"]//h3[@class="showCardTitle"]//a/@href').extract()

['/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-05-2020/0-1-manual_5jqrfherrvf0',
 '/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-02-2020/0-1-manual_7hkhi9ruq010',
 '/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-01-2020/0-1-manual_2b7dv2o891k0',
 '/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-september-30-2020/0-1-manual_780ej7kahb20',
 '/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-september-29-2020/0-1-manual_3456ec1898lg']

In [7]:
from selenium import webdriver


import time

driver=webdriver.Chrome('./chromedriver')
driver.get("https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/episodes")
count = 0
flag = True
prev = 0
while flag:
    time.sleep(0.1)
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    if prev == 0:
        prev = driver.execute_script("return document.body.scrollHeight;")
    else:
        curr = driver.execute_script("return document.body.scrollHeight;")
        if count == 5: # This is for checking if for 5th count, height hasn't increased
            print("Count is 5, setting flag to False; \nEXITING")
            flag = False
        if curr == prev: # If height hasn't increased, increase count and sleep a little more
            print("Curr: ", curr, " is equal to prev: ", prev, " increasing count by 1, current count is: ", count)
            count += 1
            time.sleep(0.1)
        else: # If height increased reset count
            count = 0
            prev = curr
response_main = TextResponse('https://www.zee5.com/', body=driver.page_source, encoding='utf-8')
driver.close()

            

            
# driver.find_element_by_xpath('//*[@id="root"]//h3[@class="showCardTitle"]//@href').click()
# time.sleep(3)
# driver.close()


Curr:  3897  is equal to prev:  3897  increasing count by 1, current count is:  0
Curr:  5267  is equal to prev:  5267  increasing count by 1, current count is:  0
Curr:  6638  is equal to prev:  6638  increasing count by 1, current count is:  0
Curr:  8237  is equal to prev:  8237  increasing count by 1, current count is:  0
Curr:  9607  is equal to prev:  9607  increasing count by 1, current count is:  0
Curr:  10977  is equal to prev:  10977  increasing count by 1, current count is:  0
Curr:  12348  is equal to prev:  12348  increasing count by 1, current count is:  0
Curr:  13946  is equal to prev:  13946  increasing count by 1, current count is:  0
Curr:  15317  is equal to prev:  15317  increasing count by 1, current count is:  0
Curr:  19656  is equal to prev:  19656  increasing count by 1, current count is:  0
Curr:  21026  is equal to prev:  21026  increasing count by 1, current count is:  0
Curr:  22397  is equal to prev:  22397  increasing count by 1, current count is:  0
Cu

In [77]:
from selenium import webdriver
import time
import csv

with open('bhabhi_ji_ghar.csv', 'w', newline='\n') as file:
            writer = csv.writer(file)
            writer.writerow(['Episode No','Episode Link', 'Description']) 

driver=webdriver.Chrome('./chromedriver')
driver.get("https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/episodes")
time.sleep(3600)
no = []

# iterate through all the details links  (used the index rather elements list as it may lead to staleeleemnt exception after clicking on the first detiails link)
for link in link_list:
    print("Opening ", link)
    driver.execute_script("window.open('" + link + "');")
    driver.switch_to.window(driver.window_handles[1])
    time.sleep(5)
    response_sin = TextResponse('https://www.zee5.com', body=driver.page_source, encoding='utf-8')
    try: # Tryna get Episode No
        episode_num = response_sin.xpath('//div[@class="metaInfo"]//p[1]/text()').extract()[0]
    except:
        print("Couldn't get episode number")
        no.append(link)
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        continue
#         break
    try: # Tryna get description
        desc = response_sin.xpath('//*[@id="root"]//div[@class="consumptionMetaDiv"]/h1/text()').extract()
    except:
        print("Couldn't get description")
        no.append(link)
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        continue
#         break
    with open('bhabhi_ji_ghar.csv', 'a', newline='\n') as file:
        writer = csv.writer(file)
        writer.writerow([episode_num, link, desc]) 
        driver.close()
    # switching to parent window (on safer side)
    driver.switch_to.window(driver.window_handles[0])
driver.close()

Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-05-2020/0-1-manual_5jqrfherrvf0
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-02-2020/0-1-manual_7hkhi9ruq010
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-01-2020/0-1-manual_2b7dv2o891k0
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-september-30-2020/0-1-manual_780ej7kahb20
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-september-29-2020/0-1-manual_3456ec1898lg
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/teeka-and-friends-incur-a-loss-bhabi-ji-ghar-par-hai/0-1-manual_3u0p7shch1i0
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-september-25-2020/0-1-manual_4octdqofm5u0
Opening  http

In [52]:
from selenium import webdriver
import time
import csv

driver=webdriver.Chrome('./chromedriver')
driver.get("https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/episodes")
time.sleep(15)
no = []

# iterate through all the details links  (used the index rather elements list as it may lead to staleeleemnt exception after clicking on the first detiails link)
for link in link_list:
    print("Opening ", link)
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    driver.get(link)
    response_sin = TextResponse('https://www.zee5.com', body=driver.page_source, encoding='utf-8')
    try: # Tryna get Episode No
        episode_num = response_sin.xpath('//div[@class="metaInfo"]//p[1]/text()').extract()[0]
    except:
        print("Couldn't get episode number")
        no.append(link)
        continue
#         driver.close()
#         break
    try: # Tryna get description
        desc = response_sin.xpath('//*[@id="root"]//div[@class="consumptionMetaDiv"]/h1/text()').extract()
    except:
        print("Couldn't get description")
        no.append(link)
        continue
#         driver.close()
#         break
    with open('bhabhi_ji_ghar.csv', 'a', newline='\n') as file:
        writer = csv.writer(file)
        writer.writerow([episode_num, 'https://www.zee5.com' + i, desc]) 
        driver.close()
    # switching to parent window (on safer side)
    driver.switch_to.window(driver.window_handles[0])

Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-05-2020/0-1-manual_5jqrfherrvf0
Couldn't get episode number
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-02-2020/0-1-manual_7hkhi9ruq010
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-01-2020/0-1-manual_2b7dv2o891k0
Couldn't get episode number
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-september-30-2020/0-1-manual_780ej7kahb20
Couldn't get episode number
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-september-29-2020/0-1-manual_3456ec1898lg
Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/teeka-and-friends-incur-a-loss-bhabi-ji-ghar-par-hai/0-1-manual_3u0p7shch1i0
Couldn't get episode number
Opening  https://www.zee5.com/tvshows/det

KeyboardInterrupt: 

In [70]:
response_sin.xpath('//*[@id="root"]//div[@class="consumptionMetaDiv"]/h1/text()').extract()

[]

In [35]:
from selenium import webdriver
import time
import csv

driver=webdriver.Chrome('./chromedriver')
driver.get("https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/episodes")
time.sleep(5)

# iterate through all the details links  (used the index rather elements list as it may lead to staleeleemnt exception after clicking on the first detiails link)
for link in link_list:
#     # get the details element based on index
#     ele = driver.find_element_by_xpath("(//a[@class='details'][contains(.,'Details')])[" + str (addrNum+1) + "]")
#     # get the href of the link
#     href = ele.get_attribute('href')
    # open the href in another tab
    print("Opening ", link)
    driver.execute_script("window.open('');")
    
    # Switch to the new window and open URL B
    driver.switch_to.window(driver.window_handles[1])
    driver.get(link)
    response_sin = TextResponse('https://www.zee5.com', body=driver.page_source, encoding='utf-8')
    try: # Tryna get Episode No
        episode_num = response_sin.xpath('//div[@class="metaInfo"]//p[1]/text()').extract()[0]
    except:
        print("Couldn't get episode number")
        driver.close()
        break
    try: # Tryna get description
        desc = response_sin.xpath('//*[@id="root"]//div[@class="consumptionMetaDiv"]/h1/text()').extract()
    except:
        print("Couldn't get description")
        driver.close()
        break
    with open('bhabhi_ji_ghar.csv', 'a', newline='\n') as file:
        writer = csv.writer(file)
        writer.writerow([episode_num, 'https://www.zee5.com' + i, desc]) 
        driver.close()
    # switching to parent window (on safer side)
    driver.switch_to.window(driver.window_handles[0])

Opening  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-05-2020/0-1-manual_5jqrfherrvf0
Couldn't get episode number


In [15]:
# with open('bhab_main', 'wb') as handle:
#     pickle.dump(response_main, handle)

# with open('bhab_main', 'rb') as handle:
#     response_main_check =  pickle.loads(handle)
link_list = ['https://www.zee5.com' + i for i in response_main.xpath('//div[@class="clickWrapper"]//h3[@class="showCardTitle"]//a/@href').extract()]
    

In [17]:
link_list[0]

'https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-05-2020/0-1-manual_5jqrfherrvf0'

InvalidSessionIdException: Message: invalid session id


In [None]:
driver.add_cookie('userSession=c43f7573-2b64-4775-add0-7a7fefb49342')

In [None]:
userSession=c43f7573-2b64-4775-add0-7a7fefb49342

In [66]:
from selenium import webdriver
import time
import csv

with open('bhabhi_ji_ghar.csv', 'w', newline='\n') as file:
            writer = csv.writer(file)
            writer.writerow(['Episode No','Episode Link', 'Description']) 

print("Getting ", i)
driver=webdriver.Chrome('./chromedriver')

driver.get('https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-september-21-2020/0-1-manual_2e0ks994n8dg')
response_sin = TextResponse('https://www.zee5.com', body=driver.page_source, encoding='utf-8')
try: # Tryna get Episode No
    episode_num = response_sin.xpath('//div[@class="metaInfo"]//p[1]/text()').extract()[0]
except:
    print("Couldn't get episode number")
    driver.close()
try: # Tryna get description
    desc = response_sin.xpath('//*[@id="root"]//div[@class="consumptionMetaDiv"]/h1/text()').extract()
except:
    print("Couldn't get description")
    driver.close()
with open('bhabhi_ji_ghar.csv', 'a', newline='\n') as file:
    writer = csv.writer(file)
    writer.writerow([episode_num, 'https://www.zee5.com' + i, desc]) 
    driver.close()

            

            
# driver.find_element_by_xpath('//*[@id="root"]//h3[@class="showCardTitle"]//@href').click()
# time.sleep(3)
# driver.close()


Getting  https://www.zee5.com/tvshows/details/bhabi-ji-ghar-par-hai/0-6-199/bhabi-ji-ghar-par-hai-october-05-2020/0-1-manual_5jqrfherrvf0


In [65]:
episode_num

'Episode 1378'