## Importing the Drivers and Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import codecs
import re
import time
from webdriver_manager.chrome import ChromeDriverManager
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import uuid
from datetime import datetime, timezone

## Setting up the Firestore client

In [2]:
# Service-account key for the Firebase project; the relative path is resolved
# against the notebook's working directory.
cred = credentials.Certificate('dataharvest_cred.json')     # Not included in GitHub for Safety Reasons

# initialize_app() raises ValueError when the default app already exists, which
# happens whenever this cell is executed again without restarting the kernel.
# Reuse the existing app in that case so the cell can be re-run safely.
try:
    app = firebase_admin.get_app()
except ValueError:
    app = firebase_admin.initialize_app(cred)

# Firestore client used by every database call further down.
db = firestore.client()

### For CSV file generation

In [3]:
import csv

### Starting a Selenium Web Session

In [4]:
# Let webdriver_manager resolve the ChromeDriver executable, wrap it in a
# Service, and start the Chrome session used for all page loads below.
chrome_driver_path = ChromeDriverManager().install()
chrome_service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=chrome_service)

In [5]:
# Show the driver's repr as a quick check that a browser session exists.
print(driver)

<selenium.webdriver.chrome.webdriver.WebDriver (session="7d09f25d87f29673056b393804b9e455")>


### Adding the CSV File path

In [6]:
# Destination of the CSV export written later in the notebook (relative path).
csv_file_path = 'blogs_data.csv'
print(csv_file_path)

blogs_data.csv


### Function to Generate the UID for Database 

In [7]:
def generate_document_id():
    """Return a freshly generated random UUID (version 4) as a string."""
    new_id = uuid.uuid4()
    return str(new_id)

### Definition of Blog class to store the content of a blog

In [8]:
class Blog:
    """Container for the fields scraped from a single blog card.

    Attributes:
        title: Headline text of the blog.
        date: Publication date. The scraper stores a ``datetime`` here; the
            constructor default is an empty string.
        image_url: URL of the card image, or an empty value when none was found.
        likes: Like counter. The scraper stores an ``int`` here; the
            constructor default is an empty string.
    """

    def __init__(self,title="",date="",image_url="",likes=""):
        self.title = title
        self.date = date
        self.image_url = image_url
        self.likes = likes

    @staticmethod
    def _as_text(value):
        """Render a field for printing; ``None`` becomes an empty string."""
        return "" if value is None else str(value)

    def print_blog(self):
        """Print the blog's fields, one labelled line per field."""
        # Only datetime-like values offer strftime. The constructor default ("")
        # and None do not, so those are printed as plain text instead of
        # raising AttributeError.
        if hasattr(self.date, "strftime"):
            date_text = self.date.strftime("%m/%d/%Y, %H:%M:%S")
        else:
            date_text = self._as_text(self.date)

        # Going through _as_text keeps the output identical for string fields
        # while avoiding a TypeError when a field (e.g. image_url) is None.
        print("Blog Title: " + self._as_text(self.title))
        print("Blog Date: " + date_text)
        print("Blog Image URL: " + self._as_text(self.image_url))
        print("Blog Likes Count: "+ self._as_text(self.likes))

## Loading the website

In [9]:
# Landing page of the blog; its pagination links are read in the following cells.
val = "https://rategain.com/blog/"
wait = WebDriverWait(driver, 10)
driver.get(val)

# Block until the browser has settled on the expected URL. wait.until() raises
# TimeoutException if that does not happen within the timeout, so the lines
# below only run once the right page is loaded.
wait.until(EC.url_to_be(val))

# Read the URL and HTML only after the wait has succeeded. Assigning
# page_source unconditionally guarantees the next cell never sees an
# undefined name.
get_url = driver.current_url
page_source = driver.page_source


## Using BeautifulSoup to scrape

In [10]:
# Parse the HTML captured from the landing page with the built-in html.parser backend.
soup = BeautifulSoup(page_source,features="html.parser")

In [11]:
# All elements carrying the "page-numbers" class (the pagination entries).
page_numbers = soup.find_all(class_="page-numbers")

### Find the last page number for navigation

In [12]:
# Second-to-last pagination entry.
# NOTE(review): presumably the final entry is the "next" link, which would make
# this one the highest page number; confirm against the page markup.
last_a_tag = page_numbers[-2]

In [13]:
# The page number is the path segment just before the trailing slash of the href
# (assumes links shaped like ".../page/45/"; verify if the site's URLs change).
max_page_number = int(last_a_tag['href'].split('/')[-2])

In [14]:
# Display the detected number of listing pages.
print(max_page_number)

45


### Function to scrape a page and append each blog's title, date, image URL and likes count to a list

In [15]:
def scrapeThePage(blogs,blog_pagewise):
    """Extract the fields of every blog card and append them as ``Blog`` objects.

    Args:
        blogs: Iterable of parsed blog-card elements (BeautifulSoup tags).
        blog_pagewise: List that receives one ``Blog`` per card; mutated in place.

    Returns:
        None. Results are delivered through ``blog_pagewise``.

    Raises:
        AttributeError: If a card lacks the title, date or likes markup.
        ValueError: If the date text does not match ``"%B %d, %Y"`` or the
            likes text does not start with an integer.
        IndexError: If the likes text is empty.
    """
    for blog in blogs:
        blog_title = blog.find('h6').text.strip()

        date_text = blog.find('div', class_='bd-item').span.text.strip()
        blog_date = datetime.strptime(date_text, "%B %d, %Y")

        # The image is optional. A missing <div class="img"> or <a> shows up as
        # AttributeError on None; catch only that instead of a bare except,
        # which would also swallow KeyboardInterrupt and real bugs.
        try:
            blog_image_url = blog.find('div', class_='img').a.get('data-bg')
        except AttributeError:
            blog_image_url = None
        # .get() returns None when the attribute itself is absent; normalise
        # both "no image" cases to the same empty string.
        if blog_image_url is None:
            blog_image_url = ""

        # Only the leading number of the likes text is kept.
        likes_text = blog.find('a', class_='zilla-likes').span.text.strip()
        likes_count = int(likes_text.split()[0])

        blog_pagewise.append(Blog(blog_title,blog_date,blog_image_url,likes_count))
        

### Function to push the blog to Firestore


In [16]:
def add_or_update_blog_to_firestore(blog):
    """Insert ``blog`` into the ``data`` collection or refresh documents sharing its title.

    A document is looked up by its ``blog_title`` field. When none matches, a
    new document with a generated ID is created; otherwise every matching
    document gets its ``date``, ``image_url`` and ``likes`` fields updated.

    Args:
        blog: Object exposing ``title``, ``date``, ``image_url`` and ``likes``.

    Returns:
        None. The outcome is reported via ``print``.
    """
    collection = db.collection('data')

    # Passing the filter positionally to where() emits a UserWarning on current
    # google-cloud-firestore releases. Use the ``filter=`` keyword when the
    # installed client provides FieldFilter, and fall back to the positional
    # form on older clients that do not.
    try:
        from google.cloud.firestore_v1.base_query import FieldFilter
    except ImportError:
        title_query = collection.where('blog_title', '==', blog.title)
    else:
        title_query = collection.where(
            filter=FieldFilter('blog_title', '==', blog.title)
        )
    existing_docs = title_query.get()

    if not existing_docs:
        # Document does not exist, add a new document
        document_id = generate_document_id()
        collection.document(document_id).set({
            'blog_title': blog.title,
            'date': blog.date,
            'image_url': blog.image_url,
            'likes': blog.likes
        })
        print(f"New blog added to Firestore with document ID: {document_id}")
        return

    # Document exists, update every document carrying this title
    for doc in existing_docs:
        collection.document(doc.id).update({
            'date': blog.date,
            'image_url': blog.image_url,
            'likes': blog.likes
        })
        print(f"Existing blog updated in Firestore with document ID: {doc.id}")

## Main code to traverse the website and update the data

In [17]:
# Accumulates one Blog object per scraped card across all listing pages.
blog_pagewise=[]
# Disabled earlier draft: parsed the landing page once before entering the loop.
# val = "https://rategain.com/blog/"
# wait = WebDriverWait(driver, 15)
# driver.get(val)
# get_url = driver.current_url
# soup = BeautifulSoup(page_source,features="html.parser")
# page_source = driver.page_source
# page_blogs = soup.find_all('article', class_=re.compile(r'^blog-'))
# print("Parsing "+ str(get_url))
# scrapeThePage(page_blogs,blog_pagewise)
# print(len(blog_pagewise))
# print(get_url+" parsed")


# NOTE(review): `soup` is built from `page_source` BEFORE `page_source` is
# refreshed from the driver on the following line, so each iteration parses the
# HTML fetched by the previous one (the first iteration parses the landing page
# loaded in the "Loading the website" cell). Consequences visible in this file:
#   * the range runs one page past max_page_number so the last page still gets parsed;
#   * the first listing page is parsed twice, and the next cell's
#     `blog_pagewise[9:]` slice removes those duplicates;
#   * the "Parsing <url>" messages name the page just requested, not the one parsed.
# Swapping the two statements must be done together with removing that slice
# and the `+2`, otherwise real posts would be dropped.
# NOTE(review): `wait` is re-created on every iteration but never used in this loop.
for page in range(1,max_page_number+2):
    val = "https://rategain.com/blog/page/"+str(page)+"/"
    wait = WebDriverWait(driver, 15)
    driver.get(val)
    get_url = driver.current_url
    soup = BeautifulSoup(page_source,features="html.parser")
    page_source = driver.page_source
    # Blog cards are <article> elements whose class starts with "blog-".
    page_blogs = soup.find_all('article', class_=re.compile(r'^blog-'))
    print("Parsing "+ str(get_url))
    scrapeThePage(page_blogs,blog_pagewise)
    print(get_url+" parsed")

Parsing https://rategain.com/blog/
https://rategain.com/blog/ parsed
Parsing https://rategain.com/blog/page/2/
https://rategain.com/blog/page/2/ parsed
Parsing https://rategain.com/blog/page/3/
https://rategain.com/blog/page/3/ parsed
Parsing https://rategain.com/blog/page/4/
https://rategain.com/blog/page/4/ parsed
Parsing https://rategain.com/blog/page/5/
https://rategain.com/blog/page/5/ parsed
Parsing https://rategain.com/blog/page/6/
https://rategain.com/blog/page/6/ parsed
Parsing https://rategain.com/blog/page/7/
https://rategain.com/blog/page/7/ parsed
Parsing https://rategain.com/blog/page/8/
https://rategain.com/blog/page/8/ parsed
Parsing https://rategain.com/blog/page/9/
https://rategain.com/blog/page/9/ parsed
Parsing https://rategain.com/blog/page/10/
https://rategain.com/blog/page/10/ parsed
Parsing https://rategain.com/blog/page/11/
https://rategain.com/blog/page/11/ parsed
Parsing https://rategain.com/blog/page/12/
https://rategain.com/blog/page/12/ parsed
Parsing http

### Printing the data extracted from the website

In [18]:
# The scrape loop can deliver the first listing page's posts twice. Remove exact
# duplicates (same title, date, image URL and likes) while keeping first-seen
# order, instead of slicing off a fixed count: a hard-coded slice drops further
# posts every time this cell is re-run and silently depends on the number of
# posts per page.
seen_blogs = set()
unique_blogs = []
for item in blog_pagewise:
    blog_key = (item.title, item.date, item.image_url, item.likes)
    if blog_key not in seen_blogs:
        seen_blogs.add(blog_key)
        unique_blogs.append(item)
blog_pagewise = unique_blogs

# Print every blog with a 1-based running number.
for i, item in enumerate(blog_pagewise, start=1):
    print(str(i)+".")
    item.print_blog()
    print()

1.
Blog Title: Taylor Swift's Spectacular Tour Sparks Unprecedented Hotel Booking Surge in Buenos Aires
Blog Date: 11/23/2023, 00:00:00
Blog Image URL: https://rategaincom.wpenginepowered.com/wp-content/uploads/2023/11/swiftie-effect-buenos-aires-scaled.jpg
Blog Likes Count: 5

2.
Blog Title: A Complete Guide to Hotel Revenue Management
Blog Date: 11/22/2023, 00:00:00
Blog Image URL: https://rategaincom.wpenginepowered.com/wp-content/uploads/2023/11/A-Complete-Guide-to-Hotel-Revenue-Management.png
Blog Likes Count: 2

3.
Blog Title: The Ultimate Guide to Choosing the Right Hotel Booking Engine
Blog Date: 11/22/2023, 00:00:00
Blog Image URL: https://rategaincom.wpenginepowered.com/wp-content/uploads/2023/11/The-Ultimate-Guide-to-Choosing-the-Right-Hotel-Booking-Engine.png
Blog Likes Count: 3

4.
Blog Title: Maximize Your Hotel’s Exposure with Google AdSense’s New "Related Search" Feature in Auto Ads
Blog Date: 10/20/2023, 00:00:00
Blog Image URL: https://rategaincom.wpenginepowered.com/

### Adding the data to the CSV file at location ./blogs_data.csv

In [19]:
# Column titles for the export, in the same order as the values written per blog.
header_row = ['Title', 'Date', 'Image URL', 'Likes']

# newline='' leaves line-ending handling to the csv module; the file is
# overwritten on every run.
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(header_row)

    # One row per Blog object, emitted in list order.
    csv_writer.writerows(
        [blog.title, blog.date, blog.image_url, blog.likes]
        for blog in blog_pagewise
    )

print(f"CSV file '{csv_file_path}' has been created successfully.")

CSV file 'blogs_data.csv' has been created successfully.


### Add each Blog object to Firestore

In [20]:
# Upsert every scraped blog; each call issues its own title lookup followed by
# a set or update against Firestore.
for blog in blog_pagewise:
    add_or_update_blog_to_firestore(blog)

  return query.where(field_path, op_string, value)


Existing blog updated in Firestore with document ID: 9a8c84d9-37da-4d66-9626-babc99c14749
Existing blog updated in Firestore with document ID: fca59a0c-e484-473a-ad03-feed67013f17
Existing blog updated in Firestore with document ID: 6cfcec2f-ea19-4e04-bab4-4e78fc505948
Existing blog updated in Firestore with document ID: f4dd4ca4-d4fa-4873-9e26-12e96b5c2225
Existing blog updated in Firestore with document ID: 72e6dad1-58b6-47a6-9053-655725d38f08
Existing blog updated in Firestore with document ID: 30b0777d-6cf3-4461-87eb-e818b4d00c80
Existing blog updated in Firestore with document ID: d3c614c0-ced1-4d8a-84bb-647a25876fa6
Existing blog updated in Firestore with document ID: d923567a-55fc-4907-a03e-86eda682f2a4
Existing blog updated in Firestore with document ID: c84644f2-96cb-4f9d-a244-82e2e049b281
Existing blog updated in Firestore with document ID: da0d6476-f1ae-4b43-9881-51a0f4d70d58
Existing blog updated in Firestore with document ID: dc8fd6de-f3ae-47e1-8cc1-381e5a1e850b
Existing b

Existing blog updated in Firestore with document ID: 63369ea8-8bde-45c9-8351-6f16c71894af
Existing blog updated in Firestore with document ID: 01c2cd90-74e7-424f-8ef9-3fbd2b734d67
Existing blog updated in Firestore with document ID: 54c432d9-55dd-46aa-be63-73a9c55d2d49
Existing blog updated in Firestore with document ID: 6064edee-1f12-4682-8f56-ef332d87fbad
Existing blog updated in Firestore with document ID: 24d3df10-f253-4af4-9591-48af951e754d
Existing blog updated in Firestore with document ID: 7932e37a-cb70-4bf7-9a05-122417b9b48e
Existing blog updated in Firestore with document ID: fe8076c6-18b8-474f-9078-422533dc1614
Existing blog updated in Firestore with document ID: 6bff2c7e-bb4c-4e6d-82e9-15514d17c118
Existing blog updated in Firestore with document ID: d481e86f-7d2b-40c3-985a-c22596ddd6e5
Existing blog updated in Firestore with document ID: bb0b522e-915d-4954-8134-d1ad8cbeae10
Existing blog updated in Firestore with document ID: 59227763-b2d5-431e-829c-fa9c3a77666a
Existing b

Existing blog updated in Firestore with document ID: 34e2640d-a1f0-4703-8d59-6b6d5a66c3a5
Existing blog updated in Firestore with document ID: 43ab1779-c4d4-4c08-bc7f-d80ffba7b336
Existing blog updated in Firestore with document ID: 8de5d22d-5659-4361-acef-f7e1a0375465
Existing blog updated in Firestore with document ID: 0bfaf326-ce4d-40d8-a7df-c52e2772be78
Existing blog updated in Firestore with document ID: 12734746-6a3f-4f87-9835-5ccb4f9c259c
Existing blog updated in Firestore with document ID: 2b446045-6ce9-4e20-9b85-dc0acd1c2dfb
Existing blog updated in Firestore with document ID: 5c476b05-c40f-4dc3-ab9c-b0f37fedd590
Existing blog updated in Firestore with document ID: 85547967-4715-447d-a0bc-e5c3f74e6fa7
Existing blog updated in Firestore with document ID: 0bdf6afe-2f2c-47d5-8f18-70d1e0f0fb4f
Existing blog updated in Firestore with document ID: 22c2917e-28ad-4f6a-bced-439e0145b3d0
Existing blog updated in Firestore with document ID: adac2c55-bbf9-4af1-8b46-b5b399b8671c
Existing b

Existing blog updated in Firestore with document ID: e99eaab0-0164-48e7-a7c5-158267f07522
Existing blog updated in Firestore with document ID: a29bbd46-166e-45c2-8699-88dca782c51a
Existing blog updated in Firestore with document ID: d5b72e5a-678a-4994-900c-74234adbc8e3
Existing blog updated in Firestore with document ID: 73dfddbb-ce71-4f4f-973c-a2ff93753f22
Existing blog updated in Firestore with document ID: 347347d4-a7e2-459f-a7b3-478602eecd4e
Existing blog updated in Firestore with document ID: 2ded27c9-b1e2-4ab8-b75f-1c4f1b867f64
Existing blog updated in Firestore with document ID: 60c1b726-bed6-441a-8540-36e850df7422
Existing blog updated in Firestore with document ID: 063c4ae2-fd41-4935-a09a-5cd1b6e5efe7
Existing blog updated in Firestore with document ID: fe34a873-576d-4ebc-af4a-f56ca6767a9e
Existing blog updated in Firestore with document ID: be1aafec-359b-4fab-a819-6e560ad194f7
Existing blog updated in Firestore with document ID: 71b73fd3-3607-4bfb-bc2d-4921400b1cdc
Existing b

Existing blog updated in Firestore with document ID: 1842c5d2-a7db-4e65-af9b-9bbdc163fd61
Existing blog updated in Firestore with document ID: 5019585d-c1cf-4734-abdd-209d061e5116
Existing blog updated in Firestore with document ID: 7ea6da2f-c6fd-4d10-8429-a90f47b292d9
Existing blog updated in Firestore with document ID: 3a9c9a48-e226-4995-bef3-5bfa0aad5501
Existing blog updated in Firestore with document ID: b0980139-33df-41c1-8b47-116d7d19e3a9
Existing blog updated in Firestore with document ID: e46c20e7-5df3-467e-b61c-8d9b134d263a
Existing blog updated in Firestore with document ID: 9f5176af-e9c9-4670-939a-b037617f4c0a
Existing blog updated in Firestore with document ID: 951791d0-5742-4047-bf0e-014a170ae772
Existing blog updated in Firestore with document ID: 2bb8d419-2278-43a6-bf71-4439f719bb1a
Existing blog updated in Firestore with document ID: 83269896-ba5d-4235-bdc5-f3a7824200b4
Existing blog updated in Firestore with document ID: bd470727-10dc-4ce0-87eb-118d721c653f
Existing b

## Closing the Selenium Driver 

In [21]:
# End the Selenium session started at the top of the notebook.
driver.quit()