In [1]:
# Dependencies
import pymongo
from splinter import Browser
from bs4 import BeautifulSoup as bs
import requests
import datetime
import calendar
import time
import re

In [2]:
# Declare a dictionary to hold the scraped data.
webex = {
    "months": [],
    "hosts": [],
    "participants": [],
    "countries": "",
    "meetings": [],
    "minutes": [],
    "timestamp": datetime.datetime.strftime(datetime.datetime.utcnow(),'%m/%d/%y %H:%M:%S')
}

In [3]:
# Create a browser object for use by all the scrapes.
# Need to do "brew install geckodriver" to use Firefox on Mac.

def init_browser():
    return Browser("firefox")

In [22]:
# This function gets the number of hours in a given month.
def get_hours(month):
    """
    Calculates the hours so far in a month.
    Usage: get_hours("January 2020") or get_hours("Current")
    """
    months = ["January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December"]
    month_list = re.split(r'\s', month)
    month = month_list[0]
    if month == "Current":
        year = datetime.datetime.today().strftime("%Y")
        days = int(datetime.datetime.today().strftime("%d"))
    else:
        year = month_list[1]
        days = calendar.monthrange(int(year),int(months.index(month))+1)[1]
    
    complete_days = int(days) - 1
    current_hours = int(datetime.datetime.strftime(datetime.datetime.utcnow(),'%H'))
    
    hours = (complete_days * 24) + current_hours

    return hours


In [8]:
# This function does the work of scraping the Webex web site.
def scrape_webex():
    browser = init_browser()

    # Visit https://map.webex.com/
    url = "https://map.webex.com/"
    browser.visit(url)

    # Give time for dynamic content to load
    time.sleep(5)

    # Scrape page into Soup
    html = browser.html
    soup = bs(html, "html.parser")

    # Declare variables to store raw data.
    month_ids = ['last2Month', 'lastMonth', 'nowMonth']
    months = []
    hosts_commas = []
    hosts = []
    participants_commas = []
    participants = []
    countries = []
    meetings_commas = []
    meetings = []
    minutes = []
    
    # Iterate through the months; the data is in an iframe.
    for month_id in month_ids:
        with browser.get_iframe('meetingMap') as iframe:
            iframe.click_link_by_id(month_id)
            time.sleep(5)
            iframe_html = iframe.html
            iframe_soup = bs(iframe_html, "html.parser")
            months.append(iframe_soup.find('span', id=month_id).get_text())
            hosts_commas.append(re.search('title=\"(.*)\"', str(iframe_soup.find('span', id='hostData'))).group(1))
            participants_commas.append(re.search('title=\"(.*)\"', str(iframe_soup.find('span', id='participantData'))).group(1))
            countries.append(re.search('title=\"(.*)\"', str(iframe_soup.find('span', id='countryData'))).group(1))
            meetings_commas.append(re.search('title=\"(.*)\"', str(iframe_soup.find('span', id='meetingData'))).group(1))
            
            # The minutes are in reverse order (high endian), so we have to turn them around.
            mins = []
            num = 11
            while num > 0:
                a = str(iframe_soup.find('div', id='num' + str(num)))
                b = re.search(r'<span>(\d*)', a).group(1)             
                mins.append(b)
                num = num - 1
            mins_string = ''.join(map(str, mins))
            minutes.append(mins_string)
           
    # Some numbers have commas, so we have to remove those.   
    hosts =[s.replace(',', '') for s in hosts_commas]
    participants = [s.replace(',', '') for s in participants_commas]
    meetings = [s.replace(',', '') for s in meetings_commas]
    
    # The data is now ready.
    webex = {
        "months": months,
        "hosts": hosts,
        "participants": participants,
        "countries": countries,
        "meetings": meetings,
        "minutes": minutes,
        "timestamp": datetime.datetime.strftime(datetime.datetime.utcnow(),'%m/%d/%y %H:%M:%S')
    }
    

    # Close the browser after scraping
    browser.quit()

    # Return results
    return webex

In [6]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
# Define database and collection
db = client.webex
webex_scrape = db.webex


In [7]:
# Scrape the data and store in webex_days
webex_days = scrape_webex()

hosts: $['24420815', '26000472', '26000472']


In [9]:
# DEBUG: print contents.
print(f'months: {webex_days["months"]}')
print(f'hosts: {webex_days["hosts"]}')
print(f'participants: {webex_days["participants"]}')
print(f'countries: {webex_days["countries"]}')
print(f'meetings: {webex_days["meetings"]}')
print(f'minutes: {webex_days["minutes"]}')
print(f'timestamp: {webex_days["timestamp"]}')

months: ['February 2020', 'March 2020', 'Current']
hosts: ['24420815', '26000472', '26000472']
participants: ['161764828', '350692197', '22668772']
countries: ['224', '220', '213']
meetings: ['37994030', '73295312', '4396197']
minutes: ['6787001490', '14300227112', '889875043']
timestamp: 04/01/20 21:08:00


In [23]:
# Convert the data to hourly rate and store in webex
hours = []
for i in webex_days["months"]:
    hours_month = get_hours(i)
    hours.append(hours_month)
    
webex = {
    "months": [],
    "hosts": [],
    "participants": [],
    "countries": "",
    "meetings": [],
    "minutes": [],
    "timestamp": ""
}

for i in range(3):
    webex["participants"].append(round(int(webex_days["participants"][i]) / hours[i]))
    webex["meetings"].append(round(int(webex_days["meetings"][i]) / hours[i]))
    webex["minutes"].append(round(int(webex_days["minutes"][i]) / hours[i]))

webex["hosts"] = webex_days["hosts"]
webex["months"] = webex_days["months"]
webex["countries"] = webex_days["countries"]
webex["timestamp"] = webex_days["timestamp"]

hours: 693
hours: 741
hours: 21


In [24]:
# Debug: print data.
print(f'months: {webex["months"]}')
print(f'hosts: {webex["hosts"]}')
print(f'participants: {webex["participants"]}')
print(f'countries: {webex["countries"]}')
print(f'meetings: {webex["meetings"]}')
print(f'minutes: {webex["minutes"]}')
print(f'timestamp: {webex["timestamp"]}')    

months: ['February 2020', 'March 2020', 'Current']
hosts: ['24420815', '26000472', '26000472']
participants: [233427, 473269, 1079465]
countries: ['224', '220', '213']
meetings: [54825, 98914, 209343]
minutes: [9793653, 19298552, 42375002]
timestamp: 04/01/20 21:08:00


In [25]:
# Insert the new data into the database.
try:
    webex_scrape.insert_one(webex)
except Exception as e:
        print(e)


In [26]:
# Debug: display items in MongoDB collection
data = db.webex.find()

for item in data:
    print(item)


{'_id': ObjectId('5e7e36644c242bba3cef964f'), 'months': ['January 2020', 'February 2020', 'Current'], 'hosts': [32936, 35087, 40124], 'participants': [204837, 232421, 453729], 'countries': ['223', '224', '221'], 'meetings': [49587, 54589, 96919], 'minutes': [8481303, 9751439, 18422932], 'timestamp': '03/27/20 17:23:25'}
{'_id': ObjectId('5e7e37ff7e476ac58d8339b6'), 'months': ['January 2020', 'February 2020', 'Current'], 'hosts': [32936, 35087, 40124], 'participants': [204837, 232421, 453776], 'countries': ['223', '224', '221'], 'meetings': [49587, 54589, 96937], 'minutes': [8481303, 9751439, 18426725], 'timestamp': '03/27/20 17:26:25'}
{'_id': ObjectId('5e7e390e74a6e0c38f1d664c'), 'months': ['January 2020', 'February 2020', 'Current'], 'hosts': [32936, 35087, 40124], 'participants': [204837, 232421, 454203], 'countries': ['223', '224', '221'], 'meetings': [49587, 54589, 97031], 'minutes': [8481303, 9751439, 18438389], 'timestamp': '03/27/20 17:34:06'}
{'_id': ObjectId('5e7e3e455875a69b