In [35]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.service import Service
import time
import requests
import pandas as pd

Get all page data

In [36]:
PATH = './chromedriver_ver110.exe'

ser = Service(PATH)

chrome_options = Options()
#chrome_options.add_argument("--headless")

driver = webdriver.Chrome(service=ser,options=chrome_options)

url = 'https://community.duo.com/latest?order=activity'

driver.get(url)

# Scroll to bottom of page to load all posts
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    # Scroll down to the bottom.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait to load the page.
    time.sleep(2)
    # Calculate new scroll height and compare with last scroll height.
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# Get all posts
content = driver.page_source

driver.quit()

#### Project Goal

The project goal is to add all the posts into a dataframe in a cleaned manner

First, I will get a list of all the links for all the posts. Then, I will visit each of those links and gather all the post information into a new dataframe.

#### Gather Post Links

In [37]:
soup = BeautifulSoup(content, "html.parser")

post_content = soup.find_all(class_='title raw-link raw-topic-link')

links = [item.get('href').split('"')[0] for item in post_content]

In [38]:
prefix = 'https://community.duo.com'

links = [prefix + link if not link.startswith(prefix) else link for link in links]

In [55]:
links

['https://community.duo.com/t/popular-duo-videos-how-to-setup-thread/12557',
 'https://community.duo.com/t/what-training-content-would-you-like-to-see/4691',
 'https://community.duo.com/t/disabling-vs-deleting-a-user/5121',
 'https://community.duo.com/t/scim-support-by-duo/1524',
 'https://community.duo.com/t/d260-duo-release-notes-for-march-1-2023/14256',
 'https://community.duo.com/t/token-mfa-for-aws-workspaces/14149',
 'https://community.duo.com/t/duo-with-fortigate-cisco-ftd-and-switches/14249',
 'https://community.duo.com/t/api-call-to-update-ldap-user-fails/14034',
 'https://community.duo.com/t/fortigate-and-radius-in-azure-not-connecting/14243',
 'https://community.duo.com/t/samsung-galaxy-z-flip3/11695',
 'https://community.duo.com/t/duo-mfa-microsoft-rras-setup-no-option-to-change-authentication-provider-because-nps-is-installed/14061',
 'https://community.duo.com/t/office365-vdi-non-persistent-instant-clones/13998',
 'https://community.duo.com/t/onedrive-as-a-backup-restore-

NOTE: We can expect to have a minimum of 2980 posts. If anything under that amount, there was something wrong with selenium and you should try to run the script again. The last post should be https://community.duo.com/t/copyright-dispute-policy/57/

In [39]:
print('First 3 links',links[:3])
print('Last 3 links:',links[-3:])

print('Total Number of Links:',len(links))

First 3 links ['https://community.duo.com/t/popular-duo-videos-how-to-setup-thread/12557', 'https://community.duo.com/t/what-training-content-would-you-like-to-see/4691', 'https://community.duo.com/t/disabling-vs-deleting-a-user/5121']
Last 3 links: ['https://community.duo.com/t/community-guidelines/59', 'https://community.duo.com/t/terms-and-conditions/58', 'https://community.duo.com/t/copyright-dispute-policy/57']
Total Number of Links: 2980


#### Finding the data within the posts. 

Data points I would like to capture as a pilot:

1. Post title
2. post category(s) (will be seperated by delimiter)
3. post tag(s) (will be seperated by delimiter)
4. Post user
5. Post date
6. Post content (will be tricky with images, I am thinking that I will only capture the text portion of the post for now)
7. Views
8. Replies
9. Users
10. Links

BONUS:

A column that indicates if the title or column contains 

In [40]:
def get_data(url):
    # Get page
    page = requests.get(url)
    soup = BeautifulSoup(page.content,'html.parser')
    
    # Get Title
    title = soup.title.string.split(' - ')[0]

    # Get Categories
    categories = ' | '.join(soup.title.string.split(' - ')[1:])

    # Get Tags
    tags_element = soup.find('div', {'class': 'discourse-tags list-tags'})
    tags = tags_element.text.strip().replace('\n', '').replace(' ', '').replace(',', ' | ') if tags_element is not None else None

    # Get User Link
    hrefs = [link.get('href') for link in soup.find_all('a')]
    user_link = next((link for link in hrefs if link.startswith('https://community.duo.com/u/')), None)

    # Get User Name from User Link
    username = user_link.split('/')[-1]

    # Get Date
    date = str(soup.find('time', {'class': 'post-time'})['datetime'][:10])

    # Get Content
    post_content = '\n'.join(tag.get_text(strip=True) for tag in soup.find('div', {'class': 'post', 'itemprop': 'articleBody'}).find_all(['p','h2','h3','ul','li']))

    return url, title, categories, tags, username, user_link, date, post_content

In [41]:
page = requests.get('https://community.duo.com/t/popular-duo-videos-how-to-setup-thread/12557')
soup = BeautifulSoup(page.content,'html.parser')

get_data('https://community.duo.com/t/popular-duo-videos-how-to-setup-thread/12557')

('https://community.duo.com/t/popular-duo-videos-how-to-setup-thread/12557',
 'Popular Duo Videos: How-To/Setup Thread',
 'Protecting Applications forum | Duo Security Community',
 None,
 'VideoCody',
 'https://community.duo.com/u/VideoCody',
 '2022-06-30',
 'This thread serves as an opportunity to highlight some of our most popular Service Integration & Application videos for getting the most out of Duo.\nAs you probably know, Duo technical setup videos can also be found on corresponding documentation pages atduo.com/docs, as well as incorporated into our educational content on theDuo Level Uplearning platform.\nTo stay up to date withallDuo videos, including feature and marketing content, please subscribe onYouTubeand turn on all notifications for the channel.\nCheers,Cody')

It appears that the beautiful soup library is unable to get the section of the html with post views, replies, users, or links. If that is information that is important in the future, we can explore other avenues to get those data points.

#### Creating Data Frame with Transformed Data

In [54]:
columns = ['URL','POST_TITLE','POST_CATEGORIES','POST_TAGS','POST_USER','POST_USER_LINK','POST_DATE','POST_CONTENT']
df = pd.DataFrame(columns=columns)

for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.content,'html.parser')
    
    # Get Title
    title = soup.title.string.split(' - ')[0]

    # Get Categories
    categories = ' | '.join(soup.title.string.split(' - ')[1:-1])

    # Get Tags
    tags_element = soup.find('div', {'class': 'discourse-tags list-tags'})
    tags = tags_element.text.strip().replace('\n', '').replace(' ', '').replace(',', ' | ') if tags_element is not None else None

    # Get User Link
    hrefs = [link.get('href') for link in soup.find_all('a')]
    user_link = next((link for link in hrefs if link.startswith('https://community.duo.com/u/')), None)

    # Get User Name from User Link
    username = user_link.split('/')[-1]

    # Get Date
    date = str(soup.find('time', {'class': 'post-time'})['datetime'][:10])

    # Get Content
    post_content = '\n'.join(tag.get_text(strip=True) for tag in soup.find('div', {'class': 'post', 'itemprop': 'articleBody'}).find_all(['p','h2','h3','ul','li']))
    print(url, title, categories, tags, username, user_link, date, post_content)
    data = pd.Series([link, title, categories, tags, username, user_link, date, post_content], index=columns)
    df = df.append(data,ignore_index=True)
    time.sleep(1)

https://community.duo.com/latest?order=activity Popular Duo Videos: How-To/Setup Thread Protecting Applications forum None VideoCody https://community.duo.com/u/VideoCody 2022-06-30 This thread serves as an opportunity to highlight some of our most popular Service Integration & Application videos for getting the most out of Duo.
As you probably know, Duo technical setup videos can also be found on corresponding documentation pages atduo.com/docs, as well as incorporated into our educational content on theDuo Level Uplearning platform.
To stay up to date withallDuo videos, including feature and marketing content, please subscribe onYouTubeand turn on all notifications for the channel.
Cheers,Cody
https://community.duo.com/latest?order=activity What training content would you like to see? General Discussion forum None Amy https://community.duo.com/u/Amy 2019-06-05 We want to hear from you! For our Duo admins and customers, how can we better serve you with training content? What topics an

In [59]:
df['ADMIN_FFLAG'] = df['POST_CONTENT'].str.contains('admin|Admin').astype(int)

In [61]:
df.to_csv('duo_community_posts.csv')