In [None]:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
import time

In [None]:
# Seconds to sleep after each scroll-to-bottom before the page height is measured again.
SCROLL_PAUSE_TIME = 0.5

In [None]:
def add_posts(conversation_posts, usernames, posts_dataframe, conversation_id):
    """Copy scraped post texts into the rows that belong to one conversation.

    The rows of ``posts_dataframe`` whose ``conversation_id`` column equals
    ``conversation_id`` are paired, in row order, with the scraped
    ``usernames`` and ``conversation_posts``. A row's ``post`` cell is written
    only when the scraped username equals the row's ``author`` value.

    Rows are addressed through their own index labels, so the conversation's
    rows do not have to be contiguous. If fewer posts were scraped than there
    are rows, the surplus rows are left untouched. If no row matches
    ``conversation_id``, a notice is printed and nothing is changed.

    Args:
        conversation_posts: Scraped post texts, oldest first.
        usernames: Scraped author names, aligned with ``conversation_posts``.
        posts_dataframe: Frame with ``conversation_id`` and ``author`` columns;
            it is modified in place.
        conversation_id: Value identifying the conversation's rows.

    Returns:
        None.
    """
    is_conversation_row = posts_dataframe["conversation_id"] == conversation_id
    conversation_rows = posts_dataframe[is_conversation_row].index

    if len(conversation_rows) == 0:
        print("No rows found for conversation {}".format(conversation_id))
        return

    # zip stops at the shortest input, so a short scrape can never over-index.
    for post_number, username, post in zip(conversation_rows, usernames, conversation_posts):
        expected_author = posts_dataframe.loc[post_number, "author"]
        print("Expected author of post {} is {}".format(post_number, expected_author))
        print("Extracted author of post {} is {}".format(post_number, username))

        if username == expected_author:
            posts_dataframe.loc[post_number, "post"] = post

    # Show only the rows of this conversation instead of dumping the whole frame.
    print(posts_dataframe.loc[conversation_rows])

In [None]:
def _load_full_page_source(page_url):
    """Open ``page_url`` in headless Chrome, scroll to the bottom, return the HTML.

    The page is scrolled repeatedly until ``document.body.scrollHeight`` no
    longer changes between two consecutive measurements. The browser is always
    shut down, even when loading or scrolling raises.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument("headless")

    browser = webdriver.Chrome(options = chrome_options)
    try:
        browser.get(page_url)

        last_height = browser.execute_script("return document.body.scrollHeight;")
        while True:
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Pause after every scroll before measuring the page height again.
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = browser.execute_script("return document.body.scrollHeight;")
            if new_height == last_height:
                break
            last_height = new_height

        return browser.page_source
    finally:
        # Without this, every call leaves a Chrome process running.
        browser.quit()


def _clean_text(text):
    """Replace non-breaking spaces with plain spaces and trim surrounding whitespace."""
    return text.replace(u"\xa0", " ").strip()


def scrape_url(conversation_url):
    """Scrape the posts and author names of one conversation page.

    The page is requested with ``?sort_by=oldest`` appended to
    ``conversation_url``. The original post is read from the ``subject_msg``
    element and its author from the ``username`` element inside ``subj_info``.
    Replies are read from the ``resp_body`` / ``resp_info`` elements inside
    the ``post_show_<id>`` container, where ``<id>`` is the last path segment
    of ``conversation_url``. A missing container is treated as "no replies".

    Args:
        conversation_url: URL of the conversation page.

    Returns:
        A ``(posts, usernames)`` tuple of equally long lists of cleaned
        strings, with the original post first.

    Raises:
        AttributeError: If the original post or its author element is missing.
        AssertionError: If the number of reply bodies and reply authors differ.
    """
    print("Conversation URL: {}".format(conversation_url))

    page_source = _load_full_page_source(conversation_url + "?sort_by=oldest")
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page_source, "html.parser")

    # Tolerate a trailing slash when extracting the conversation id.
    conversation_key = conversation_url.rstrip("/").split("/")[-1]
    post_list = soup.find("div", {"id" : "post_show_" + conversation_key})
    original_post = soup.find("div", {"id" : "subject_msg"}).text
    original_username = (soup.find("div", {"class" : "subj_info"})
                             .find("div", {"class" : "username"}).a.text)

    posts = [original_post]
    usernames = [original_username]

    if post_list is not None:
        posts.extend(response.text
                     for response in post_list.find_all("div", {"class" : "resp_body"}))
        usernames.extend(responder.find("div", {"class" : "username"}).a.text
                         for responder in post_list.find_all("div", {"class" : "resp_info"}))

    assert len(usernames) == len(posts), (
        "Extracted {} usernames but {} posts".format(len(usernames), len(posts)))

    posts = [_clean_text(post) for post in posts]
    usernames = [_clean_text(username) for username in usernames]

    return posts, usernames

In [None]:
def scrape_conversation_posts(disease, max_conversations = 1):
    """Scrape conversations of one disease and fill in the gold-standard posts.

    Reads ``gold_standard/conversations/<disease>.tsv`` and
    ``gold_standard/posts/<disease>.tsv``, scrapes each selected conversation
    URL with ``scrape_url`` and hands the result to ``add_posts``.

    Args:
        disease: Disease identifier used to build the two file names. It may
            be a string or a number, as produced by ``pd.read_csv``.
        max_conversations: Non-negative number of conversations to process,
            taken from the top of the conversations file. The default of 1
            keeps the historical behaviour of processing only the first
            conversation; pass ``None`` to process all of them.

    Returns:
        The posts DataFrame after ``add_posts`` has been applied to it.

    Raises:
        AssertionError: If fewer posts were scraped for a conversation than
            the posts file lists for it.
    """
    # str.format accepts numeric ids, whereas "+" concatenation raises TypeError.
    conversations = pd.read_csv("gold_standard/conversations/{}.tsv".format(disease),
                                delimiter = "\t")
    posts_dataframe = pd.read_csv("gold_standard/posts/{}.tsv".format(disease),
                                  delimiter = "\t")
    expected_posts_per_conversation = posts_dataframe["conversation_id"].value_counts()

    if max_conversations is not None:
        conversations = conversations.head(max_conversations)

    for _, row in conversations.iterrows():
        conversation_id = row["conversation_id"]
        conversation_posts, usernames = scrape_url(row["url"])
        expected_count = expected_posts_per_conversation[conversation_id]

        print("Number of extracted conversation posts / usernames: {}".format(len(conversation_posts)))
        print("Expected number of posts to be analyzed: {}".format(expected_count))
        assert len(conversation_posts) >= expected_count, (
            "Conversation {}: extracted {} posts, expected at least {}".format(
                conversation_id, len(conversation_posts), expected_count))

        add_posts(conversation_posts, usernames, posts_dataframe, conversation_id)

    return posts_dataframe

In [None]:
def driver(max_diseases = 1):
    """Run the scraper for the diseases listed in the gold-standard disease file.

    Reads ``gold_standard/diseases/diseases.tsv`` and calls
    ``scrape_conversation_posts`` once per selected ``disease_id`` value.

    Args:
        max_diseases: Non-negative number of diseases to process, taken from
            the top of the file. The default of 1 keeps the historical
            behaviour of processing only the first disease; pass ``None`` to
            process all of them.

    Returns:
        None.
    """
    diseases = pd.read_csv("gold_standard/diseases/diseases.tsv", delimiter = "\t")

    disease_ids = diseases["disease_id"]
    if max_diseases is not None:
        disease_ids = disease_ids.head(max_diseases)

    for disease in disease_ids:
        print("Disease ID: {}".format(disease))
        scrape_conversation_posts(disease)

In [None]:
# Run the scraper; driver() reads the disease list and, as called here, processes only the first disease.
driver()