In [1]:
import bisect
from bs4 import BeautifulSoup
from datetime import datetime
import math
import pandas as pd
import re
from selenium import webdriver
import time

In [2]:
SCROLL_PAUSE_TIME = 0.5

In [3]:
def tokenize_responses(disease, posts_dataframe):
    sentence_dataframe = pd.read_csv("gold_standard/sentences/" + disease + ".tsv", delimiter = "\t")
    
    for sentence_number, sentence in sentence_dataframe.iterrows():
        post = posts_dataframe[posts_dataframe["post_id"] == sentence["post_id"]]["post"].item()
        
        if isinstance(post, float) and math.isnan(post):
            break
        else:
#             print("Start index: {}, end index: {}".format(sentence["start_index"], sentence["end_index"]))
#             print("Mined sentence (number: {}): {}\n".format(sentence_number, post[sentence["start_index"] - 1 :
#                                                                                    sentence["end_index"] - 1].strip()))
            sentence_dataframe.loc[sentence_number, "sentence"] = post[sentence["start_index"] - 1 :
                                                                       sentence["end_index"] - 1].strip()
    
    print(sentence_dataframe.head(50))

In [4]:
def add_posts(conversation_posts, usernames, posts_dataframe, conversation_id):
    start_row = posts_dataframe[posts_dataframe["conversation_id"] == conversation_id].index[0]
    end_row = posts_dataframe[posts_dataframe["conversation_id"] == conversation_id].index[-1]
    
    for post_number in range(start_row, end_row + 1):
        print("Expected author of post {} is {}".format(post_number, posts_dataframe.loc[post_number, "author"]))
        print("Extracted author of post {} is {}".format(post_number, usernames[post_number - start_row]))
        print("Match? {}\n".format(posts_dataframe.loc[post_number, "author"] ==
                                   usernames[post_number - start_row]))
        
        if (usernames[post_number - start_row] == posts_dataframe.loc[post_number, "author"]):
            posts_dataframe.loc[post_number, "post"] = conversation_posts[post_number - start_row]

In [5]:
def scrape_url(conversation_url):
    print("Conversation URL: {}\n".format(conversation_url))
    
    # setup selenium web driver to crawl page with infinite scrolling
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument("headless")

    driver = webdriver.Chrome(options = chrome_options)
    driver.get(conversation_url + "?sort_by=oldest")
    
    last_height = driver.execute_script("return document.body.scrollHeight;")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight;")
        if new_height == last_height: break
        last_height = new_height

    soup = BeautifulSoup(driver.page_source)
    
    # responses encapsulated in outer div with class post_show_{ID}
    post_list = soup.find("div", {"id" : "post_show_" + conversation_url.split("/")[-1]})
    
    # original poster and contents encapsulated in divs with class "subj_info" and "subject_msg" 
    original_post = soup.find("div", {"id" : "subject_msg"}).text
    original_user_data = (soup.find("div", {"class" : "subj_info"})
                              .find("div", {"class" : "username"}))
    original_username = original_user_data.a.text
    
    try:
        original_date = datetime.strptime(original_user_data.time.text, "%b %d, %Y")
    except ValueError:
        print("Encountered post without specified year; assuming post was authored in current year\n")
        original_date = datetime.strptime(original_user_data.time.text + 
                                          ", {}".format(datetime.now().year), "%b %d, %Y")
    
    # for posts with a "best" answer, find that card and extract its contents -- 
    # encapsulated in outer div with class containing "best_answer_card"
    best_answer_card = soup.find("div", {"class" : re.compile(r"best_answer_card")})
    if best_answer_card:
        best_post = best_answer_card.find("div", {"class" : "comment_body"}).text
        best_answer_user_data = (best_answer_card.find("div", {"class" : "resp_info"})
                                                 .find("div", {"class" : "username"}))
        best_answer_username = best_answer_user_data.a.text
        
        try:
            best_answer_date = datetime.strptime(best_answer_user_data.time.text, "%b %d, %Y")
        except ValueError:
            print("Encountered post without specified year; assuming post was authored in current year\n")
            best_answer_date = datetime.strptime(best_answer_user_data.time.text + 
                                                 ", {}".format(datetime.now().year), "%b %d, %Y")
    
    # extract replies
    posts = [original_post] if not best_answer_card else [original_post, best_post]
    responses = post_list.find_all("div", {"class" : "resp_body"})
    for response in responses:
        posts.append(response.text)
    
    # extract user metadata (username & date)
    usernames = [original_username] if not best_answer_card else [original_username, best_answer_username]
    timestamps = [original_date] if not best_answer_card else [original_date, best_answer_date]
    responders = post_list.find_all("div", {"class" : "resp_info"})
    for responder in responders:
        responder = responder.find("div", {"class" : "username"})
        usernames.append(responder.a.text)
        
        try:
            timestamps.append(datetime.strptime(responder.time.text, "%b %d, %Y"))
        except ValueError:
            print("Encountered post without specified year; assuming post was authored in current year\n")
            timestamps.append(datetime.strptime(responder.time.text + 
                                                ", {}".format(datetime.now().year), "%b %d, %Y"))
    
    # duplicate best answer response and insert in chronological order
    if best_answer_card:
        insertion_point = bisect.bisect(timestamps, best_answer_date, lo = 2)
        usernames.insert(insertion_point, best_answer_username)
        posts.insert(insertion_point, best_post)
        
    assert(len(usernames) == len(posts))

    # clean web scraped text
    for index in range(len(posts)):
        posts[index] = posts[index].replace(u"\xa0", " ").strip()
        
    return posts, usernames

In [6]:
def scrape_conversation_posts(disease):
    conversations = pd.read_csv("gold_standard/conversations/" + disease + ".tsv", delimiter = "\t")
    posts_dataframe = pd.read_csv("gold_standard/posts/" + disease + ".tsv", delimiter = "\t")
    expected_posts_per_conversation = posts_dataframe["conversation_id"].value_counts()
    
    for index, row in conversations.iterrows():
        conversation_posts, usernames = scrape_url(row["url"])
        
#         print("Collected posts: {}\n".format(conversation_posts))
        print("Number of extracted conversation posts / usernames: {}".format(len(conversation_posts)))
        print("Expected number of posts to be analyzed: {}\n".format(
            expected_posts_per_conversation[row["conversation_id"]]))
        assert(len(conversation_posts) >= expected_posts_per_conversation[row["conversation_id"]])
        
        add_posts(conversation_posts, usernames, posts_dataframe, row["conversation_id"])
        
    return posts_dataframe

In [7]:
def driver():
    diseases = pd.read_csv("gold_standard/diseases/diseases.tsv", delimiter = "\t")
    
    for disease in diseases["disease_id"]:
        print("Disease ID: {}\n".format(disease))
        posts_dataframe = scrape_conversation_posts(disease)
        # tokenize_responses(disease, posts_dataframe)

In [8]:
driver()

Disease ID: HIF2016_DIS00

Conversation URL: http://www.medhelp.org/posts/Allergies---Food/Does-anyone-have-an-allergy-to-coffee/show/1550820

Number of extracted conversation posts / usernames: 31
Expected number of posts to be analyzed: 31

Expected author of post 0 is seattlemom2plus
Extracted author of post 0 is seattlemom2plus
Match? True

Expected author of post 1 is ChitChatNine
Extracted author of post 1 is ChitChatNine
Match? True

Expected author of post 2 is FurballsMom
Extracted author of post 2 is FurballsMom
Match? True

Expected author of post 3 is seattlemom2plus
Extracted author of post 3 is seattlemom2plus
Match? True

Expected author of post 4 is FurballsMom
Extracted author of post 4 is FurballsMom
Match? True

Expected author of post 5 is seattlemom2plus
Extracted author of post 5 is seattlemom2plus
Match? True

Expected author of post 6 is FurballsMom
Extracted author of post 6 is FurballsMom
Match? True

Expected author of post 7 is seattlemom2plus
Extracted auth

Number of extracted conversation posts / usernames: 13
Expected number of posts to be analyzed: 13

Expected author of post 83 is Jayhawk29
Extracted author of post 83 is Jayhawk29
Match? True

Expected author of post 84 is Ashwin Bhandari, MBBS
Extracted author of post 84 is Ashwin  Bhandari, MBBS
Match? False

Expected author of post 85 is Jayhawk29
Extracted author of post 85 is Jayhawk29
Match? True

Expected author of post 86 is Paderla Anitha, DNB
Extracted author of post 86 is Paderla  Anitha, DNB 
Match? False

Expected author of post 87 is Jayhawk29
Extracted author of post 87 is Jayhawk29
Match? True

Expected author of post 88 is seattlemom2plus
Extracted author of post 88 is seattlemom2plus
Match? True

Expected author of post 89 is Jayhawk29
Extracted author of post 89 is Jayhawk29
Match? True

Expected author of post 90 is ChitChatNine
Extracted author of post 90 is ChitChatNine
Match? True

Expected author of post 91 is seattlemom2plus
Extracted author of post 91 is seat

Number of extracted conversation posts / usernames: 12
Expected number of posts to be analyzed: 10

Expected author of post 31 is Maza69
Extracted author of post 31 is Maza69
Match? True

Expected author of post 32 is emg454
Extracted author of post 32 is S.  Kaul, MD
Match? False

Expected author of post 33 is emg454
Extracted author of post 33 is emg454
Match? True

Expected author of post 34 is KimK1
Extracted author of post 34 is emg454
Match? False

Expected author of post 35 is meow07
Extracted author of post 35 is KimK1
Match? False

Expected author of post 36 is LOSTG1RL78
Extracted author of post 36 is meow07
Match? False

Expected author of post 37 is Liveroom
Extracted author of post 37 is S.  Kaul, MD
Match? False

Expected author of post 38 is Liveroom
Extracted author of post 38 is LOSTG1RL78
Match? False

Expected author of post 39 is Megadodger
Extracted author of post 39 is Liveroom
Match? False

Expected author of post 40 is IBS_Coach
Extracted author of post 40 is Li

Number of extracted conversation posts / usernames: 16
Expected number of posts to be analyzed: 16

Expected author of post 127 is azure77
Extracted author of post 127 is azure77
Match? True

Expected author of post 128 is TrudieC
Extracted author of post 128 is TrudieC
Match? True

Expected author of post 129 is azure77
Extracted author of post 129 is azure77
Match? True

Expected author of post 130 is TrudieC
Extracted author of post 130 is TrudieC
Match? True

Expected author of post 131 is cka58
Extracted author of post 131 is cka58
Match? True

Expected author of post 132 is niki26
Extracted author of post 132 is niki26
Match? True

Expected author of post 133 is kr00110
Extracted author of post 133 is kr00110
Match? True

Expected author of post 134 is TrudieC
Extracted author of post 134 is TrudieC
Match? True

Expected author of post 135 is lightman16
Extracted author of post 135 is lightman16
Match? True

Expected author of post 136 is jb1000
Extracted author of post 136 is jb

AssertionError: 

TESTING BLOCKS

In [None]:
# TESTING SENTENCE EXTRACTION.

post = "That does sound like you have a food sensitivity to coffee.  I don't think it's the caffeine, but one way you can figure out if it's caffeine is to try black or green tea (both are from the same plant).  Just be mindful that the green tea can still carry a problem with lead from the processing.  Dr. Mercola has articles that explain this more thoroughly.  There are some that he explains don't have this risk.  I know it seems like there shouldn't be a risk, since they are steaming, then drying to keep the leaves green.  So, I'm not sure where the lead comes from.  But, black tea doesn't carry the same risk.  I myself simply can't drink tea, though, because the tannin bothers me."
post_id = "HIF2016_DIS00_CONV00_POST002"
disease = "HIF2016_DIS00"

sentence_dataframe = pd.read_csv("gold_standard/sentences/" + disease + ".tsv", delimiter = "\t")

print(post)
print("\n\n---------\n\n")

for sentence_number, sentence in sentence_dataframe.iterrows():
    if sentence["post_id"] == post_id:
        print("Start index: {}, end index: {}".format(sentence["start_index"], sentence["end_index"]))
        print("Mined sentence (number: {}): {}\n".format(sentence_number, post[sentence["start_index"] - 1 :
                                                                               sentence["end_index"] - 1].strip()))

In [10]:
# TESTING URL MINING.

conversation_url = "http://www.medhelp.org/posts/Crohns-Disease---Ulcerative-Colitis/Crohns-IBS-and-herpes-simplex-2---are-they-all-related/show/1486989"
posts, usernames = scrape_url(conversation_url)

print(len(posts))
print(len(usernames))
print("\n\n-------------\n\n")
print(posts)
print("\n\n-------------\n\n")
print(usernames)

Conversation URL: http://www.medhelp.org/posts/Crohns-Disease---Ulcerative-Colitis/Crohns-IBS-and-herpes-simplex-2---are-they-all-related/show/1486989

10
10


-------------


['I have recently been informed that I have suspected crohns disease and IBS. I have had 2 colonscopies and various blood and stool tests. I also have a diagnosis of Herpes simplex 2 and recently the episodes of IBS have increased as a result of my period, as have the herpes epsiodes- is there a direct link with herpes and IBS/Crohns and if so if I were to treat the herpes would this improve my bowel symptoms. Thanks', 'I have the same problem, but from the information I have gathered is that herpes flare ups happen when our bodies are under stress. I have to say having to go to the potty 5 to 7 times a day canker sores stomach cramps alternating constipation and loose/soft stools are very stressful .if I find out anything different I will be sure to share please do the same I would love to know of any connection