# Data Gathering

This workbook describes data collection from the Vetsbenefits.net web forum. This web forum consists of two major areas of interest: 

* **A top level discussion board**, made up of the 500 most recently active threads (active meaning there has been a post made to the thread); and,
* **The threads themselves**, comprising of an original post, plus one or more replies

Beautiful soup was the primary means for performing this data collection. A separate workbook catalogs a similar effort capture data not in a tabular form, but in a more flexible data dictionary format for future use. 

### Standard Packages

In [4]:
import pickle

In [5]:
from bs4 import BeautifulSoup
import requests

In [6]:
from IPython.core.display import display, HTML

In [7]:
import time, os

In [8]:
import pandas as pd

In [9]:
import numpy as np

In [10]:
import regex as re

In [11]:
from time import sleep
from random import randint

In [12]:
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
import datetime
import seaborn as sns

In [14]:
from datetime import datetime as dt

In [15]:
# Make better use of Jupyter Notebook cell width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

pd.set_option('display.max_rows', 660)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 400)
pd.options.display.float_format = '{:,.10f}'.format

In [3]:
#pickle.dump(df_discussion_links, open("df_discussion_links_2020-03-09_12PM.p", "wb"))

df_discussion_links_pickle = pickle.load(open("df_discussion_links_2020-03-09_12PM.p", "rb"))

In [7]:
#pickle.dump(df_zipped_post2, open("df_zipped_post2_2020-03-09_01AM.p", "wb"))

df_zipped_post2_pickle = pickle.load(open("df_zipped_post2_2020-03-09_01AM.p", "rb"))

In [8]:
df_zipped_post2_pickle.shape

(13955, 5)

### Capture Data from Top Level Discussion Board
...using Beautiful Soup to move through pagination

In [16]:
def get_links(base_url,end):
    
    #Get base source data
    source_code = requests.get(base_url)   
    soup = BeautifulSoup(source_code.text, 'html5lib')
    
    #Get topic titles
    topic_titles=[]
    for element in soup.find_all("a", class_="topictitle shRt_t"):
        topic_titles.append(element.text)
        
    #Get last post time
    last_post_time=[]
    for index, element in enumerate(soup.find_all('span',class_="timespan")):
        if index % 2 == 0:
#    for element in soup.find_all('span',class_="timespan"):
            last_post_time.append(element['title'])   
    
#     if last_post_time[24] > recent_post_time[0]:
#         last_post_time_copy = last_post_time.copy()
#         recent_post_time.insert(0,last_post_time_copy[0])
#         ret
    
    
    
    #Get links for topics
    links_with_text = []
    for a in soup.find_all('a', class_="topictitle shRt_t", href=True): 
        if a.text: 
            links_with_text.append(a['href'])
   
    print('done page 1')
    
    #Loop through subsequent pages
    for i in list(range(25,end,25)):
        
        #Get source code
        source_code = requests.get(base_url + '&start={}'.format(i))
        soup = BeautifulSoup(source_code.text, 'html5lib')
    
        #Get topic titles
        for element in soup.find_all("a", class_="topictitle shRt_t"):
            topic_titles.append(element.text)

        #Get last post time
        for index, element in enumerate(soup.find_all('span',class_="timespan")):
            if index % 2 == 0:
        #    for element in soup.find_all('span',class_="timespan"):
                last_post_time.append(element['title'])    
        
        #Get links for topics
        for a in soup.find_all('a', class_="topictitle shRt_t", href=True): 
            if a.text: 
                links_with_text.append(a['href'])
                
        #Pause
        print('done page {}'.format(i))
        sleep(randint(5,10))
   
    #Zip cols and create DF        
    post_zip = zip(topic_titles, links_with_text, last_post_time) 
    zipped_list = list(post_zip) #Convert zip to list.
    df_zipped_links = pd.DataFrame(zipped_list, columns=['post','url','last_post_time'])
    
    df_zipped_links['url'] = [x[:-42] for x in df_zipped_links['url']]
    
    df_zipped_links[['last_post_hour','last_post_date']] = df_zipped_links.last_post_time.str.split("-",expand=True) 
    df_zipped_links['last_post_date'] = pd.to_datetime(df_zipped_links['last_post_date'])
    df_zipped_links['last_post_hour'] = pd.to_datetime(df_zipped_links['last_post_hour']).dt.time
    df_zipped_links['combined_time'] = df_zipped_links.apply(lambda r : pd.datetime.combine(r['last_post_date'],r['last_post_hour']),1)
    
    # Filter out anything that isn't newer than most recent time saved
    #df_zipped_links = df_zipped_links[df_zipped_links['combined_time'] >= recent_post_time[0]]
    #recent_post_time.insert(0,df_zipped_links.combined_time[0])
    
    return df_zipped_links

In [17]:
df_discussion_links_no_dict_2 = get_links('https://vetsbenefits.net/discussion/all?sr=topics&search_id=newposts&discussions=1&unlimit=1',501)

done page 1
done page 25
done page 50
done page 75
done page 100
done page 125
done page 150
done page 175
done page 200
done page 225
done page 250
done page 275
done page 300
done page 325
done page 350
done page 375
done page 400
done page 425
done page 450
done page 475
done page 500


  df_zipped_links['combined_time'] = df_zipped_links.apply(lambda r : pd.datetime.combine(r['last_post_date'],r['last_post_hour']),1)


In [34]:
df_discussion_links_no_dict['url'] = [x[:-42] for x in df_discussion_links_no_dict['url']]

In [18]:
df_discussion_links_no_dict_2.head()

Unnamed: 0,post,url,last_post_time,last_post_hour,last_post_date,combined_time
0,ChampVA Claims,https://vetsbenefits.net/champva-claims-t193856,"10:05 AM - Mar 22, 2021",10:05:00,2021-03-22,2021-03-22 10:05:00
1,Monday Moring Roll Call,https://vetsbenefits.net/monday-moring-roll-call-t193855,"10:01 AM - Mar 22, 2021",10:01:00,2021-03-22,2021-03-22 10:01:00
2,PTSD Increase,https://vetsbenefits.net/ptsd-increase-t192490,"9:41 AM - Mar 22, 2021",09:41:00,2021-03-22,2021-03-22 09:41:00
3,Old or new rating?,https://vetsbenefits.net/old-or-new-rating-t193810,"9:12 AM - Mar 22, 2021",09:12:00,2021-03-22,2021-03-22 09:12:00
4,VA Travel,https://vetsbenefits.net/va-travel-t193742,"9:10 AM - Mar 22, 2021",09:10:00,2021-03-22,2021-03-22 09:10:00


### Capture Data from Individual Threads
...including moving through pagination on specific threads as needed

In [22]:
# Global data containers
post_id_container = []
original_post_id_container = []
topic_titles_posts = []
post_text=[]
post_time = []
post_author=[]

In [19]:
def capture_post(url):    
    ############
    # New Post #
    ############    

    #Get soup info
    source_code = requests.get(url+'.html')
    soup = BeautifulSoup(source_code.text, 'html5lib')
    
    #Get topic title for page 1
    topic_title = soup.find('h2', class_ = "topic-title").text
    
    #Get post IDs
    original_post = soup.find('div', class_=["normal-bubble post_content post-bubble-outter-padding bg1",
                                      "normal-bubble post_content post-bubble-outter-padding bg2"], id=True)

    for a in soup.find_all('div', class_=["normal-bubble post_content post-bubble-outter-padding bg1",
                                      "normal-bubble post_content post-bubble-outter-padding bg2"], id=True): 
        post_id_container.append(a['id'])
        original_post_id_container.append(original_post['id'])
        topic_titles_posts.append(topic_title)
    
    #Get post text
    for element in soup.find_all("div", class_="content noskim"):
        post_text.append(element.text)
    
    #Get Post Time
    for a in soup.find_all('span', class_="timespan", title=True): 
        if a.text: 
            post_time.append(a['title'])
    
    #Get poster name
    for element in soup.find_all('script', text = re.compile("POST_AUTHOR")):

        J = str(element)
        J1 = J.split('{')
        J2 = J1[1].split(',')
        post_author.append(J2[0].split(':'))

    print('finished page {}'.format(topic_title))
    
#     # Check if next button is present
    next_button = soup.find('i', class_="icon fa-chevron-right fa-fw")

#     if next_button:
#         print("next url is " + url + '-s{}.html'.format(40))
#         continue 
#     else:
#         print('no more pages')
#         return None
    
    ###################
    # Loop Thru Pages #
    ###################    

    if next_button:
        s = 40
        counter = 2

        while True:

            source_code = requests.get(url + '-s{}.html'.format(s))
            soup = BeautifulSoup(source_code.text, 'html5lib')

            #Get topic title for page 1
            topic_title = soup.find('h2', class_ = "topic-title").text
    
            #Get post IDs
            for a in soup.find_all('div', class_=["normal-bubble post_content post-bubble-outter-padding bg1",
                                              "normal-bubble post_content post-bubble-outter-padding bg2"], id=True): 
                post_id_container.append(a['id'])
                original_post_id_container.append(post_id_container[0])
                topic_titles_posts.append(topic_title)

            #Get post text
            for element in soup.find_all("div", class_="content noskim"):
                post_text.append(element.text)

            #Get Post Time
            for a in soup.find_all('span', class_="timespan", title=True): 
                if a.text: 
                    post_time.append(a['title'])

            #Get poster name
            for element in soup.find_all('script', text = re.compile("POST_AUTHOR")):

                J = str(element)
                J1 = J.split('{')
                J2 = J1[1].split(',')
                post_author.append(J2[0].split(':'))

            next_button = soup.find('i', class_="icon fa-chevron-right fa-fw")

            if next_button:
                s += 40
                print("done page {}".format(counter))
                print("next url is " + url + '-s{}.html'.format(s))
                counter += 1
            else:
                break 

            sleep(randint(5,10))
    else:
        return None

In [23]:
for i in df_discussion_links_no_dict_2.url:
    if i == 'https://vetsbenefits.net/supper-t180378':
        pass
    else:
        capture_post(i)

finished page ChampVA  Claims
finished page Monday Moring Roll Call
finished page PTSD Increase
finished page Old or new rating?
finished page VA Travel
finished page Received First Moderna Vaccine  Shot Wednesday
finished page Makes me sad
finished page Saturday Moring Roll Call
finished page Sunday Morning Roll Call
finished page Thursday Morning Roll Call 
finished page Why a C&P Exam for D2 when the VA has all the records and is supplying medication?
finished page Friday morning Roll Call 
finished page well i filed
finished page Blood in urine after pelvic salvage radiation
finished page Marine passed from lung cancer.
finished page Caregivers Program for the older Veterans
finished page VA Healthcare and Medicare options
finished page Will the VA cover a dental Implant now that im rated at 100%
done page 2
next url is https://vetsbenefits.net/will-the-va-cover-a-dental-implant-now-that-im-rat-t120465-s80.html
done page 3
next url is https://vetsbenefits.net/will-the-va-cover-a-de

finished page Criteria for VA to authorize a mattress purchase for TDIU veterans
finished page St Patrick's Day!
finished page DBQ Foot Exam Copy
finished page Notifications still coming in after all turned off
finished page Padded sox recommendation
finished page Morse Code Intercept Operators and hearing loss
finished page Reapplying for VA care giver program
finished page Survivor benefits and suicide...?
finished page The etymology of ‘f*ck’ and the war that popularized it
finished page Old farmer's lament
finished page Question about education benefits for dependents
finished page Clear and Unmistakable Error: What are the odds?
finished page Giving up IU
finished page VA Eval Form for Athlete's Foot
finished page Date last insured
finished page Direct Connection or Gulf War
finished page IU P&T vs 100% schedular
finished page Break in treatment, re-evaluation pending.
finished page help with discharge review
finished page Humor while stuck in the house
done page 2
next url is htt

finished page PLEX
finished page For all of you Navy Folks!
finished page Help Understanding An increase request
finished page How to report piece of [email protected] ex husband in 100% fraudulently
finished page Migraines secondary to TMD
finished page Opting out of a video hearing (cruiser?)
finished page Ratings changes over the years
finished page 3 Part Process Caregiver Program
finished page Trying NOT to lose claim effective date
finished page Board is ready!! Effective date ?
finished page Can't fix stupid
finished page Nexus letters
finished page Carla Wallenda dies
finished page Won tinnitus with no hearing loss
finished page Buying my first home.
finished page VA Disability rating prior to decision being made
finished page FEMALE VETS READ THIS
done page 2
next url is https://vetsbenefits.net/female-vets-read-this-t179780-s80.html
done page 3
next url is https://vetsbenefits.net/female-vets-read-this-t179780-s120.html
done page 4
next url is https://vetsbenefits.net/female-

done page 8
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s320.html
done page 9
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s360.html
done page 10
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s400.html
done page 11
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s440.html
done page 12
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s480.html
done page 13
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s520.html
done page 14
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s560.html
done page 15
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s600.html
done page 16
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s640.html
done page 17
next url is https://vetsbenefits.net/post-presumptive-awards-here-t54138-s680.html
done page 18
next url is https://vetsbenef

finished page So I been doing my homework for my smc remand
finished page Senior Citizen Texting Codes:
finished page retro payment with continued rfe?
finished page HOT PEB Question TDRL / VA disability
finished page 100% disability question, not P&T
finished page recent 100% P&T award question
finished page Left Knee Rating Clarification Needed
finished page Va care and medicare
finished page Female Marine recruits have arrived at Recruit Depot San Diego for training for the first time ever
finished page Age 55 and the VA
finished page Long wait for C&P
finished page Double standard for corpsman
finished page New C&P
finished page withdrawal condition
finished page VES ??
finished page Sleep disturbance
finished page Documented Pre-Diabetes - Denied HLR for DM2
finished page SSDI saying I'm longer disabled
finished page Haven't received a letter from bva about my remand
finished page P&T possibility
finished page Same last name question
finished page Sunday Morning Roll Call
finished

#### Zip and check outputs

In [25]:
# Zip all files
post_all_zip2 = zip(post_id_container, post_text, post_time,post_author,original_post_id_container, topic_titles_posts)#,
                  #original_post, original_post_time, original_post_poster_name) 
zipped_post2 = list(post_all_zip2) #Convert zip to list.
df_zipped_post3 = pd.DataFrame(zipped_post2, columns=['post_id_container','post_text','post_time','post_author',
                                                  'original_post_id_container', 'topic_title'])

In [26]:
df_zipped_post3.shape

(13378, 6)

In [27]:
df_zipped_post3.head(50)

Unnamed: 0,post_id_container,post_text,post_time,post_author,original_post_id_container,topic_title
0,post_content1512116,Hi I sent in my wife’s claims and EOB Jan 2nd still nothing from ChampVA how long does it take to get reimbursed I tried calling but it’s a 30 minute wait,"9:55 AM - Mar 22, 2021","[""POST_AUTHOR"", ""jrp419""]",post_content1512116,ChampVA Claims
1,post_content1512119,"Took about 3 months after applying and then sending in claims to be reimbursed.\n\nMonday morning is the worst time to call them. Yes, there is a wait time for anytime you call.","10:05 AM - Mar 22, 2021","[""POST_AUTHOR"", ""EKco22""]",post_content1512116,ChampVA Claims
2,post_content1512095,Yo\n\nHere and present for Monday.,"6:10 AM - Mar 22, 2021","[""POST_AUTHOR"", ""Chiefw3""]",post_content1512095,Monday Moring Roll Call
3,post_content1512096,happy monday morning every one\n\n\n\n\n\n\n\n\n,"6:30 AM - Mar 22, 2021","[""POST_AUTHOR"", ""Vetn""]",post_content1512095,Monday Moring Roll Call
4,post_content1512097,Good Morning All. \n\n\t\t\t\n\t\t \n \n \n \t \n\t\t\n\t\t\n\t\t\t,"7:23 AM - Mar 22, 2021","[""POST_AUTHOR"", ""jim1392""]",post_content1512095,Monday Moring Roll Call
5,post_content1512098,Morning folks,"7:41 AM - Mar 22, 2021","[""POST_AUTHOR"", ""Bigdmn""]",post_content1512095,Monday Moring Roll Call
6,post_content1512103,Morning all Happy Monday......since we're all getting vaccinated and toilet paper is limited....I might have room for one more\n\nSent from my SM-G970U using Tapatalk\n\n,"8:17 AM - Mar 22, 2021","[""POST_AUTHOR"", ""Truckmaster801""]",post_content1512095,Monday Moring Roll Call
7,post_content1512107,Hello from cool but sunny NC...Hope you'll have a good one.\n\n\n\nHello Springtime. Great to see these little critters back again.\n\n\n\t\t\t\n\t\t \n \n \n \t \n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t \n \n \n \t \n\t\t\n\t\t\n\t\t\t,"9:05 AM - Mar 22, 2021","[""POST_AUTHOR"", ""SkidRider 70""]",post_content1512095,Monday Moring Roll Call
8,post_content1512111,\n\nMorning everyone \n\n\nSent from my iPad using Tapatalk,"9:22 AM - Mar 22, 2021","[""POST_AUTHOR"", ""Mikesax""]",post_content1512095,Monday Moring Roll Call
9,post_content1512112,yo.,"9:25 AM - Mar 22, 2021","[""POST_AUTHOR"", ""BigDrocks""]",post_content1512095,Monday Moring Roll Call


### Create Subset
...enabling more targeted analysis

In [None]:
df_zipped_post4_2019_sept_up = df_zipped_post4[df_zipped_post4.month >= '2019-09']

In [52]:
pickle.dump(df_discussion_links_no_dict, open("df_discussion_links_no_dict_2020-03-14_11AM.p", "wb"))

df_discussion_links_no_dict_pickle = pickle.load(open("df_discussion_links_no_dict_2020-03-14_11AM.p", "rb"))

In [53]:
pickle.dump(df_zipped_post2, open("df_zipped_post2_2020-03-14_11AM.p", "wb"))

df_zipped_post2_pickle = pickle.load(open("df_zipped_post2_2020-03-14_11AM.p", "rb"))

In [54]:
df_zipped_post2_pickle.shape

(13177, 6)

In [28]:
pickle.dump(df_zipped_post3, open("df_zipped_post3_2020-03-22_11AM.p", "wb"))

df_zipped_post3_pickle = pickle.load(open("df_zipped_post3_2020-03-22_11AM.p", "rb"))

In [29]:
df_zipped_post3_pickle.shape

(13378, 6)