# Web Scraping for Reddit & Predicting Comments

Your method for acquiring the data will be scraping the 'hot' threads as listed on the [Reddit homepage](https://www.reddit.com/). You'll acquire information about each thread:
1. The title of the thread
2. The subreddit that the thread corresponds to
3. The length of time it has been up on Reddit
4. The number of comments on the thread

Once you've got the data, you will build a classification model that, using Natural Language Processing and any other relevant features, predicts whether or not a given Reddit post will have above or below the 75th percentile number of comments.

### Scraping Thread Info from Reddit.com

#### Set up a request (using requests or chromedriver) to the URL below. Use BeautifulSoup to parse the page and extract all results

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib
import pandas as pd

In [2]:
url = "http://www.reddit.com"
r = requests.get(url)
#request gets the HTML (in this case), gets stuff from websites

In [3]:
HTML = r.text

In [4]:
#lxml is the parser of HTML for python
soup = BeautifulSoup(HTML, 'lxml')

In [7]:
from selenium import webdriver
from selenium.webdriver import Chrome

driver = webdriver.Chrome(executable_path = './chromedriver')

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


In [8]:
driver.get(url)

NameError: name 'driver' is not defined

In [9]:
from time import sleep

In [10]:
time.sleep(5)
content = driver.page_source
time.sleep(5)
print content

NameError: name 'time' is not defined

In [None]:
soup = BeautifulSoup(content, 'lxml')

In [None]:
driver.close()

In [None]:
print soup.find_all('a', {'data-event-action': 'title'})

While this has some more verbose elements removed, we can see that there is some structure to the above:
- The thread title is within an `<a>` tag with the attribute `data-event-action="title"`.
- The time since the thread was created is within a `<time>` tag with attribute `class="live-timestamp"`.
- The subreddit is within an `<a>` tag with the attribute `class="subreddit hover may-blank"`.
- The number of comments is within an `<a>` tag with the attribute data-event-action="comments"`.

In [None]:
#find the html code classified under the tag "a"
print soup.find('a',{'data-event-action':'title'})

In [None]:
#start printing only the text with the tag of "a
print soup.find('a',{'data-event-action':'title'}).get_text()

In [None]:
#pull out time since posted
print soup.find('time',{'class':'live-timestamp'}).get_text()

In [None]:
#pull out subreddit
print soup.find('a',{'class':'subreddit hover may-blank'}).get_text()

In [None]:
#pull out number of comments
print soup.find('a',{'data-event-action':'comments'}).get_text()

## Write 4 functions to extract these items (one function for each): title, time, subreddit, and number of comments.Â¶

##### - Make sure these functions are robust and can handle cases where the data/field may not be available.


In [None]:
def get_title(html):
    #html is NOT the "HTML" that we found, it is the "soup"
    title_list =[]
    for x in html.findAll('p', {'class':'title'})[1:]:
        title_list.append(x.text)
    return title_list

In [None]:
titles = get_title(soup)
titles
#this is a list, in our df it will be as a dataframe

In [None]:
titles[5]
#trying to pull one title from the list

In [None]:
def get_time(html):
    #html is NOT the "HTML" that we found, it is the "soup"
    time_list =[]
    for x in html.findAll('time',{'class':'live-timestamp'}):
        time_list.append(x.text)
    return time_list

In [None]:
def get_subreddit(html_soup):
    #html is NOT the "HTML" that we found, it is the "soup"
    subreddit_list =[]
    for x in html_soup.findAll('a',{'class':'subreddit hover may-blank'}):
        try:
            subreddit_list.append(x.text)
        except:
            subreddit_list.append('ERROR')
    return subreddit_list

In [None]:
subreddits = get_subreddit(soup)
subreddits

In [None]:
def get_comments(html_soup):
    #html is NOT the "HTML" that we found, it is the "soup"
    comment_list =[]  
    for x in html_soup.findAll('a',{'data-event-action':'comments'}):
        try:
            comment_list.append(x.text)
        except:
            comment_list.append('ERROR')
    return comment_list

In [None]:
number_comments = get_comments(soup)
number_comments

In [None]:
dict_1 = {'titles':titles, 'posted':times, 'subreddit':subreddits, 'comments':number_comments}
df_firstpage = pd.DataFrame(dict_1)

In [None]:
df_firstpage

In [None]:
df_firstpage.to_csv('reddit_onepage.csv',encoding= 'utf-8', index=False)

In [None]:
#df get_upvotes()

In [None]:
def reddit_function(soup):
    title = get_title(soup)
    times = get_time(soup)
    subreddit = get_subreddit(soup)
    comments = get_comments(soup)
    #upvotes = get_upvotes(soup)
    dic = {'titles':title, 'posted':times, 'subreddit':subreddit, 'comments':comments}
    if len(title) == len(times) == len(subreddit) == len(comments):
        df = pd.DataFrame(dic)
        return df

In [None]:
reddit_function(soup)

## Write one more function that finds the `id` on the page, and stores it.

In [None]:
#load up 300 pages on one page, need to use selenium to combine the pages
from selenium import webdriver

In [None]:
url = 'http://www.reddit.com/'

In [None]:
driver = webdriver.Chrome('./chromedriver 2')
driver.get(url)

In [None]:
#all of the HTML for all of the pages we designate
html= driver.page_source
html

In [None]:
full_soup= BeautifulSoup(html, 'lxml')

In [None]:
driver.close()

In [None]:
import re
#result.find(id=re.compile("thing"))

In [None]:
full_soup.find(id= re.compile('thing'))['id'][6:]

In [None]:
import re
def get_lastID(mysoup):
    return mysoup.find(id= re.compile('thing'))['id'][6:]

In [None]:
get_lastID(full_soup)

## Now, let's put it all together.

Use the functions you wrote above to parse out the 4 fields - title, time, subreddit, and number of comments. Create a dataframe from the results with those 4 columns.

In [None]:
url_template = "http://www.reddit.com/?count={}&after={}"
max_results = 100 # Set this to a high-value (5000) to generate more results. 
# Crawling more results, will also take much longer. First test your code on a small number of results and then expand.

results = []

for start in range(0, max_results, 25):
    # Grab the results from the request (as above)
    # Append to the full set of results
    pass

In [None]:
def reddit_scrapper(website):
    
    driver = webdriver.Chrome(executable_path="./chromedriver 2")
    driver.get(website)
    
    time.sleep(1)
    
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')
    
    
    ids=[]
    for x in soup.find('div', {'class': 'thing'}):
        ids.append(x['data-fullname'])
    
    new_list = zip(ids,range(25,900,25))

    for i, n in new_list:
        
        url_template = "http://www.reddit.com/?count={}&after={}".format(n,i)
        driver.get(url_template)
        time.sleep(3)
        
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        #return reddit_function(soup)
        
        time.sleep(3)
        
    driver.close()
    return ids
    #return reddit_function(soup)
    

In [None]:
title = []
subreddit = []
times = []
comment = []
domain = []

def all_elements(x):
  
    for i in soup.findAll('a',{'data-event-action': 'title'}):
        title.append(i.text)
    for i in soup.findAll('a',{'class': 'subreddit hover may-blank'}):
        subreddit.append(i.text)
    for i in soup.findAll('time',{'class': 'live-timestamp'}):
        times.append(i.text)
    for i in soup.findAll('a',{'class':'bylink comments may-blank'}):
        comment.append(i.text)
    for i in soup.findAll('span',{'class':'domain'}):
        domain.append(i.text.replace("(","").replace(")",""))
       
    return pd.DataFrame(zip(title[1:], subreddit, times, comment, domain),
                       columns=['title', 'subreddit','times', 'comment', 'domain',])

In [None]:
pages = range(1,30)
# starting url 
url = 'https://www.reddit.com/'    
for page in pages: 
    # Instantiate a new driver every loop
    driver = webdriver.Chrome(executable_path = './chromedriver 2')
    driver.get(url)
    html = driver.page_source
    # Put the page HTML in a soup object
    soup = BeautifulSoup(html, 'lxml')
    # overwrite the url with the url that the "Next" link points to.
    url = soup.find('span', {'class':'next-button'}).a['href']
    print url
    # Close out the driver
    df = pd.concat([df,reddit_function(soup)])
    driver.close()
    # Sleeping 
    sleep(5)

In [None]:
df.to_csv('reddit_df.csv', encoding = 'utf-8',index=False)
df = df.drop_duplicates()

In [None]:
df_20.to_csv('reddit_df_20.csv',encoding= 'utf-8', index=False)

In [None]:
def get_endings(website):
    driver = webdriver.Chrome(executable_path="./chromedriver 2")
    driver.get(website)
    
    time.sleep(1)
    
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')

    ids = []
    for x in soup.findAll('div', {'class': 'thing'}):
        ids.append(x['data-fullname'])
    print ids
    

In [None]:
get_endings('http://www.reddit.com')

In [None]:
list_ids = get_endings('http://www.reddit.com')
print list_ids

In [None]:
ids = get_endings('http://www.reddit.com')
df = pd.DataFrame()

full_list = zip(range(25,4000,25),ids)

for i, n in full_list:
        url_template = "http://www.reddit.com/?count={}&after={}".format(n,i)
        print url_template
        driver = webdriver.Chrome(executable_path="./chromedriver 2")
        driver.get(url_template)
        time.sleep(3)
        
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        time.sleep(3)
        df = pd.concat([df,reddit_function(soup)])
        #df.to_csv('file_name', encoding='utf-8', index=False)
        
        driver.close()
df

In [None]:
url_template = '...?count{}&after={}'

### Save your results as a CSV
You may do this regularly while scraping data as well, so that if your scraper stops or if your computer crashes, you don't lose all your data.

In [None]:
# Export to csv
import pandas as pd
scraping_results= pd.read_csv('./scraping_results.csv')
scraping_results.head()

In [None]:
scraping_results.info()

In [None]:
df = scraping_results.drop(scraping_results[['Unnamed: 0', 'created_at','time_now']],axis=1)

In [None]:
df.head()

In [None]:
df.time = pd.to_datetime(df.time_delta)

In [None]:
days = df.time_delta.days
# hours, remainder = divmod(td.seconds, 3600)
# minutes, seconds = divmod(remainder, 60)
# # If you want to take into account fractions of a second
# seconds += td.microseconds / 1e6