# HW3 - Scraping and Regression

In [None]:
# initial setup

from bs4 import BeautifulSoup
import urllib.request
import requests
import numpy as np
import scipy as sc
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import pandas as pd

### Part 1 - Data Acquisition

In [None]:
# url = "https://amarioguy.github.io/m1windowsproject/"

# # here we actually access the website
# with urllib.request.urlopen( url ) as response:
#     html = response.read()
#     html = html.decode( 'utf-8' )

# # save the file
# with open( 'm1windowsproject.html', 'w' ) as new_file:
#     new_file.write(html)

# soup = BeautifulSoup( html, 'html.parser' )

# Module-level accumulators that scraper() appends to, one entry per story.
rank, title_length, age_hours, points, comments_num = ([] for _ in range(5))

# Minutes represented by one unit of each age label the scraper recognises.
_MINUTES_PER_UNIT = {"minute": 1, "hour": 60, "day": 1440}


def _parse_age_hours(age_text):
    """Convert an age label such as "3 hours ago" into hours (0.0 if unparseable)."""
    tokens = age_text.split()
    if len(tokens) < 2:
        return 0.0
    # "hours" -> "hour", so singular and plural labels share one table entry.
    minutes_per_unit = _MINUTES_PER_UNIT.get(tokens[1].rstrip("s"))
    if minutes_per_unit is None:
        return 0.0
    try:
        return float(tokens[0]) * minutes_per_unit / 60
    except ValueError:
        return 0.0


def _leading_int(text):
    """Return the integer that starts *text* (e.g. "42 points" -> 42), or 0."""
    # str.split() also splits on the non-breaking space (\xa0) used in
    # labels such as "12\xa0comments".
    tokens = text.split()
    try:
        return int(tokens[0])
    except (IndexError, ValueError):
        return 0


def _scrape_page(page_url):
    """Fetch one listing page and append its stories to the module-level lists."""
    webpage = requests.get(page_url, timeout=30)
    # Fail loudly on an error response instead of silently parsing zero rows.
    webpage.raise_for_status()
    soup = BeautifulSoup(webpage.content, 'html.parser')

    # scrape rank ("12." -> 12)
    for hit in soup.find_all("span", class_="rank"):
        rank.append(int(hit.text.strip().replace(".", "")))

    # scrape length of title
    for hit in soup.find_all("a", class_="titlelink"):
        title_length.append(len(hit.text.strip()))

    # each "subtext" cell carries the age, points and comments of one story
    for row in soup.find_all("td", class_="subtext"):

        # scrape age in hours; exactly one value is appended per row so the
        # columns stay aligned even when the age is missing or unrecognised
        age_span = row.find("span", class_="age")
        hit_age = age_span.find("a") if age_span is not None else None
        age_label = hit_age.text if hit_age is not None else ""
        age_hours.append(_parse_age_hours(age_label))

        print(f"{age_label} {age_hours[-1]}")

        # scrape points (rows without a score element count as 0)
        hit_points = row.find("span", class_="score")
        points.append(0 if hit_points is None else _leading_int(hit_points.text))

        # scrape number of comments from the last link of the row; matching
        # on "comment" covers both "1 comment" and "N comments"
        links = row.find_all("a")
        last_label = links[-1].text if links else ""
        comments_num.append(_leading_int(last_label) if "comment" in last_label else 0)


def scraper(url, n_pages):
    """Scrape listing pages ``n_pages`` down to ``0`` into the module-level lists.

    For every page number ``p`` from ``n_pages`` down to ``0`` the page at
    ``f"{url}{p}"`` is downloaded and parsed. Results are appended to the
    module-level lists ``rank``, ``title_length``, ``age_hours``, ``points``
    and ``comments_num``; the lists are never cleared here, so calling the
    function again adds further rows to what is already stored.

    Parameters
    ----------
    url : str
        Listing URL prefix to which the page number is appended.
    n_pages : int
        Highest page number to fetch. A negative value fetches nothing.

    Raises
    ------
    requests.HTTPError
        If a page responds with an error status.
    """
    # NOTE(review): page 0 is requested as well, so n_pages=5 issues six
    # requests -- confirm whether page 0 duplicates page 1 on the target site.
    for page in range(n_pages, -1, -1):
        _scrape_page(f"{url}{page}")


In [None]:
# Listing URL prefix; scraper() appends the page number to it.
url = "https://news.ycombinator.com/news?p="

# Collect page numbers 5 down to 0 into the module-level lists.
scraper(url=url, n_pages=5)


In [None]:
# verify that all columns are the same length (150)

# print(age_hours)
# print(len(age_hours))

### Part 2 - Regression

In [None]:
# One row per scraped story; every list must have the same length or
# pandas refuses to build the frame.
dataset = pd.DataFrame(
    data={
        'points': points,
        'rank': rank,
        'title_length': title_length,
        'age_hours': age_hours,
        'comments_num': comments_num,
    }
)

dataset

In [None]:
# Ordinary least squares: rank explained by points (plus an intercept).
points_model = smf.ols(formula='rank ~ points', data=dataset)
results = points_model.fit()

results.summary()

In [None]:
# Ordinary least squares: rank explained by number of comments (plus an intercept).
# Author's note: the scrape may need to be re-run a few times before the
# dataset reaches 150 rows.
comments_model = smf.ols(formula='rank ~ comments_num', data=dataset)
results3 = comments_model.fit()

results3.summary()

In [None]:
# Ordinary least squares: rank explained by age in hours (plus an intercept).
age_model = smf.ols(formula='rank ~ age_hours', data=dataset)
results2 = age_model.fit()

results2.summary()

In [None]:
# Fitted intercept and slope of the rank ~ age_hours regression.
p = results2.params
print(p)

# Scatter of the raw data, with the fitted line drawn for 0-71 hours.
ax = dataset.plot(kind='scatter', x='age_hours', y='rank')
x = list(range(72))
fitted_rank = p.Intercept + p.age_hours * np.asarray(x, dtype=np.float64)
ax.plot(x, fitted_rank)
ax.set_xlim([0, 72])

#### Analysis

The most useful regression is rank against age in hours, which explains about 15% of the variance in rank according to its R². The regressions of rank on points and of rank on number of comments were both not statistically useful.

The least useful predictors are comments and points. The R² for both comments and points was very low and did not indicate a correlation between either of them and the ranking.

For age in hours, the Intercept coefficient (the predicted rank at 0 hours) is 57.3360, and for each additional hour of age the predicted rank increases by 1.5168. The regression summary indicates that this relation is statistically significant.

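For number of comments, the Intercept coefficient (the predicted rank at 0 comments) is 77.4995, and the slope is -0.0321: each additional comment changes the predicted rank by only about -0.03.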

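For points, the Intercept coefficient (the predicted rank at 0 points) is 76.9751, and the slope is -0.0121: each additional point changes the predicted rank by only about -0.01.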

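The p-value for the Intercept is reported as 0 in all three summaries. A p-value is the probability of seeing an estimate at least this extreme if the true value were zero, so a p-value near 0 means the estimate is unlikely to have come about by accident. Strictly speaking, the Intercept's p-value only shows that the Intercept itself differs from zero; whether the relationship between age in hours and rank, number of comments and rank, or points and rank is statistically significant is judged by the p-value of the corresponding slope coefficient.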

The relationship between age in hours and rank is not quite linear, but it is not nonexistent either. There is a clear positive trend, but it is probably not linear.

### Part 3 - Classification

In [None]:
# make categorical variables into dummy variables
dataset['front_page'] = [1 if x <= 30 else 0 for x in dataset['rank']]

dataset.tail()

In [None]:
# Logistic regression of the front_page indicator on points, title length,
# age in hours and number of comments.
front_page_model = smf.logit(
    formula='front_page ~ points + title_length + age_hours + comments_num',
    data=dataset,
)
log_reg = front_page_model.fit()

log_reg.summary()

In [None]:
# Linear predictor of the fitted logit model (intercept plus weighted
# features), computed for every story at once.
coef = log_reg.params
linear_predictor = (
    coef.Intercept
    + coef.points * dataset['points']
    + coef.title_length * dataset['title_length']
    + coef.age_hours * dataset['age_hours']
    + coef.comments_num * dataset['comments_num']
)
preds = linear_predictor.tolist()

dataset["my_preds"] = preds

ax = dataset.plot(kind='scatter', x='rank', y='my_preds')

# Horizontal reference line at -1.5 across ranks 0-149.
x = list(range(150))
ax.plot(x, [-1.5] * len(x))
ax.set_xlim([0, 150])

In [None]:
# HN ranking algorithm found on the web (points - 1)^0.8 / (age_hours + 2)^1.8

# Clamp the base at 0 before exponentiating: stories scraped without a score
# are stored with 0 points, and a negative base raised to the power 0.8
# evaluates to NaN, which would silently drop those stories from the plot.
vote_base = (dataset['points'] - 1).clip(lower=0)
preds = (vote_base ** 0.8 / (dataset['age_hours'] + 2) ** 1.8).tolist()

dataset["outdated_preds"] = preds

ax = dataset.plot(x='rank', y='outdated_preds', kind='scatter')

# Horizontal reference line at 0.75 across ranks 0-149.
x = list(range(150))
ax.plot(x, [0.75] * len(x))
ax.set_xlim([0, 150])


#### Analysis

My model does a decent job of predicting what will be on the front page. The coefficient for points is 0.0132; because it is positive, a story is more likely to be on the front page as its points increase. The age_hours coefficient, -0.2746, has a larger magnitude and shows that an older post is less likely to be on the front page. Per unit, age in hours has more impact than number of points.