In [1]:
#Below is my code for the crawl spider. It crawls the pages of the sports website bleacherreport.com
#To run the spider, cd into the bleacherreport parent directory and run the command: 'scrapy crawl sports -o data.csv -t csv',
#Which will save the resulting output to a csv file called 'data.csv'
#Created using the guide 'https://medium.com/python-pandemonium/develop-your-first-web-crawler-in-python-scrapy-6b2ee4baf954'

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.exceptions import CloseSpider

class BleacherreportItem(scrapy.Item):
    """One scraped page: its own URL plus the URL of the page that referred to it."""
    parent_url = scrapy.Field() #URL of the page that referred to this page
    url = scrapy.Field() #URL of the page that was scraped


class SportsSpider(CrawlSpider):
    """Crawl bleacherreport.com and emit one (url, parent_url) item per page.

    Every extracted link is followed except those whose URL matches 'users'.
    The spider asks Scrapy to shut down once MAX_PAGES pages have been
    turned into items.
    """
    name = 'sports'
    allowed_domains = ['www.bleacherreport.com']
    start_urls = ['http://www.bleacherreport.com/']

    rules = (
        # Crawl every page except those that have the word 'users' in the URL.
        # BleacherReport allows users to create profiles, and we're not
        # interested in crawling through all of the many profiles that exist.
        Rule(LinkExtractor(allow=(), deny="users"),
             callback="parse_item",
             follow=True),
    )

    # Maximum number of pages that are turned into items.
    MAX_PAGES = 30000
    # Number of pages turned into items so far.
    page_count = 0

    def start_requests(self):
        """Return the initial requests, each carrying its own URL as Referer.

        Setting the header explicitly means the start pages also have a
        Referer for parse_item to read.
        """
        return [
            Request(url=start_url, headers={'Referer': start_url})
            for start_url in self.start_urls
        ]

    def parse_item(self, response):
        """Yield a BleacherreportItem for the page in ``response``.

        Raises:
            CloseSpider: once MAX_PAGES pages have already been emitted.
        """
        # '>=' keeps the total at exactly MAX_PAGES items; the previous '>'
        # comparison let one extra page through.
        if self.page_count >= self.MAX_PAGES:
            raise CloseSpider('Have scraped a sufficient number of pages')
        self.page_count += 1

        # Resolve both URLs *before* the progress report below uses them
        # (they were previously read before assignment, which raised
        # UnboundLocalError on every 1000th page).
        url = response.url
        referer = response.request.headers.get('Referer', None)
        # A request without a Referer header yields None instead of crashing
        # on None.decode().
        parent_url = referer.decode('utf-8') if referer is not None else None

        if self.page_count % 1000 == 0:
            print(self.page_count)
            print('URL: ' + url)
            print('Parent URL: ' + str(parent_url))

        item = BleacherreportItem()
        item['parent_url'] = parent_url
        item['url'] = url
        yield item


In [2]:
import numpy as np
import pandas as pd

In [3]:
## Read the crawl output (one row per scraped page) into a dataframe

data = pd.read_csv('data.csv')

# shape[0] is the row count of the dataframe
print('The number of rows in this dataframe is ' + str(data.shape[0]))
data.head()

The number of rows in this dataframe is 18178


Unnamed: 0,url,parent_url
0,http://www.bleacherreport.com/mobile,http://www.bleacherreport.com/
1,http://www.bleacherreport.com/golf,http://www.bleacherreport.com/
2,http://www.bleacherreport.com/college-basketball,http://www.bleacherreport.com/
3,http://www.bleacherreport.com/tennis,http://www.bleacherreport.com/
4,http://www.bleacherreport.com/nascar,http://www.bleacherreport.com/


In [4]:
#Assign every distinct page a row/column position for the connectivity graph and
#stochastic matrix. Positions are handed out in order of first appearance, looking at
#the parent page of each row before the page itself.

indexes = {}
j = 0

for parent_page, child_page in zip(data['parent_url'], data['url']):
    for page in (parent_page, child_page):
        if page not in indexes:
            indexes[page] = j
            j += 1

In [5]:
#Build the connectivity graph as a square matrix over all known pages. Entry [A][B] is 1
#when page A links to page B, and 0 when it does not.

connectivity_graph = np.zeros((len(indexes), len(indexes)))
for source_page, target_page in zip(data['parent_url'], data['url']):
    connectivity_graph[indexes[source_page], indexes[target_page]] = 1

In [6]:
# Column vector holding, for each page, its total number of outgoing links.
# Each row of the connectivity matrix is divided by its entry to give the stochastic matrix.
total_outgoing_links = connectivity_graph.sum(axis=1, keepdims=True)

# Pages without outgoing links get a divisor of 1 so that no division by zero occurs
# (their rows stay all-zero).
total_outgoing_links[total_outgoing_links == 0] = 1

stochastic_matrix = connectivity_graph / total_outgoing_links


In [7]:
#Define the pagerank algorithm. Its input is the stochastic matrix that was just created.
#Algorithm adapted from 'https://cs7083.wordpress.com/2013/01/31/demystifying-the-pagerank-and-hits-algorithms/'

def pagerank(H, theta=0.85, iterations=10):
    """Compute PageRank scores by power iteration.

    Parameters
    ----------
    H : array-like, shape (n, n)
        Link matrix in which row i holds the transition weights out of
        page i. Rows that are entirely zero are treated as dangling pages
        that jump to every page with probability 1/n.
    theta : float, optional
        Damping factor: weight given to following links, with (1 - theta)
        given to a uniform jump. Defaults to 0.85.
    iterations : int, optional
        Number of power-iteration steps. Defaults to 10.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Score of each page after ``iterations`` steps, starting from the
        uniform vector. An empty input returns an empty vector.
    """
    H = np.asarray(H, dtype=float)
    n = len(H)
    if n == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return np.zeros(0)

    # 1.0 for every page with no outgoing links, 0.0 otherwise.
    dangling = np.all(H == 0, axis=1).astype(float)

    rho = np.full(n, 1.0 / n)
    for _ in range(iterations):
        # Equivalent to rho . G with
        #   G = theta * (H + outer(dangling / n, 1)) + (1 - theta) * outer(1 / n, 1)
        # but without building any dense n x n temporaries: the dangling
        # and teleport terms each add the same scalar to every entry.
        dangling_share = np.dot(rho, dangling) / n
        teleport_share = rho.sum() / n
        rho = theta * (np.dot(rho, H) + dangling_share) + (1.0 - theta) * teleport_share
    return rho

In [8]:
#Create the pagerank vector. The value at position i is the pagerank score of the page
#that the 'indexes' dictionary maps to index i; a larger value means a more important page
#relative to the others.

pagerank_rankings = pagerank(stochastic_matrix)

In [9]:
#Sort the indexes by pagerank value, best first, and keep the leading 5.

top_5_pagerank_indexes = np.argsort(pagerank_rankings)[::-1][:5]
print('The top 5 pagerank indexes are ' + str(top_5_pagerank_indexes))

The top 5 pagerank indexes are [11949 12242 12238 11804 11924]


In [10]:
#Build the reverse lookup (index -> page) so that a page can be found from its index

inv_indexes = dict(zip(indexes.values(), indexes.keys()))

In [11]:
#Translate each of the top indexes back into its page via the inversed dictionary

top_5_pagerank_pages = [inv_indexes[index] for index in top_5_pagerank_indexes]

print('The top 5 pages according to the pagerank algorithm are: ')
for rank, page in enumerate(top_5_pagerank_pages, start=1):
    print(str(rank) + '.) ' + page)

The top 5 pages according to the pagerank algorithm are: 
1.) http://www.bleacherreport.com/wwe-fastlane/archives
2.) http://www.bleacherreport.com/san-antonio-stars/archives
3.) http://www.bleacherreport.com/football/archives
4.) http://www.bleacherreport.com/articles/2740848-ronda-rouseys-rumored-wrestlemania-bout-would-be-biggest-wwe-womens-match-ever
5.) http://www.bleacherreport.com/wwe-fastlane


In [12]:
#Now define the hits algorithm. It outputs 2 rankings, the Hits Authority ranking,
#and the Hits Hub ranking. The algorithm's input is the connectivity graph.
#Algorithm adapted from 'https://cs7083.wordpress.com/2013/01/31/demystifying-the-pagerank-and-hits-algorithms/'


def _scale_to_unit_sum(scores):
    """Return ``scores`` divided by its sum, or unchanged if the sum is zero."""
    total = scores.sum()
    # A zero total only happens when every score is zero (e.g. a graph with
    # no links); dividing would turn the whole vector into NaN.
    return scores / total if total else scores


def hits(A, iterations=5):
    """Compute HITS authority and hub scores by power iteration.

    Parameters
    ----------
    A : array-like, shape (n, n)
        Connectivity matrix; A[i][j] is non-zero when page i links to page j.
    iterations : int, optional
        Number of update steps. Defaults to 5.

    Returns
    -------
    tuple of numpy.ndarray
        ``(a, h)``: the authority scores and the hub scores, each of shape
        (n,) and each scaled to sum to 1 (all zeros if the graph has no links).
    """
    A = np.asarray(A, dtype=float)
    At = A.T
    n = len(A)
    a = np.ones(n)
    h = np.ones(n)
    for _ in range(iterations):
        # a . (At . A) evaluated as (a . At) . A, and h . (A . At) as
        # (h . A) . At: two matrix-vector products per update instead of
        # forming the dense n x n products At.A and A.At up front.
        a = _scale_to_unit_sum(np.dot(np.dot(a, At), A))
        h = _scale_to_unit_sum(np.dot(np.dot(h, A), At))
    return a, h


In [13]:
##Run hits on the connectivity graph; it returns the authority scores first and the hub scores second.
##The data set is large, so this call takes a while to run.

hits_authority_rankings, hits_hub_rankings = hits(connectivity_graph)

In [14]:
#For each of the 2 rankings, sort the indexes best first and keep the leading 5:

top_5_hits_authority_indexes = np.argsort(hits_authority_rankings)[::-1][:5]
top_5_hits_hub_indexes = np.argsort(hits_hub_rankings)[::-1][:5]

print('The top 5 Hits Authority indexes are ' + str(top_5_hits_authority_indexes))
print('The top 5 Hits Hub ranking indexes are ' + str(top_5_hits_hub_indexes))

The top 5 Hits Authority indexes are [6242 7255 7257 7258 7259]
The top 5 Hits Hub ranking indexes are [ 5046 11107 14555    25 11452]


In [15]:
#Translate the top indexes of both rankings back into pages with the 'inv_indexes' dictionary

top_5_hits_authority_pages = [inv_indexes[index] for index in top_5_hits_authority_indexes]
top_5_hits_hub_pages = [inv_indexes[index] for index in top_5_hits_hub_indexes]

print('The top 5 pages according to the hits authority ranking are: ')
for rank, page in enumerate(top_5_hits_authority_pages, start=1):
    print(str(rank) + '.) ' + page)

print('The top 5 pages according to the hits hub ranking are: ')
for rank, page in enumerate(top_5_hits_hub_pages, start=1):
    print(str(rank) + '.) ' + page)

The top 5 pages according to the hits authority ranking are: 
1.) http://www.bleacherreport.com/houston-dynamo
2.) http://www.bleacherreport.com/armenia-national-football
3.) http://www.bleacherreport.com/argentina
4.) http://www.bleacherreport.com/andorra-national-football
5.) http://www.bleacherreport.com/american-samoa-national-football
The top 5 pages according to the hits hub ranking are: 
1.) http://www.bleacherreport.com/world-football/teams
2.) http://www.bleacherreport.com/mma/teams
3.) http://www.bleacherreport.com/college-basketball/teams
4.) http://www.bleacherreport.com/college-football/teams
5.) http://www.bleacherreport.com/pro-wrestling/teams
