In [115]:
import json
import pickle
import sys
import time as time
#from time import sleep
from html.parser import HTMLParser
from random import randint
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

# HTTP headers passed to requests.get() by SearchEngine.search for every search request.
USER_AGENT = {'User-Agent':'Emil'}

#class for scraping
class SearchEngine:
    """Scrapes a Bing result page and extracts the result links.

    Both methods are static: they are invoked on the class itself
    (``SearchEngine.search(q)``) and keep no per-instance state.
    """

    @staticmethod
    def search(query, sleep=True):
        """Fetch one Bing result page for ``query`` and return its result links.

        Args:
            query: Search phrase. Runs of whitespace are collapsed and the
                phrase is URL-encoded before being placed in the query string.
            sleep: When true, pause for a random 200-400 seconds before the
                request so pages are not loaded too quickly.

        Returns:
            List of href strings in page order, as produced by
            ``scrape_search_results``.

        Raises:
            requests.exceptions.RequestException: if the request fails or
                does not complete within the timeout.
        """
        if sleep: #prevents loading too many pages too soon
            time.sleep(randint(200, 400))
        # quote_plus joins words with '+' exactly like the previous manual join,
        # but also escapes reserved characters ('&', '#', '?', ...) that would
        # otherwise truncate or corrupt the query string.
        encoded_query = quote_plus(' '.join(query.split()))
        url = 'https://www.bing.com/search?q=' + encoded_query + '&count=30'
        print(url)
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(url, headers=USER_AGENT, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        return SearchEngine.scrape_search_results(soup)

    @staticmethod
    def scrape_search_results(soup):
        """Collect the first link of every ``<li class="b_algo">`` element.

        Args:
            soup: Parsed page exposing ``find_all`` (a BeautifulSoup document).

        Returns:
            List of href strings in document order. Entries without an ``<a>``
            tag, or whose anchor has no href, are skipped instead of raising
            or yielding ``None``.
        """
        results = []
        for result in soup.find_all('li', {'class': 'b_algo'}):
            anchor = result.find('a')
            if anchor is None:
                # Result block without a link: nothing to record.
                continue
            link = anchor.get('href')
            if link:
                results.append(link)
        return results
        
#read the queries into an array
def load_queries(path='100QueriesSet1_small.txt'):
    """Read search queries from a text file, one query per line.

    Args:
        path: File to read. Defaults to the small query set; pass
            '100QueriesSet1.txt' for the full set.

    Returns:
        List of queries with surrounding whitespace removed. Blank lines are
        skipped so they do not turn into empty queries.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r') as queries_file:
        stripped_lines = (line.strip() for line in queries_file)
        return [query for query in stripped_lines if query]

#read the json Google reference results
def load_ref(path="Google_Result1.json"):
    """Load the reference results from a JSON file.

    Args:
        path: JSON file to read. Defaults to "Google_Result1.json".

    Returns:
        The decoded JSON content. dict2array indexes it by query string.
    """
    # The context manager closes the file even if decoding fails.
    with open(path, "r") as ref_file:
        return json.load(ref_file)

#convert python dict to an array
def dict2array(queries, ref_dict):
    """Return the values of ``ref_dict`` ordered like ``queries``.

    Args:
        queries: Iterable of keys to look up.
        ref_dict: Mapping from query to its reference results.

    Returns:
        List with ``ref_dict[q]`` for every ``q`` in ``queries``, in order.

    Raises:
        KeyError: if a query is not a key of ``ref_dict``.
    """
    return [ref_dict[query] for query in queries]

#convert to lowercase > remove http or https > remove :// > remove www. > remove / at the end for search results or reference files 
def _normalize_url(url):
    """Lower-case one URL and strip its scheme, a leading 'www.' and one trailing '/'."""
    url = url.lower()
    # Strip the scheme only when it is really there. Unconditionally dropping
    # three characters for '://' used to eat the start of scheme-less URLs.
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith('www.'):
        url = url[len('www.'):]
    if url.endswith('/'):
        url = url[:-1]
    return url


def process_results(results_all):
    """Normalize every URL so equivalent links compare equal.

    Each URL is lower-cased, then a leading 'http://' or 'https://', a leading
    'www.' and a single trailing '/' are removed. URLs without a scheme are
    left intact apart from the other steps.

    Args:
        results_all: List of URL lists (one inner list per query).

    Returns:
        New list of lists with the same shape, holding the normalized URLs.
    """
    return [[_normalize_url(url) for url in urls] for urls in results_all]

#remove duplicates, get first 10 results
def remove_duplicates_truncate(result, result_original, limit=10):
    """Drop repeated URLs and keep at most ``limit`` of them.

    ``result`` and ``result_original`` are walked in parallel. Duplicates are
    detected on ``result`` only, and the matching entry of ``result_original``
    is kept or dropped together with it.

    Args:
        result: Normalized URLs for one query.
        result_original: The same URLs before normalization, in the same order.
        limit: Maximum number of unique URLs to keep (default 10).

    Returns:
        Tuple ``(unique_normalized, unique_original)`` of equally long lists,
        each with at most ``limit`` entries, in first-seen order.
    """
    res = []
    res_o = []
    for r, r_o in zip(result, result_original):
        if len(res) >= limit:
            # Enough unique entries collected; the rest cannot be used.
            break
        if r not in res:
            res.append(r)
            res_o.append(r_o)
    return res, res_o

#apply remove_duplicates_truncate to each query's result list (processed and original URLs stay aligned)
def filter_results(results, results_original):
    """Run remove_duplicates_truncate on every query's result list.

    Args:
        results: One list of normalized URLs per query.
        results_original: The matching lists of un-normalized URLs.

    Returns:
        Tuple ``(filtered_normalized, filtered_original)``; each is a list with
        one filtered list per query, in input order.
    """
    filtered_pairs = [
        remove_duplicates_truncate(processed, original)
        for processed, original in zip(results, results_original)
    ]
    results_f = [pair[0] for pair in filtered_pairs]
    results_f_original = [pair[1] for pair in filtered_pairs]
    return results_f, results_f_original

def dump_search_results(results_all_f, queries):
    """Write the per-query results to 'hw1.json' as a query -> results mapping.

    Args:
        results_all_f: Result lists, indexed in the same order as ``queries``.
        queries: Query strings used as the JSON object keys.

    Raises:
        IndexError: if ``results_all_f`` has fewer entries than ``queries``.
    """
    # Build the whole mapping first so a bad index fails before the file is touched.
    out = {query: results_all_f[position] for position, query in enumerate(queries)}
    with open('hw1.json', 'w') as json_file:
        json.dump(out, json_file, indent=4)
        
#calculate URL matches (n) 
def find_overlap(references, results):
    """Find, per query, which reference URLs also appear in the scraped results.

    Each scraped result position is matched at most once, so a URL that is
    repeated in the reference list cannot inflate the overlap count.

    Args:
        references: One list of reference URLs per query.
        results: One list of scraped URLs per query, aligned with ``references``.

    Returns:
        Tuple ``(all_index_ref, all_index_res)``. For every query both hold
        equally long lists: the positions of the matches in the reference list
        and the corresponding positions in the scraped list.
    """
    all_index_ref = []
    all_index_res = []

    for ref, res in zip(references, results):
        # First position of every scraped URL (same position list.index would
        # return), built once instead of rescanning the list per reference URL.
        first_position = {}
        for position, url in enumerate(res):
            first_position.setdefault(url, position)

        index_ref = [] #index as it appears in the reference solution
        index_res = [] #index as it appears in the scraped solution
        matched_positions = set()

        for index_in_ref, url in enumerate(ref):
            index_in_res = first_position.get(url)
            if index_in_res is None or index_in_res in matched_positions:
                # Not scraped at all, or already paired with an earlier
                # duplicate of this reference URL.
                continue
            matched_positions.add(index_in_res)
            index_res.append(index_in_res)
            index_ref.append(index_in_ref)

        all_index_ref.append(index_ref)
        all_index_res.append(index_res)

    return all_index_ref, all_index_res

def _sum(arr): 
    sum=0
    for i in arr:
        sum = sum + i
    return(sum) 

def sum_of_squares(x, y):
    """Return the sum of squared element-wise differences between ``x`` and ``y``.

    The sequences are paired with zip, so extra items of the longer one are
    ignored.
    """
    deltas = []
    for left, right in zip(x, y):
        deltas.append(left - right)
    return _sum([delta**2 for delta in deltas])

def rho(n, sos):
    """Evaluate 1 - 6*sos / (n*(n^2 - 1)).

    Args:
        n: Number of paired ranks.
        sos: Sum of squared rank differences.

    Raises:
        ZeroDivisionError: when ``n*(n**2 - 1)`` is zero (n of 0, 1 or -1).
    """
    denominator = n*((n**2)-1)
    scaled_differences = 6.0*sos
    return 1 - (scaled_differences/denominator)

#per-query overlap count (n), percent overlap and Spearman coefficient, computed from the squared rank differences (d^2)
def calculate_metric(all_index_ref, all_index_res):
    """Compute overlap statistics for every query.

    Args:
        all_index_ref: Per query, positions of the matched URLs in the reference list.
        all_index_res: Per query, positions of the same URLs in the scraped list.

    Returns:
        List of ``(n, percentage, rho)`` tuples, one per query, where ``n`` is
        the number of matches, ``percentage`` is ``n`` expressed against a
        fixed 10 results, and ``rho`` is the coefficient returned by ``rho()``
        or one of the special-case values below.
    """
    metrics = []
    for ref_positions, res_positions in zip(all_index_ref, all_index_res):
        squared_diff_total = sum_of_squares(ref_positions, res_positions)
        overlap = len(ref_positions)
        percent_overlap = (overlap/10.0)*100
        if overlap == 0:
            # NOTE(review): no overlap is scored as 1.0 here; confirm this is
            # the intended convention for an undefined coefficient.
            coefficient = 1.0
        elif overlap == 1:
            # The formula divides by zero for a single pair, so it is scored
            # directly: same position -> 1.0, different position -> 0.0.
            coefficient = 1.0 if squared_diff_total == 0 else 0.0
        else:
            coefficient = rho(overlap, squared_diff_total)
        metrics.append((overlap, percent_overlap, coefficient))
    return metrics

def calculate_averages(metrics):
    """Average each of the three metric columns across all queries.

    Args:
        metrics: List of 3-item tuples as produced by calculate_metric.

    Returns:
        List of three floats: the mean of column 0, column 1 and column 2.

    Raises:
        ZeroDivisionError: if ``metrics`` is empty.
    """
    averages = []
    for column in range(3):
        column_total = 0
        for row in metrics:
            column_total = column_total + row[column]
        averages.append(1.0*column_total/len(metrics))
    return averages

def write_output(metrics, averages, path='hw1.csv'):
    """Write the per-query metrics and their averages as a CSV report.

    Args:
        metrics: List of ``(n, percentage, rho)`` tuples, one per query.
        averages: Three values (average n, percentage and rho) for the last row.
        path: Destination file. Defaults to 'hw1.csv'.
    """
    # The context manager closes (and flushes) the file. The previous bare
    # `output.close` lacked parentheses and never actually closed it.
    with open(path, 'w') as output:
        output.write('Queries, Number of Overlapping Results, Percent Overlap, Spearman Coefficient\n')
        for query_number, m in enumerate(metrics, start=1):
            output.write('Query '+str(query_number)+', '+str(m[0])+', '+str(m[1])+', '+str(m[2])+'\n')
        output.write('Averages, '+str(averages[0])+', '+str(averages[1])+', '+str(averages[2])+'\n')

In [2]:
#load queries and reference answers
queries = load_queries()
ref_dict = load_ref()

#scrape URLs from the webpage (one request per query; SearchEngine.search sleeps between requests)
results_all = []
for idx, q in enumerate(queries):
    links = SearchEngine.search(q)
    # idx and len(...) are ints: they must be converted before concatenation,
    # otherwise this line raises TypeError.
    print("Query: " + str(idx) + " No of results returned: " + str(len(links)))
    results_all.append(links)

https://www.bing.com/search?q=How+do+you+replace+coolant+thermostat&count=30
No of results returned:  30
https://www.bing.com/search?q=How+is+library+science+vand+information+science+related&count=30
No of results returned:  29
https://www.bing.com/search?q=Which+phase+is+the+non+dividing+stage&count=30
No of results returned:  28
https://www.bing.com/search?q=How+much+has+michael+vick+worth+after+release+from+prison&count=30
No of results returned:  17


In [120]:
# Save the scraped links to disk so the slow scraping cell does not have to be re-run.
with open('results.pkl','wb') as f:
    pickle.dump(results_all, f)
    
# Read the saved links back into results_all.
# NOTE(review): pickle.load can execute arbitrary code; only load a results.pkl written by this notebook.
with open('results.pkl','rb') as f:
    results_all = pickle.load(f)

In [121]:
####### TASK 1 #######

#normalize the scraped URLs and the reference URLs the same way so they can be compared
results_all_p = process_results(results_all)
ref_array = dict2array(queries, ref_dict)  # reference lists in the same order as queries
ref_array_p = process_results(ref_array)

#remove duplicates and keep the first 10 per query; the original (un-normalized) URLs are filtered in parallel
results_all_p_f, results_all_f = filter_results(results_all_p, results_all)

#dump the filtered original URLs into hw1.json, keyed by query
dump_search_results(results_all_f, queries)

#find overlap: positions of the shared URLs in the reference lists and in the scraped lists
all_index_ref, all_index_res = find_overlap(ref_array_p, results_all_p_f)

In [122]:
####### TASK 2 #######

# Per-query (overlap count, percent overlap, Spearman coefficient), then the average of each column.
metrics = calculate_metric(all_index_ref, all_index_res)
averages = calculate_averages(metrics)

# Write the per-query rows and the averages row to hw1.csv.
write_output(metrics, averages)