# Create A Simple Search Engine Using Python
### Information retrieval using cosine similarity and term-document matrix with TF-IDF weighting.

The first thing we have to do is retrieve the documents from the Internet. In this case, we can use web scraping to extract documents from a website. I will scrape documents from the sport category of kompas.com, specifically the popular articles. Because the documents are in HTML format, we initialize a BeautifulSoup object to parse the HTML, which makes it much easier to extract each element we want.

In [31]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

def retrieve_docs_and_clean():
    """Scrape the article listing page and return the cleaned article texts.

    Collects every hyperlink inside the listing page's ``div.archive``
    container, downloads each linked page, joins the text of the ``<p>``
    elements found in its content container, and normalizes the result.

    Returns:
        list[str]: One cleaned string per article page whose content
        container was found. Each string is lowercased, with non-ASCII
        runs, ``@mentions``, punctuation and digits removed and repeated
        whitespace collapsed. The list is empty when no links are found.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Seconds to wait for a response; without a timeout a stalled server
    # would block the call indefinitely.
    timeout = 30

    # Scrape the website
    r = requests.get('https://skyandtelescope.org/astronomy-news/exoplanets/',
                     timeout=timeout)
    soup = BeautifulSoup(r.content, 'html.parser')

    # find() returns None when the container is absent, so guard before
    # chaining. 'a' is the hyperlink tag; href=True skips anchors that carry
    # no href attribute (which would otherwise raise KeyError).
    archive = soup.find('div', {'class': 'archive'})
    anchors = archive.find_all('a', href=True) if archive is not None else []
    links = [anchor['href'] + '?page=all' for anchor in anchors]

    # Retrieve Paragraphs
    documents = []
    for url in links:
        r = requests.get(url, timeout=timeout)
        page = BeautifulSoup(r.content, 'html.parser')

        # NOTE(review): confirm that 'read__content' matches the article
        # markup of the site being scraped. Pages without it are skipped
        # instead of raising AttributeError on None.
        content = page.find('div', {'class': 'read__content'})
        if content is None:
            continue
        documents.append(' '.join(p.text for p in content.find_all('p')))

    # Clean Paragraphs
    documents_clean = []
    for d in documents:
        # Replace runs of non-ASCII characters with a space
        document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
        # Remove @mentions
        document_test = re.sub(r'@\w+', '', document_test)
        document_test = document_test.lower()
        # Replace punctuation with spaces so adjacent words stay separated
        document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
        # Remove digits
        document_test = re.sub(r'[0-9]', '', document_test)
        # Collapse the whitespace runs left behind by the steps above
        document_test = re.sub(r'\s{2,}', ' ', document_test)
        documents_clean.append(document_test)

    return documents_clean

def get_similar_articles(q, df):
    """Print the documents that match a query, ranked by cosine similarity.

    Relies on two module-level names: ``vectorizer`` (used to transform the
    query into a term vector) and ``docs`` (the document texts, indexed by
    the column labels of ``df``).

    Args:
        q: The query string.
        df: Term-document matrix as a DataFrame, one row per term and one
            column per document.

    Returns:
        None. The query, and each document with a non-zero similarity
        (highest first), are printed to stdout.
    """
    print("Query:", q)
    print('-'*100)

    # Convert the query into a vector with one entry per row (term) of df
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)

    # Calculate the similarity for every document column, not a fixed count
    sim = {}
    for col in df.columns:
        doc_vec = df.loc[:, col].values
        # Cosine similarity divides by the PRODUCT of both norms; the
        # parentheses matter because `a / b * c` evaluates as `(a / b) * c`.
        denom = np.linalg.norm(doc_vec) * q_norm
        # A zero norm (query with no known terms, or an all-zero document)
        # means no overlap; score it 0 instead of dividing by zero.
        sim[col] = np.dot(doc_vec, q_vec) / denom if denom else 0.0

    # Sort the values
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)

    # Print the articles and similarity values
    for k, v in sim_sorted:
        if v != 0.0:
            print("Highest Cosine Similarity:", v)
            print(docs[k])
            print()

## Term-Document Matrix

The term-document matrix has one row for each token (term) found across all documents and one column for each document identifier. Each cell holds the frequency of that term in that document, weighted by some factor (here, TF-IDF).

In [32]:
# Scrape and clean the articles; each list element is one document
docs = retrieve_docs_and_clean()

# Create Term-Document Matrix with TF-IDF weighting

# Instantiate a TfidfVectorizer object
vectorizer = TfidfVectorizer()
# Fit the vocabulary on the documents and transform them into a
# document-term matrix (one row per document)
X = vectorizer.fit_transform(docs)

# Create a DataFrame and set the vocabulary as the index.
# X is transposed so that rows are terms and columns are documents.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
df.head()

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
# Sample query to rank the scraped documents against
q1 = 'barcelona'

# Prints each document with a non-zero similarity score, highest first
get_similar_articles(q1, df)



Query: barcelona
----------------------------------------------------------------------------------------------------
Highest Cosine Similarity: 0.06796391501095142
kompas com hasil liga champions pada selasa hingga rabu memastikan dua tim melaju ke babak besar kedua tim itu adalah juventus dan bayern muenchen kepastian juventus dan bayern muenchen lolos fase knockout didapat setelah melakoni pertandingan kontra lawan masing masing pada matchday keempat fase grup liga champions musim ini juventus yang menjamu zenit st petersburg di allianz stadium rabu dini hari wib meraih kemenangan bianconeri julukan juventus menang meyakinkan berkat paulo dybala gol federico chiesa dan alvaro morata baca juga hasil juventus vs zenit bianconeri lolos ke besar liga champions sementara itu gol gol zenit tercipta dari gol bunuh diri leonardo bonucci dan aksi sardar azmoun berkat kemenangan ini juventus makin kokoh di puncak klasemen grup h liga champions dapatkan informasi inspirasi dan insight di email

In [27]:
# Fetch the listing page on its own so its HTML can be inspected
r = requests.get('https://skyandtelescope.org/astronomy-news/exoplanets/')
# Parse the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(r.content, 'html.parser')

In [28]:
# Display the parsed document to inspect the page's markup
soup

<!DOCTYPE html>

<!--[if lt IE 7]><html class="no-js ie ie6 lt-ie9 lt-ie8 lt-ie7" lang="en-US" xmlns:fb="https://www.facebook.com/2008/fbml" xmlns:addthis="https://www.addthis.com/help/api-spec" > <![endif]-->
<!--[if IE 7]><html class="no-js ie ie7 lt-ie9 lt-ie8" lang="en-US" xmlns:fb="https://www.facebook.com/2008/fbml" xmlns:addthis="https://www.addthis.com/help/api-spec" > <![endif]-->
<!--[if IE 8]><html class="no-js ie ie8 lt-ie9" lang="en-US" xmlns:fb="https://www.facebook.com/2008/fbml" xmlns:addthis="https://www.addthis.com/help/api-spec" > <![endif]-->
<!--[if gt IE 8]><!--><html class="no-js" lang="en-US" xmlns:addthis="https://www.addthis.com/help/api-spec" xmlns:fb="https://www.facebook.com/2008/fbml"> <!--<![endif]-->
<head>
<!-- Start after head script -->
<!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:''