In [1]:
import re
import bs4
import requests
from urllib.parse import urljoin

from transformers import pipeline

# Request website to scrap

In [15]:
website_url = "https://improvado.io"

# Improvado - Blog

## Web scraping

In [16]:
path_to_webscrape = 'blog'

In [105]:
response = requests.get(urljoin(website_url, path_to_webscrape))
if not response.ok:
    print('Error in response happend')

In [108]:
# parse response
soup = bs4.BeautifulSoup(response.text, 'html.parser')

# find content
content = soup.find('div', {'class': re.compile(r'blog-posts')})

In [119]:
def get_header_info(post_soup: bs4.BeautifulSoup) -> tuple:
    title = post_soup.find('div',{'class':'blog-post-header'}).h1.text
    author = post_soup.find('div',{'class': re.compile(r'blog-post-author')}).text
    date = post_soup.find('div',{'class': re.compile(r'blog-post-date')}).text

    return (title, author, date)

def get_post_content(post_soup: bs4.BeautifulSoup) -> str:
    post_content = post_soup.find('div',{'id':'content'})
    text = post_content.text.replace("â","'")

    return text

In [129]:
# first tables content contains latest posts, so only grab tags including the right url
list_posts = content.find_all('div',{'role': 'listitem'})

context = []

for post in list_posts[:5]:
    url_post = urljoin(website_url, post.a['href'])

    # request website
    post = requests.get(url_post)
    post_soup = bs4.BeautifulSoup(post.text, 'html.parser')
    
    # get header info
    title, author, date = get_header_info(post_soup)

    # get post content info
    post_content = get_post_content(post_soup)

    # make context 
    make_context_text = f'{author} at {date} wrote {title}. This is the following content. {post_content}'
    context.append(make_context_text)

In [135]:
context = ' '.join(context)

## QA model

In [131]:
qa_model = pipeline("question-answering")

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some layers from the model checkpoint at distilbert-base-cased-distilled-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model ch

In [142]:
question = "What is a salesforce dashboard?"
context = context
result = qa_model(question = question, context = context)
result

{'score': 0.8670777678489685,
 'start': 32917,
 'end': 32924,
 'answer': 'the CPC'}

In [143]:
question = "How to build a Salesforce dashboard?"
context = context
result = qa_model(question = question, context = context)
result

{'score': 0.44610846042633057,
 'start': 1509,
 'end': 1552,
 'answer': "using the platform's built-in functionality"}

In [144]:
question = "What's Einstein Analytics?"
context = context
result = qa_model(question = question, context = context)
result

{'score': 0.7752603888511658,
 'start': 43449,
 'end': 43482,
 'answer': 'Cross-Channel DashboardsNaturally'}

# Future works and key improvements

- Clean special characters that are not being decoded correctly
- Scrap the blog contents better. So far it is pulling all text at once, but we could improve the format of lists and embedded text blocks
- Fine-tune a Q&A model. Now, I'm using a pretrained Hugging Face Q&A model, however to improve the results of the answers, we'd need a dataset with questions and answers to fine tune this model.