# Scraping text from a Kickstarter page

In [1]:
# Load required libraries
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

Select a test hyperlink.

In [2]:
# Project page used as the test input for the scraping steps below.
hyperlink = 'https://www.kickstarter.com/projects/636921542/mobile-phone-cup-holder-dock-and-stand-apple-and-a?ref=category'

Scrape the HTML content from the web page that the hyperlink points to.

In [3]:
# Fetch the page. A timeout keeps the cell from hanging indefinitely on an
# unresponsive server, and raise_for_status() makes an HTTP error (e.g. a
# blocked or missing page) fail here instead of silently producing empty
# extractions further down.
doc = requests.get(hyperlink, timeout=30)
doc.raise_for_status()

# Parse the returned HTML with Python's built-in parser.
soup = BeautifulSoup(doc.text, 'html.parser')

Extract the page text and replace every line break with a single space.

In [4]:
# Flatten the page text onto one line: every '\n' becomes a single space.
parsed_text = ' '.join(soup.get_text().split('\n'))

Define functions to extract information on the two main sections: *About This Project* and *Risks and Challenges*.

In [5]:
def extract_about_project(text):
    """Return the text between 'About this project' and 'Risks and challenges'.

    The match is greedy: it runs from the first 'About this project' to the
    last 'Risks and challenges' found in ``text``. ``re.DOTALL`` lets the
    section span line breaks, so the function also works on text whose
    newlines have not been replaced beforehand.

    Parameters
    ----------
    text : str
        Plain text of a project page.

    Returns
    -------
    str
        The raw (uncleaned) section text, or '' when no such span exists.
    """
    match = re.search(
        r'(?<=About this project).*(?=Risks and challenges)',
        text,
        flags=re.DOTALL,
    )
    return match.group(0) if match else ''

In [6]:
def extract_risks(text):
    """Return the text between 'Risks and challenges' and the accountability link.

    The match is greedy: it runs from the first 'Risks and challenges' to the
    last 'Learn about accountability on Kickstarter' found in ``text``.
    ``re.DOTALL`` lets the section span line breaks, so the function also
    works on text whose newlines have not been replaced beforehand.

    Parameters
    ----------
    text : str
        Plain text of a project page.

    Returns
    -------
    str
        The raw (uncleaned) section text, or '' when no such span exists.
    """
    match = re.search(
        r'(?<=Risks and challenges).*(?=Learn about accountability on Kickstarter)',
        text,
        flags=re.DOTALL,
    )
    return match.group(0) if match else ''

Define a function to clean up the extracted sections.

In [7]:
def clean_up(messy_text):
    """Normalise whitespace in a scraped section and drop the video warning.

    Steps:
      1. Collapse every run of whitespace to a single space, so the warning
         text matches even if it was originally split across lines.
      2. Remove the HTML5 video-player warning.
      3. Collapse whitespace again and trim both ends. Removing the warning
         can leave a double space or a leading/trailing space behind, so this
         has to happen after the removal rather than before it.

    Parameters
    ----------
    messy_text : str
        Raw section text.

    Returns
    -------
    str
        The text with single spaces between words, no surrounding
        whitespace, and no video warning.
    """
    warning_str = "You'll need an HTML5 capable browser to see this content. Play Replay with sound Play with sound 00:00 00:00"

    # Shrinks all whitespace to a single space
    clean_text = re.sub(r'\s+', ' ', messy_text)

    # Removes the video warning in the scraped content
    clean_text = clean_text.replace(warning_str, '')

    # Re-collapses gaps left by the removal, then trims both ends
    return re.sub(r'\s+', ' ', clean_text).strip()

Test the above functions on `parsed_text`.

In [8]:
# Pull out the raw section first, then tidy it, so both stages can be inspected.
about_project_raw = extract_about_project(parsed_text)
about_project_text = clean_up(about_project_raw)
about_project_text

'The first and only vehicle cup holder mobile phone dock and stand. This dock and stand is very easy to use with just one hand, no more getting in the vehicle and have to find your charging or syncing cord then plug in and put your phone on your seat or your lap only to have to turn sharply or break and your phone slides in the crevice of the seat or the floor, no more getting a call and cannot find your phone, with the mount you simply get in and drop your phone on the mount and it stays put and where you can always see the display, once docked the phone charges and connects to your stereo.'

In [9]:
# Pull out the raw section first, then tidy it, so both stages can be inspected.
risks_raw = extract_risks(parsed_text)
risks_text = clean_up(risks_raw)
risks_text

'Only risks could be timing issues from obtaining parts.'