# Feature engineering

In [1]:
# Load required libraries
import nltk
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

Define the extraction and cleanup functions.

In [2]:
def extract_about_project(text):
    """Return the text found between the 'About this project' and
    'Risks and challenges' headings.

    Every regex match in ``text`` is concatenated; an empty string is
    returned when the headings are not found. Because ``.`` does not match
    a newline, both headings must sit on the same line for a match.
    """
    section_pattern = r'(?<=About this project).*(?=Risks and challenges)'
    pieces = [hit.group(0) for hit in re.finditer(section_pattern, text)]
    return ''.join(pieces)

def extract_risks(text):
    """Return the text found between the 'Risks and challenges' heading and
    the 'Learn about accountability on Kickstarter' footer.

    Every regex match in ``text`` is concatenated; an empty string is
    returned when the markers are not found. Because ``.`` does not match
    a newline, both markers must sit on the same line for a match.
    """
    section_pattern = (
        r'(?<=Risks and challenges).*(?=Learn about accountability on Kickstarter)'
    )
    pieces = [hit.group(0) for hit in re.finditer(section_pattern, text)]
    return ''.join(pieces)

def clean_up(messy_text):
    """Normalise whitespace and drop the scraped video-player warning.

    Parameters
    ----------
    messy_text : str
        Raw text extracted from a scraped page section.

    Returns
    -------
    str
        The text with every run of whitespace shrunk to a single space, the
        HTML5 video warning removed, and no leading or trailing whitespace.
    """
    warning_str = "You'll need an HTML5 capable browser to see this content. Play Replay with sound Play with sound 00:00 00:00"

    # Collapse whitespace first so the warning matches even when the scraped
    # page broke it across lines or padded it with extra spaces
    collapsed = re.sub(r'\s+', ' ', messy_text)

    # Remove the video warning in the scraped content
    without_warning = collapsed.replace(warning_str, '')

    # Removing the warning can leave a double space or a stray leading or
    # trailing space behind, so normalise once more before returning
    return re.sub(r'\s+', ' ', without_warning).strip()

def extract_content(hyperlink, timeout=30):
    """Download a web page and return its visible text on a single line.

    Parameters
    ----------
    hyperlink : str
        URL of the page to scrape.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        ``requests.get``. Without it a stalled connection would hang the
        notebook indefinitely.

    Returns
    -------
    str
        The text content of the page with every line break replaced by a
        space.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an HTTP
        error status.
    """
    # Scrape the HTML content from the website pointed to by the hyperlink
    doc = requests.get(hyperlink, timeout=timeout)

    # Fail loudly on 4xx/5xx responses instead of parsing an error page as if
    # it were the project description
    doc.raise_for_status()

    soup = BeautifulSoup(doc.text, 'html.parser')

    # Return the page text with all line break markers replaced by a space
    return soup.get_text().replace('\n', ' ')

def extract_sections(page_content):
    """Pull both campaign sections out of the page text and clean them up.

    Returns a 2-tuple ``(about_project, risks)`` of cleaned strings.
    """
    about_section = clean_up(extract_about_project(page_content))
    risks_section = clean_up(extract_risks(page_content))
    return about_section, risks_section

Extract content from a hyperlink.

In [3]:
# Kickstarter campaign page used as the example in the cells below
hyperlink = 'https://www.kickstarter.com/projects/sbf/sculpto-the-worlds-most-user-friendly-desktop-3d-p'
# Download the page and flatten it to plain text (performs a network request)
page_content = extract_content(hyperlink)
# Split the page text into the cleaned 'About this project' and
# 'Risks and challenges' sections
about_project_text, risks_text = extract_sections(page_content)

## Define functions to extract features

Count the number of sentences in the *About This Project* section.

In [4]:
def tokenize_sentences(text):
    """Split ``text`` into sentences with NLTK's sentence tokenizer.

    Returns the sentences as a list, in the order they appear.
    """
    sentences = nltk.sent_tokenize(text)
    return sentences

In [5]:
# Number of sentences found in the 'About this project' text
len(tokenize_sentences(about_project_text))

84

Count the number of all-caps words in the *About This Project* section.

In [6]:
def count_all_caps(text):
    """Count the runs of two or more consecutive uppercase ASCII letters.

    Note that the pattern has no word boundaries, so an uppercase run inside
    a mixed-case word (e.g. the 'XB' in 'XBox') is counted as well.
    """
    caps_run = re.compile(r'[A-Z]{2,}')
    return sum(1 for _ in caps_run.finditer(text))

In [7]:
# Number of uppercase runs (two or more capital letters) in the 'About this project' text
count_all_caps(about_project_text)

8

Count the number of exclamation points in the *About This Project* section.

In [8]:
def count_exclamations(text):
    """Count the exclamation marks present in ``text``.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    int
        Number of '!' characters in ``text`` (0 for an empty string).
    """
    # A literal single-character count needs no regular expression
    return text.count('!')

In [9]:
# Number of exclamation marks in the 'About this project' text
count_exclamations(about_project_text)

5

Count the number of words in the *About This Project* section.

In [10]:
def remove_punc(text):
    """Return ``text`` without punctuation.

    Every character that is not a word character, a digit or whitespace is
    deleted; underscores count as word characters and are therefore kept.
    """
    non_word_chars = re.compile(r'[^\w\d\s]')
    return non_word_chars.sub('', text)

In [11]:
def tokenize_words(text):
    """Strip punctuation from ``text`` and split it into words with NLTK.

    Returns the word tokens as a list.
    """
    unpunctuated = remove_punc(text)
    return nltk.word_tokenize(unpunctuated)

In [12]:
# Number of word tokens in the 'About this project' text after punctuation removal
len(tokenize_words(about_project_text))

1664

Compute the average number of words per sentence in the *About This Project* section.

In [13]:
def compute_avg_words(text):
    """Compute the average number of words per sentence in ``text``.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    float
        Mean of the per-sentence word counts, or NaN when ``text`` contains
        no sentences (e.g. an empty section).
    """
    words_per_sentence = [
        len(tokenize_words(sentence)) for sentence in tokenize_sentences(text)
    ]
    # An explicit float dtype keeps the empty case well defined (NaN) rather
    # than depending on how pandas infers the dtype of an empty Series
    return pd.Series(words_per_sentence, dtype=float).mean()

In [14]:
# Average number of words per sentence in the 'About this project' text
compute_avg_words(about_project_text)

19.80952380952381

## Create a master function for feature engineering

Define a function to extract all features given a text section.

In [15]:
def extract_features(text):
    """Compute every engineered feature for one text section.

    Returns a list in this fixed order:
    ``[sentence count, all-caps count, exclamation count, word count,
    average words per sentence]``.
    """
    return [
        len(tokenize_sentences(text)),
        count_all_caps(text),
        count_exclamations(text),
        len(tokenize_words(text)),
        compute_avg_words(text),
    ]

In [16]:
# All engineered features for the 'About this project' text, in one list
extract_features(about_project_text)

[84, 8, 5, 1664, 19.80952380952381]