# Feature engineering

In [1]:
# Load required libraries
import nltk
import requests
from bs4 import BeautifulSoup
import re
import lxml
import pandas as pd

Load extraction and cleanup functions.

In [2]:
"""def get_section1(soup):
    # Extracts the 'About this project' section
    #return text.partition('About this project')[2] \
    #    .partition('Risks and challenges')[0]
    try:
        return soup.find(
            'div',
            class_='full-description js-full-description responsive-media ' + \
                'formatted-lists'
        ).get_text(' ')
    except AttributeError:
        return 'not_found'
    
def get_section2(soup):
    # Extracts the 'Risks and challenges' section
    #return text.partition('Risks and challenges')[2] \
    #    .partition('Learn about accountability on Kickstarter')[0]
    try:
        return soup.find(
            'div', 
            class_='mb3 mb10-sm mb3 js-risks'
        ).get_text(' ').replace('Risks and challenges', '')
    except AttributeError:
        return 'not_found'

def extract_sections(soup):    
    # Extract and clean up both sections
    #return (
    #    clean_up(get_section1(soup.get_text(' '))), 
    #    clean_up(get_section2(soup.get_text(' ')))
    #)
    return (
        clean_up(get_section1(soup)),
        clean_up(get_section2(soup))
    )"""

def scrape(hyperlink):
    # Scrape the website
    return requests.get(hyperlink)

def parse(scraped_html):
    # Parse the HTML
    return BeautifulSoup(scraped_html.text, 'lxml')

def clean_up(messy_text):    
    # Remove line breaks, leading and trailing whitespace, and compresses all
    # whitespace to a single space
    clean_text = ' '.join(messy_text.split()).strip()
    
    # Remove the HTML5 warning for videos
    return clean_text.replace(
        "You'll need an HTML5 capable browser to see this content. " + \
        "Play Replay with sound Play with sound 00:00 00:00",
        '#html5warn'
    )

def get_campaign(soup):
    # Extract the 'About this project' section, if available
    try:
        section1 = soup.find(
            'div',
            class_='full-description js-full-description responsive-media ' + \
                'formatted-lists'
        ).get_text(' ')
    except AttributeError:
        section1 = 'not_found'
    
    # Extract the 'Risks and challenges' section, if available
    try:
        section2 = soup.find(
            'div', 
            class_='mb3 mb10-sm mb3 js-risks'
        ) \
            .get_text(' ') \
            .replace('Risks and challenges', '') \
            .replace('Learn about accountability on Kickstarter', '')
    except AttributeError:
        section2 = 'not_found'
    
    # Clean up both sections and return them in a dict
    return {'about': clean_up(section1), 'risks': clean_up(section2)}

Get HTML content from a hyperlink.

In [27]:
#hyperlink = 'https://www.kickstarter.com/projects/getpebble/pebble-2-time-2-and-core-an-entirely-new-3g-ultra'
#hyperlink = 'https://www.kickstarter.com/projects/sbf/sculpto-the-worlds-most-user-friendly-desktop-3d-p?ref=discovery'
hyperlink = 'https://www.kickstarter.com/projects/getpebble/pebble-e-paper-watch-for-iphone-and-android'
scraped_html = scrape(hyperlink)

Parse HTML content and extract sections.

In [28]:
soup = parse(scraped_html)
campaign = get_campaign(soup)

## Normalize text

In [5]:
def normalize(text):
    normalized = re.sub(
        r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)',
        '#httpaddr',
        text
    )
    
    return normalized

## Define functions to compute features
For each feature, test the function on the *About this project* section.

### Count # of sentences

In [6]:
def get_sentences(text):
    # Tokenizes text into sentences and returns them in a list
    return nltk.sent_tokenize(text)

In [29]:
len(get_sentences(campaign['about']))

113

### Count # of words

In [8]:
def remove_punc(text):
    # Returns the text with punctuation removed
    return re.sub(r'[^\w\d\s]', ' ', text)

In [9]:
def get_words(text):
    # Tokenizes text into words and returns them in a list
    return nltk.word_tokenize(remove_punc(text))

In [30]:
len(get_words(campaign['about']))

1305

### Count # of all-caps words and compute %

In [11]:
def identify_allcaps(text):
    # Counts the number of all-caps words
    return re.findall(r'\b[A-Z]{2,}', text)

In [31]:
len(identify_allcaps(campaign['about']))

27

In [32]:
len(identify_allcaps(campaign['about'])) / len(get_words(campaign['about']))

0.020689655172413793

### Count # of exclamation marks and compute %

In [13]:
def count_exclamations(text):
    # Counts the number of exclamation marks present in the text
    return text.count('!')

In [33]:
count_exclamations(campaign['about'])

12

In [34]:
count_exclamations(campaign['about']) / len(get_words(campaign['about']))

0.009195402298850575

### Compute the average # of words per sentence.

In [15]:
def compute_avg_words(text):
    return pd.Series(
        [len(get_words(sentence)) for sentence in \
         get_sentences(text)]
    ).mean()

In [35]:
compute_avg_words(campaign['about'])

11.548672566371682

### SOUP: Count # of hyperlinks

In [17]:
def count_hyperlinks(soup, section):    
    # Initialize total number of hyperlinks
    link_count = 0

    # Use tree parsing to compute hyperlinks for each section
    if section == 'about':
        link_count += len(soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
                '-media formatted-lists'
            ).find_all('a'))
    elif section == 'risks':
        link_count += len(soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
            ).find_all('a'))
    
    return link_count

In [36]:
count_hyperlinks(soup, 'about')

15

### Count # of bolded text in the campaign and compute %

In [19]:
def count_bolded(soup, section):    
    # Initialize total number of hyperlinks
    bold_count = 0

    # Use tree parsing to compute hyperlinks for each section
    if section == 'about':
        bold_count += len(soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
                '-media formatted-lists'
            ).find_all('b'))
    elif section == 'risks':
        bold_count += len(soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
            ).find_all('b'))
    
    return bold_count

In [37]:
count_bolded(soup, 'about')

42

In [38]:
count_bolded(soup, 'about') / len(get_words(campaign['about']))

0.03218390804597701

## Master function for computing features

In [22]:
def extract_features(text):
    # Compute all features and return them as a tuple
    return (
        len(get_sentences(text)),
        count_all_caps(text),
        count_exclamations(text),
        len(get_words(text)),
        compute_avg_words(text)
    )

In [23]:
extract_features(section1_text)

NameError: name 'section1_text' is not defined