# Feature engineering

In [1]:
# Load required libraries
import nltk
import requests
from bs4 import BeautifulSoup
import re
import lxml
import pandas as pd
import numpy as np
from sklearn.externals import joblib

Define the scraping, parsing, and extraction functions.

In [2]:
def scrape(hyperlink, timeout=30):
    """Download a web page.

    Parameters
    ----------
    hyperlink : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        `requests.get`). Without a timeout a stalled server would block the
        notebook indefinitely.

    Returns
    -------
    The response object returned by `requests.get`.
    """
    return requests.get(hyperlink, timeout=timeout)

def parse(scraped_html):
    """Turn a scraped response into a BeautifulSoup tree.

    The markup is read from the response's `text` attribute and parsed with
    the 'lxml' parser.
    """
    markup = scraped_html.text
    return BeautifulSoup(markup, 'lxml')

def clean_up(messy_text):
    """Collapse whitespace and strip the HTML5 video fallback notice.

    Parameters
    ----------
    messy_text : str
        Raw text extracted from a campaign section.

    Returns
    -------
    str
        The text with every run of whitespace (including line breaks)
        reduced to a single space, no leading or trailing whitespace, and
        the video player's fallback notice removed.
    """
    video_warning = (
        "You'll need an HTML5 capable browser to see this content. "
        "Play Replay with sound Play with sound 00:00 00:00"
    )

    # Collapse whitespace first so the notice matches even when the page
    # spreads it over several lines
    collapsed = ' '.join(messy_text.split())

    # Cutting the notice out leaves the spaces that surrounded it behind, so
    # collapse once more to avoid doubled or dangling spaces
    return ' '.join(collapsed.replace(video_warning, '').split())

def get_campaign(soup):
    """Pull the two text sections of a campaign page out of its parse tree.

    Returns a dict with the cleaned-up text under the keys 'about' and
    'risks'. A section whose <div> cannot be located is reported as the
    string 'section_not_found'.
    """
    about_class = (
        'full-description js-full-description responsive-media '
        'formatted-lists'
    )
    risks_class = 'mb3 mb10-sm mb3 js-risks'

    # 'About this project': a missing <div> makes find() return None, and
    # the attribute access on None is what triggers the fallback
    try:
        about_text = soup.find('div', class_=about_class).get_text(' ')
    except AttributeError:
        about_text = 'section_not_found'

    # 'Risks and challenges': same lookup, then drop the section heading and
    # the accountability link text
    try:
        risks_text = soup.find('div', class_=risks_class).get_text(' ')
        for boilerplate in (
            'Risks and challenges',
            'Learn about accountability on Kickstarter',
        ):
            risks_text = risks_text.replace(boilerplate, '')
    except AttributeError:
        risks_text = 'section_not_found'

    return {'about': clean_up(about_text), 'risks': clean_up(risks_text)}

In [3]:
"""def get_section1(soup):
    # Extracts the 'About this project' section
    #return text.partition('About this project')[2] \
    #    .partition('Risks and challenges')[0]
    try:
        return soup.find(
            'div',
            class_='full-description js-full-description responsive-media ' + \
                'formatted-lists'
        ).get_text(' ')
    except AttributeError:
        return 'not_found'
    
def get_section2(soup):
    # Extracts the 'Risks and challenges' section
    #return text.partition('Risks and challenges')[2] \
    #    .partition('Learn about accountability on Kickstarter')[0]
    try:
        return soup.find(
            'div', 
            class_='mb3 mb10-sm mb3 js-risks'
        ).get_text(' ').replace('Risks and challenges', '')
    except AttributeError:
        return 'not_found'

def extract_sections(soup):    
    # Extract and clean up both sections
    #return (
    #    clean_up(get_section1(soup.get_text(' '))), 
    #    clean_up(get_section2(soup.get_text(' ')))
    #)
    return (
        clean_up(get_section1(soup)),
        clean_up(get_section2(soup))
    )""";

Scrape HTML content from a hyperlink.

In [4]:
# Campaign page used as the worked example for the rest of the notebook; the
# commented-out URLs are alternative pages to switch to by hand
#hyperlink = 'https://www.kickstarter.com/projects/getpebble/pebble-2-time-2-and-core-an-entirely-new-3g-ultra'
#hyperlink = 'https://www.kickstarter.com/projects/sbf/sculpto-the-worlds-most-user-friendly-desktop-3d-p?ref=discovery'
hyperlink = 'https://www.kickstarter.com/projects/getpebble/pebble-e-paper-watch-for-iphone-and-android'
#hyperlink = 'https://www.kickstarter.com/projects/1683069409/the-new-york-sorta-marathon?ref=discovery'
#hyperlink = 'https://www.kickstarter.com/projects/dinobytelabs/midli-a-dark-and-mystical-tale-of-letting-go?ref=category'
scraped_html = scrape(hyperlink)

Parse HTML content and extract sections.

In [5]:
# Build the parse tree once; `soup` is reused by the tree-based feature
# functions and `campaign` holds the cleaned 'about' and 'risks' text
soup = parse(scraped_html)
campaign = get_campaign(soup)

## Normalize text

Define a function that replaces specific components of the sections, such as email addresses and hyperlinks, with placeholder tags so that they do not interfere with the analysis.

In [6]:
def normalize(text):
    """Replace email addresses and hyperlinks with placeholder tags.

    Email addresses become 'emailaddr' first; afterwards anything starting
    with 'http' or looking like 'name.tld...' becomes 'httpaddr'.
    """
    email_pattern = re.compile(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b')
    link_pattern = re.compile(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)')

    # Emails must go first: their domain part would otherwise be picked up
    # by the hyperlink pattern
    without_emails = email_pattern.sub('emailaddr', text)
    return link_pattern.sub('httpaddr', without_emails)

In [7]:
# Tag email addresses and hyperlinks in both sections of the example campaign
campaign['about'], campaign['risks'] = (
    normalize(campaign['about']),
    normalize(campaign['risks']),
)

## Define functions to compute features
For each feature, test the function on the *About this project* section.

### Count # of sentences

In [8]:
def get_sentences(text):
    """Split text into sentences with NLTK and return them as a list."""
    sentences = nltk.sent_tokenize(text)
    return sentences

In [9]:
# nan stands in for the count when the section could not be found on the page
num_sents = (
    np.nan if campaign['about'] == 'section_not_found'
    else len(get_sentences(campaign['about']))
)
num_sents

113

### Count # of words

In [10]:
def remove_punc(text):
    """Return the text with each punctuation character replaced by a space.

    Anything that is not a word character, a digit, or whitespace counts as
    punctuation; underscores are word characters and are kept.
    """
    punctuation = re.compile(r'[^\w\d\s]')
    return punctuation.sub(' ', text)

In [11]:
def get_words(text):
    """Tokenize text into a list of words, leaving out the placeholder tags.

    Punctuation is removed before tokenizing, and the 'emailaddr' and
    'httpaddr' tags are not counted as words.
    """
    placeholder_tags = ('emailaddr', 'httpaddr')
    tokens = nltk.word_tokenize(remove_punc(text))

    words = []
    for token in tokens:
        if token not in placeholder_tags:
            words.append(token)
    return words

In [12]:
# nan stands in for the count when the section could not be found on the page
num_words = (
    np.nan if campaign['about'] == 'section_not_found'
    else len(get_words(campaign['about']))
)
num_words

1295

### Count # of all-caps words and compute %

In [13]:
def identify_allcaps(text):
    """Return a list of the all-caps runs found in the text.

    A run is two or more consecutive uppercase ASCII letters starting at a
    word boundary. The run does not have to end at a word boundary, so
    'USB3' contributes 'USB'. Callers take `len()` of the result to count.
    """
    allcaps_pattern = re.compile(r'\b[A-Z]{2,}')
    return allcaps_pattern.findall(text)

In [14]:
# Show the all-caps count and its share of the word count, or a pair of nans
# when the section is missing
if campaign['about'] != 'section_not_found':
    print(
        len(identify_allcaps(campaign['about'])),
        len(identify_allcaps(campaign['about'])) / num_words
    )
else:
    print(np.nan, np.nan)

27 0.02084942084942085


### Count # of exclamation marks and compute %

In [15]:
def count_exclamations(text):
    """Return how many '!' characters the text contains."""
    return sum(1 for character in text if character == '!')

In [16]:
# Show the exclamation-mark count and its share of the word count, or a pair
# of nans when the section is missing
if campaign['about'] != 'section_not_found':
    print(
        count_exclamations(campaign['about']),
        count_exclamations(campaign['about']) / num_words
    )
else:
    print(np.nan, np.nan)

12 0.009266409266409266


### Count # of Apple adjectives and compute %

In [17]:
def count_apple_words(text):
    """Count occurrences of the hype adjectives listed below.

    Parameters
    ----------
    text : str
        Section text; it is tokenized with `get_words`.

    Returns
    -------
    int
        Number of tokens that match one of the adjectives, ignoring case.
    """
    apple_words = frozenset(
        ['revolutionary', 'breakthrough', 'beautiful', 'magical',
         'gorgeous', 'amazing', 'incredible', 'awesome']
    )

    # Compare in lowercase: the adjectives are stored in lowercase, so a
    # case-sensitive test would miss 'Amazing' at the start of a sentence
    # or 'AMAZING' in a heading
    return sum(
        1 for word in get_words(text) if word.lower() in apple_words
    )

In [18]:
# Show the Apple-adjective count and its share of the word count, or a pair
# of nans when the section is missing
if campaign['about'] != 'section_not_found':
    print(
        count_apple_words(campaign['about']),
        count_apple_words(campaign['about']) / num_words
    )
else:
    print(np.nan, np.nan)

2 0.0015444015444015444


### Compute the average # of words per sentence

In [19]:
def compute_avg_words(text):
    """Return the mean number of words per sentence in the text."""
    words_per_sentence = []
    for sentence in get_sentences(text):
        words_per_sentence.append(len(get_words(sentence)))

    return pd.Series(words_per_sentence).mean()

In [20]:
# nan is shown when the section is missing
print(
    np.nan if campaign['about'] == 'section_not_found'
    else compute_avg_words(campaign['about'])
)

11.4601769912


### Count the # of paragraphs

In [21]:
def count_paragraphs(soup, section):
    """Return the number of <p> tags inside a campaign section.

    `section` selects the <div> to search: 'about' or 'risks'. Any other
    value returns None.
    """
    if section == 'about':
        div_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        div_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=div_class)
    return len(container.find_all('p'))

In [22]:
# nan is shown when the section is missing
print(
    np.nan if campaign['about'] == 'section_not_found'
    else count_paragraphs(soup, 'about')
)

39


### Compute the average # of sentences per paragraph

In [23]:
def compute_avg_sents_paragraph(soup, section):
    """Return the mean number of sentences per <p> tag in a campaign section.

    Parameters
    ----------
    soup : parse tree of the campaign page.
    section : str
        'about' or 'risks'.

    Returns
    -------
    float
        Mean sentence count over the section's paragraphs; nan when the
        section contains no <p> tags.

    Raises
    ------
    ValueError
        If `section` is neither 'about' nor 'risks'.
    """
    if section == 'about':
        div_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        div_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        # Fail with a clear message instead of an UnboundLocalError below
        raise ValueError(
            "section must be 'about' or 'risks', got {!r}".format(section)
        )

    paragraphs = soup.find('div', class_=div_class).find_all('p')

    # An explicit float dtype keeps the no-paragraph case a plain nan mean
    return pd.Series(
        [len(get_sentences(paragraph.get_text(' ')))
         for paragraph in paragraphs],
        dtype='float64'
    ).mean()

In [24]:
# nan is shown when the section is missing
print(
    np.nan if campaign['about'] == 'section_not_found'
    else compute_avg_sents_paragraph(soup, 'about')
)

2.94871794872


### Compute the average # of words per paragraph

In [25]:
def compute_avg_words_paragraph(soup, section):
    """Return the mean number of words per <p> tag in a campaign section.

    Parameters
    ----------
    soup : parse tree of the campaign page.
    section : str
        'about' or 'risks'.

    Returns
    -------
    float
        Mean word count over the section's paragraphs; nan when the section
        contains no <p> tags.

    Raises
    ------
    ValueError
        If `section` is neither 'about' nor 'risks'.
    """
    if section == 'about':
        div_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        div_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        # Fail with a clear message instead of an UnboundLocalError below
        raise ValueError(
            "section must be 'about' or 'risks', got {!r}".format(section)
        )

    paragraphs = soup.find('div', class_=div_class).find_all('p')

    # An explicit float dtype keeps the no-paragraph case a plain nan mean
    return pd.Series(
        [len(get_words(paragraph.get_text(' '))) for paragraph in paragraphs],
        dtype='float64'
    ).mean()

In [26]:
# nan is shown when the section is missing
print(
    np.nan if campaign['about'] == 'section_not_found'
    else compute_avg_words_paragraph(soup, 'about')
)

29.4358974359


### SOUP: Count # of images

In [27]:
def count_images(soup, section):
    """Return the number of <img> tags inside a campaign section.

    `section` selects the <div> to search: 'about' or 'risks'. Any other
    value returns None.
    """
    if section == 'about':
        div_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        div_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=div_class)
    return len(container.find_all('img'))

In [28]:
# nan is shown when the section is missing
print(
    np.nan if campaign['about'] == 'section_not_found'
    else count_images(soup, 'about')
)

11


### Count # of embedded videos

In [29]:
def count_videos(soup, section):
    """Return the number of embedded video players in a campaign section.

    A video player is a <div> with the class 'video-player'. `section`
    selects the enclosing <div>: 'about' or 'risks'. Any other value
    returns None.
    """
    if section == 'about':
        div_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        div_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=div_class)
    return len(container.find_all('div', class_='video-player'))

In [30]:
# nan is shown when the section is missing
print(
    np.nan if campaign['about'] == 'section_not_found'
    else count_videos(soup, 'about')
)

0


### Count # of YouTube videos

In [31]:
def count_youtube(soup, section):
    """Return the number of YouTube embeds inside a campaign section.

    An embed is an <iframe> whose 'src' attribute contains 'youtube'.

    Parameters
    ----------
    soup : parse tree of the campaign page.
    section : str
        'about' or 'risks'.

    Raises
    ------
    ValueError
        If `section` is neither 'about' nor 'risks'.
    """
    if section == 'about':
        div_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        div_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        # Fail with a clear message instead of an UnboundLocalError below
        raise ValueError(
            "section must be 'about' or 'risks', got {!r}".format(section)
        )

    iframes = soup.find('div', class_=div_class).find_all('iframe')

    # An <iframe> without a 'src' attribute yields None from get(); treat it
    # as an empty source rather than relying on a swallowed TypeError
    return sum(
        1 for iframe in iframes if 'youtube' in (iframe.get('src') or '')
    )

In [32]:
# nan is shown when the section is missing
print(
    np.nan if campaign['about'] == 'section_not_found'
    else count_youtube(soup, 'about')
)

0


### Count # of GIFs

In [33]:
def count_gifs(soup, section):
    """Return the number of GIF images inside a campaign section.

    A GIF is an <img> whose 'data-src' attribute contains 'gif'.

    Parameters
    ----------
    soup : parse tree of the campaign page.
    section : str
        'about' or 'risks'.

    Raises
    ------
    ValueError
        If `section` is neither 'about' nor 'risks'.
    """
    if section == 'about':
        div_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        div_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        # Fail with a clear message instead of an UnboundLocalError below
        raise ValueError(
            "section must be 'about' or 'risks', got {!r}".format(section)
        )

    images = soup.find('div', class_=div_class).find_all('img')

    # An <img> without a 'data-src' attribute yields None from get(); treat
    # it as an empty source rather than relying on a swallowed TypeError
    return sum(
        1 for image in images if 'gif' in (image.get('data-src') or '')
    )

In [34]:
# nan is shown when the section is missing
print(
    np.nan if campaign['about'] == 'section_not_found'
    else count_gifs(soup, 'about')
)

0


### Count # of hyperlinks

In [35]:
def count_hyperlinks(soup, section):
    """Return the number of <a> tags inside a campaign section.

    `section` selects the <div> to search: 'about' or 'risks'. Any other
    value returns None.
    """
    if section == 'about':
        div_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        div_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=div_class)
    return len(container.find_all('a'))

In [36]:
# nan is shown when the section is missing
print(
    np.nan if campaign['about'] == 'section_not_found'
    else count_hyperlinks(soup, 'about')
)

15


### Count # of bolded text and compute %

In [37]:
def count_bolded(soup, section):
    """Return the number of <b> tags inside a campaign section.

    `section` selects the <div> to search: 'about' or 'risks'. Any other
    value returns None.
    """
    if section == 'about':
        div_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        div_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=div_class)
    return len(container.find_all('b'))

In [38]:
# Show the <b> tag count and its share of the word count, or a pair of nans
# when the section is missing
if campaign['about'] != 'section_not_found':
    print(
        count_bolded(soup, 'about'),
        count_bolded(soup, 'about') / num_words
    )
else:
    print(np.nan, np.nan)

42 0.032432432432432434


## Extract all features for one project

In [39]:
section = 'about'
if campaign[section] == 'section_not_found':
    print([np.nan] * 19)
else:
    # Percentages are taken relative to the selected section's own word
    # count, so switching `section` to 'risks' does not silently reuse the
    # word count computed earlier for the 'about' section
    section_num_words = len(get_words(campaign[section]))

    row = (
        len(get_sentences(campaign[section])),
        section_num_words,
        len(identify_allcaps(campaign[section])),
        len(identify_allcaps(campaign[section])) / section_num_words,
        count_exclamations(campaign[section]),
        count_exclamations(campaign[section]) / section_num_words,
        count_apple_words(campaign[section]),
        count_apple_words(campaign[section]) / section_num_words,
        compute_avg_words(campaign[section]),
        count_paragraphs(soup, section),
        compute_avg_sents_paragraph(soup, section),
        compute_avg_words_paragraph(soup, section),
        count_images(soup, section),
        count_videos(soup, section),
        count_youtube(soup, section),
        count_gifs(soup, section),
        count_hyperlinks(soup, section),
        count_bolded(soup, section),
        count_bolded(soup, section) / section_num_words
    )

    print(row)

(113, 1295, 27, 0.02084942084942085, 12, 0.009266409266409266, 2, 0.0015444015444015444, 11.460176991150442, 39, 2.9487179487179489, 29.435897435897434, 11, 0, 0, 0, 15, 42, 0.032432432432432434)


## Extract features for all Kickstarter pages

In [40]:
# Load scraped data
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load a file produced by a trusted scraping run.
# Presumably a DataFrame whose first column holds the scraped responses (it
# is iterated with .iterrows() and row[0] is passed to parse() further down)
# — TODO confirm.
scraped_collection = joblib.load('data/2017-09-10_scraped_data.pkl')

In [64]:
def extract_features(soup, campaign, section):
    """Compute the 19 features of one section of one campaign.

    Parameters
    ----------
    soup : parse tree of the campaign page, used by the tree-based counters.
    campaign : dict
        Maps a section name to its normalized text.
    section : str
        'about' or 'risks'.

    Returns
    -------
    list or tuple
        A list of 19 nans when the section text is 'section_not_found'.
        Otherwise a 19-tuple: sentence count, word count, all-caps count and
        percentage, exclamation count and percentage, Apple-adjective count
        and percentage, average words per sentence, paragraph count, average
        sentences and words per paragraph, image, video, YouTube, GIF and
        hyperlink counts, and bolded count and percentage.
    """
    text = campaign[section]
    if text == 'section_not_found':
        return [np.nan] * 19

    # Percentages must be relative to THIS section's word count. Reading a
    # notebook-level `num_words` instead would divide every campaign's
    # counts by the word count of the one section analyzed interactively.
    section_num_words = len(get_words(text))

    def percent_of_words(count):
        # A section without any words has no meaningful percentage
        return count / section_num_words if section_num_words else np.nan

    num_allcaps = len(identify_allcaps(text))
    num_exclamations = count_exclamations(text)
    num_apple_words = count_apple_words(text)
    num_bolded = count_bolded(soup, section)

    return (
        len(get_sentences(text)),
        section_num_words,
        num_allcaps,
        percent_of_words(num_allcaps),
        num_exclamations,
        percent_of_words(num_exclamations),
        num_apple_words,
        percent_of_words(num_apple_words),
        compute_avg_words(text),
        count_paragraphs(soup, section),
        compute_avg_sents_paragraph(soup, section),
        compute_avg_words_paragraph(soup, section),
        count_images(soup, section),
        count_videos(soup, section),
        count_youtube(soup, section),
        count_gifs(soup, section),
        count_hyperlinks(soup, section),
        num_bolded,
        percent_of_words(num_bolded)
    )

In [68]:
# Column names of the per-section feature tables, in the order in which the
# feature values are produced
features = [
    'num_sents',
    'num_words',
    'num_all_caps',
    'percent_all_caps',
    'num_exclms',
    'percent_exclms',
    'num_apple_words',
    'percent_apple_words',
    'avg_words_per_sent',
    'num_paragraphs',
    'avg_sents_per_paragraph',
    'avg_words_per_paragraph',
    'num_images',
    'num_videos',
    'num_youtubes',
    'num_gifs',
    'num_hyperlinks',
    'num_bolded',
    'percent_bolded',
]

# One empty table per campaign section
section1_df, section2_df = (pd.DataFrame(columns=features) for _ in range(2))

In [69]:
# Collect one feature record per page and build each table once at the end;
# enlarging a DataFrame one row at a time with .loc re-allocates on every
# assignment and leaves every column with object dtype
page_indices = []
about_records = []
risks_records = []

for index, row in scraped_collection.iterrows():
    # Parse scraped HTML
    # NOTE(review): row[0] assumes the scraped response sits in the first
    # column — confirm against how the pickle was written
    soup = parse(row[0])

    # Extract and normalize campaign sections
    campaign = get_campaign(soup)
    campaign['about'] = normalize(campaign['about'])
    campaign['risks'] = normalize(campaign['risks'])

    # Extract features for each section
    page_indices.append(index)
    about_records.append(extract_features(soup, campaign, 'about'))
    risks_records.append(extract_features(soup, campaign, 'risks'))

# Keep the original row labels so both tables line up with scraped_collection
section1_df = pd.DataFrame(
    about_records, index=page_indices, columns=features
)
section2_df = pd.DataFrame(
    risks_records, index=page_indices, columns=features
)