# Feature extractor

The goal of this notebook is to load a DataFrame containing scraped HTML, parse it, and extract features for both sections of each Kickstarter project: "About this project" and "Risks and challenges".

In [1]:
# Load required libraries
import nltk
import requests
from bs4 import BeautifulSoup
import re
import lxml
import pandas as pd
import numpy as np
from sklearn.externals import joblib

Define the scraping, parsing, campaign-extraction, and text-cleaning functions.

In [2]:
def scrape(hyperlink):
    """Fetch a web page.

    Issues an HTTP GET for `hyperlink` through `requests.get` and hands the
    resulting response object back to the caller untouched.
    """
    response = requests.get(hyperlink)
    return response

def parse(scraped_html):
    """Build a BeautifulSoup tree from a scraped response.

    `scraped_html` must expose a `.text` attribute holding the page markup;
    the markup is parsed with the 'lxml' parser.
    """
    markup = scraped_html.text
    return BeautifulSoup(markup, 'lxml')

def clean_up(messy_text):
    """Collapse whitespace and drop the video player's fallback text.

    Line breaks and runs of whitespace are squeezed to single spaces, leading
    and trailing whitespace is removed, and every occurrence of the HTML5
    video warning is deleted.

    Returns the cleaned string; it never contains two consecutive spaces.
    """
    html5_warning = (
        "You'll need an HTML5 capable browser to see this content. "
        "Play Replay with sound Play with sound 00:00 00:00"
    )

    # Collapse first so the warning matches even when the raw text spreads it
    # over several lines or pads it with extra spaces
    collapsed = ' '.join(messy_text.split())
    without_warning = collapsed.replace(html5_warning, '')

    # Collapse again: cutting the warning out of the middle of the text leaves
    # the spaces on either side of it adjacent (or dangling at an end)
    return ' '.join(without_warning.split())

def get_campaign(soup):
    """Pull the two campaign sections out of a parsed project page.

    Returns a dict with the cleaned text of the 'About this project' section
    under 'about' and of the 'Risks and challenges' section under 'risks'.
    A section whose lookup raises AttributeError (e.g. `soup.find` returned
    None) is reported as the string 'section_not_found'.
    """
    missing = 'section_not_found'

    # 'About this project'
    try:
        about_div = soup.find(
            'div',
            class_=(
                'full-description js-full-description responsive-media '
                'formatted-lists'
            )
        )
        about_text = about_div.get_text(' ')
    except AttributeError:
        about_text = missing

    # 'Risks and challenges', minus the heading and the footer link text
    try:
        risks_div = soup.find('div', class_='mb3 mb10-sm mb3 js-risks')
        risks_text = risks_div.get_text(' ')
        for boilerplate in (
            'Risks and challenges',
            'Learn about accountability on Kickstarter',
        ):
            risks_text = risks_text.replace(boilerplate, '')
    except AttributeError:
        risks_text = missing

    return dict(about=clean_up(about_text), risks=clean_up(risks_text))

def normalize(text):
    """Replace e-mail addresses and hyperlinks in `text` with placeholder tags.

    E-mail addresses become 'emailaddr' and hyperlinks become 'httpaddr'.
    The substitutions run in that order, so an address is tagged as an
    e-mail before the hyperlink pattern gets a chance to match its domain.
    """
    substitutions = (
        (r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr'),
        (r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr'),
    )

    for pattern, tag in substitutions:
        text = re.sub(pattern, tag, text)

    return text

Define all feature-extraction functions.

In [3]:
def get_sentences(text):
    """Split `text` into sentences with NLTK and return them as a list."""
    sentences = nltk.sent_tokenize(text)
    return sentences

def remove_punc(text):
    """Return `text` with each punctuation character replaced by a space.

    Any character that is neither a word character nor whitespace counts as
    punctuation; the replacement is one-for-one, so the length is unchanged.
    """
    punctuation = re.compile(r'[^\w\d\s]')
    return punctuation.sub(' ', text)

def get_words(text):
    """Tokenize `text` into a list of words.

    Punctuation is removed first, and the placeholder tags 'emailaddr' and
    'httpaddr' are left out of the result.
    """
    placeholder_tags = ('emailaddr', 'httpaddr')
    tokens = nltk.word_tokenize(remove_punc(text))
    return list(filter(lambda token: token not in placeholder_tags, tokens))

def identify_allcaps(text):
    """Return the all-caps words found in `text` as a list of strings.

    An all-caps word is a run of two or more ASCII capital letters that
    starts at a word boundary and is not followed by another letter. The
    trailing check stops the capitalised prefix of a mixed-case word
    (e.g. "THe", "OKay") from being reported as if it were an all-caps word,
    while a run followed by a digit or punctuation ("HTML5", "USA's") still
    counts.

    Callers take `len()` of the result to get the count.
    """
    return re.findall(r'\b[A-Z]{2,}(?![A-Za-z])', text)

def count_exclamations(text):
    """Return how many '!' characters occur in `text`."""
    return sum(1 for character in text if character == '!')

def count_apple_words(text):
    """Count the hype words ("revolutionary", "amazing", ...) used in `text`.

    The text is tokenized with `get_words` and every token is compared
    case-insensitively against the hype-word set, so capitalised or
    upper-case uses such as "Amazing" and "AWESOME" are counted too.

    Returns the number of matching tokens (repeats count each time).
    """
    apple_words = frozenset(
        ['revolutionary', 'breakthrough', 'beautiful', 'magical',
         'gorgeous', 'amazing', 'incredible', 'awesome']
    )

    return sum(
        1 for word in get_words(text) if word.lower() in apple_words
    )

def compute_avg_words(text):
    """Return the mean number of words per sentence in `text`.

    Sentences come from `get_sentences` and words from `get_words`; the mean
    is taken with a pandas Series.
    """
    words_per_sentence = []
    for sentence in get_sentences(text):
        words_per_sentence.append(len(get_words(sentence)))

    return pd.Series(words_per_sentence).mean()

def count_paragraphs(soup, section):
    """Return the number of <p> tags inside the requested campaign section.

    `section` is 'about' or 'risks'; any other value yields None. The
    section's container <div> is looked up by its class string, and an
    AttributeError propagates if that lookup returns None.
    """
    if section == 'about':
        container_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        container_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=container_class)
    return len(container.find_all('p'))
    
def compute_avg_sents_paragraph(soup, section):
    """Return the mean number of sentences per <p> tag in a campaign section.

    `section` is 'about' or 'risks'. Each paragraph's text is extracted with
    a space separator and split into sentences by `get_sentences`; the mean
    is taken with a pandas Series.
    """
    if section == 'about':
        container_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        container_class = 'mb3 mb10-sm mb3 js-risks'

    container = soup.find('div', class_=container_class)

    sentences_per_paragraph = []
    for paragraph in container.find_all('p'):
        paragraph_text = paragraph.get_text(' ')
        sentences_per_paragraph.append(len(get_sentences(paragraph_text)))

    return pd.Series(sentences_per_paragraph).mean()

def compute_avg_words_paragraph(soup, section):
    """Return the mean number of words per <p> tag in a campaign section.

    `section` is 'about' or 'risks'. Each paragraph's text is extracted with
    a space separator and tokenized by `get_words`; the mean is taken with a
    pandas Series.
    """
    if section == 'about':
        container_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        container_class = 'mb3 mb10-sm mb3 js-risks'

    container = soup.find('div', class_=container_class)

    words_per_paragraph = []
    for paragraph in container.find_all('p'):
        paragraph_text = paragraph.get_text(' ')
        words_per_paragraph.append(len(get_words(paragraph_text)))

    return pd.Series(words_per_paragraph).mean()

def count_images(soup, section):
    """Return the number of <img> tags inside the requested campaign section.

    `section` is 'about' or 'risks'; any other value yields None. The
    section's container <div> is looked up by its class string.
    """
    if section == 'about':
        container_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        container_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=container_class)
    return len(container.find_all('img'))
    
def count_videos(soup, section):
    """Return the number of video players inside the requested section.

    A video player is a <div> with class 'video-player'. `section` is
    'about' or 'risks'; any other value yields None.
    """
    if section == 'about':
        container_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        container_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=container_class)
    players = container.find_all('div', class_='video-player')
    return len(players)

def count_youtube(soup, section):
    """Return the number of YouTube embeds inside the requested section.

    Every <iframe> in the section whose `src` attribute contains the text
    'youtube' is counted. `section` is 'about' or 'risks'.
    """
    if section == 'about':
        container_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        container_class = 'mb3 mb10-sm mb3 js-risks'

    container = soup.find('div', class_=container_class)

    total = 0
    for frame in container.find_all('iframe'):
        try:
            is_youtube = 'youtube' in frame.get('src')
        except TypeError:
            # No usable `src` (e.g. None): not counted
            is_youtube = False
        if is_youtube:
            total += 1

    return total

def count_gifs(soup, section):
    """Return the number of GIF images inside the requested section.

    Every <img> in the section whose `data-src` attribute contains the text
    'gif' is counted. `section` is 'about' or 'risks'.
    """
    if section == 'about':
        container_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        container_class = 'mb3 mb10-sm mb3 js-risks'

    container = soup.find('div', class_=container_class)

    total = 0
    for picture in container.find_all('img'):
        try:
            is_gif = 'gif' in picture.get('data-src')
        except TypeError:
            # No usable `data-src` (e.g. None): not counted
            is_gif = False
        if is_gif:
            total += 1

    return total

def count_hyperlinks(soup, section):
    """Return the number of <a> tags inside the requested campaign section.

    `section` is 'about' or 'risks'; any other value yields None. The
    section's container <div> is looked up by its class string.
    """
    if section == 'about':
        container_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        container_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=container_class)
    return len(container.find_all('a'))
    
def count_bolded(soup, section):
    """Return the number of <b> tags inside the requested campaign section.

    `section` is 'about' or 'risks'; any other value yields None. The
    section's container <div> is looked up by its class string.
    """
    if section == 'about':
        container_class = (
            'full-description js-full-description responsive-media '
            'formatted-lists'
        )
    elif section == 'risks':
        container_class = 'mb3 mb10-sm mb3 js-risks'
    else:
        return None

    container = soup.find('div', class_=container_class)
    return len(container.find_all('b'))

Load DataFrame containing scraped HTML for each project.

In [4]:
# Load scraped data
# NOTE(review): the path is machine-specific. `file_name` is reused further
# down to derive the project range that labels the output files, so it must
# keep the 'collection_<start>-<end>.pkl' naming.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load files that come from a trusted source.
file_name = '/mnt/c/Users/Redwan Huq/Downloads/scraped_collection_0-3158.pkl'
scraped_collection = joblib.load(file_name)

Define a master function that extracts all features for one project and a specific `section`.

In [5]:
def extract_features(soup, campaign, section):
    """Extract the 19 features of one campaign section.

    Parameters:
        soup: parsed project page, passed to the tag-counting helpers.
        campaign: dict mapping section name to its normalized text.
        section: which section to process, 'about' or 'risks'.

    Returns:
        A list of 19 NaNs when the section text is the 'section_not_found'
        sentinel; otherwise a 19-tuple in this order: sentence count, word
        count, all-caps count and ratio, exclamation count and ratio,
        apple-word count and ratio, average words per sentence, paragraph
        count, average sentences and words per paragraph, image, video,
        YouTube, GIF and hyperlink counts, bold count and ratio.

    The ratios are per word; they are NaN when the section has no words
    (for example a section holding only images), instead of raising
    ZeroDivisionError.
    """
    num_features = 19
    text = campaign[section]

    # Check the sentinel before doing any tokenisation work
    if text == 'section_not_found':
        return [np.nan] * num_features

    num_words = len(get_words(text))

    def per_word(count):
        # Ratio of `count` to the section's word count; NaN without words
        return count / num_words if num_words else np.nan

    # Each counter runs once and feeds both its raw and its per-word feature
    num_all_caps = len(identify_allcaps(text))
    num_exclms = count_exclamations(text)
    num_apple_words = count_apple_words(text)
    num_bolded = count_bolded(soup, section)

    return (
        len(get_sentences(text)),
        num_words,
        num_all_caps,
        per_word(num_all_caps),
        num_exclms,
        per_word(num_exclms),
        num_apple_words,
        per_word(num_apple_words),
        compute_avg_words(text),
        count_paragraphs(soup, section),
        compute_avg_sents_paragraph(soup, section),
        compute_avg_words_paragraph(soup, section),
        count_images(soup, section),
        count_videos(soup, section),
        count_youtube(soup, section),
        count_gifs(soup, section),
        count_hyperlinks(soup, section),
        num_bolded,
        per_word(num_bolded),
    )

Initialize empty DataFrames for each section containing column names.

In [6]:
# Column names, listed in the same order in which extract_features() returns
# its values
features = [
    'num_sents',
    'num_words',
    'num_all_caps',
    'percent_all_caps',
    'num_exclms',
    'percent_exclms',
    'num_apple_words',
    'percent_apple_words',
    'avg_words_per_sent',
    'num_paragraphs',
    'avg_sents_per_paragraph',
    'avg_words_per_paragraph',
    'num_images',
    'num_videos',
    'num_youtubes',
    'num_gifs',
    'num_hyperlinks',
    'num_bolded',
    'percent_bolded',
]

# One empty frame per campaign section, both with the same columns
section1_df, section2_df = (
    pd.DataFrame(columns=features) for _ in range(2)
)

Parse through each project using the scraped HTML and extract features for both sections.

In [7]:
# Process every scraped project. The loop used to run over
# `scraped_collection[0:4]` only, which left the output files -- labelled
# with the full project range -- holding just the first four projects.
for index, row in scraped_collection.iterrows():
    # Parse scraped HTML (the first column holds the scraped response)
    soup = parse(row[0])

    # Extract and normalize campaign sections
    campaign = get_campaign(soup)
    campaign['about'] = normalize(campaign['about'])
    campaign['risks'] = normalize(campaign['risks'])

    # Extract features for each section, keyed by the project's index label
    section1_df.loc[index] = extract_features(soup, campaign, 'about')
    section2_df.loc[index] = extract_features(soup, campaign, 'risks')

## Joining extracted features with features collected from the Web Robots database

Since the Web Robots data contains the target variable and other interesting features, we'll need to join these data with the DataFrames containing the newly extracted features.

In [8]:
# Load Web Robots data
# NOTE(review): the path is relative, so this depends on the notebook's
# working directory. joblib.load unpickles the file -- only load files that
# come from a trusted source.
web_robots_data = joblib.load(
    'data/web_robots_data/web_robots_data_to_06-2017.pkl'
)

Move the index labels into a new column called `index` to serve as the join key.

In [9]:
# Turn each frame's index labels into a regular 'index' column (the join key)
web_robots_data, section1_df, section2_df = (
    frame.reset_index()
    for frame in (web_robots_data, section1_df, section2_df)
)

Perform the join for each campaign section.

In [10]:
# Left-join so every extracted-feature row is kept, with the Web Robots
# columns attached wherever the 'index' key matches
section1_merged = pd.merge(
    section1_df, web_robots_data, how='left', on='index'
)
section2_merged = pd.merge(
    section2_df, web_robots_data, how='left', on='index'
)

Dump the merged DataFrames and label the filename.

In [18]:
# Determine the start and end projects from the collection file name, e.g.
# '.../scraped_collection_0-3158.pkl' gives 0 and 3158. Matching the
# 'collection_<start>-<end>.pkl' suffix keeps a hyphen elsewhere in the path
# (such as in a directory name) from being mistaken for the range separator.
range_match = re.search(r'collection_(\d+)-(\d+)\.pkl$', file_name)
if range_match is None:
    raise ValueError(
        'Cannot read the project range from file name: {!r}'.format(file_name)
    )
starting_point, ending_point = (int(bound) for bound in range_match.groups())

In [19]:
# Label each output file with the starting project and `ending_point - 1`
last_project = ending_point - 1
section1_path = 'section1_extracted_data_{}-{}.pkl'.format(
    starting_point, last_project
)
section2_path = 'section2_extracted_data_{}-{}.pkl'.format(
    starting_point, last_project
)

joblib.dump(section1_merged, section1_path)

joblib.dump(section2_merged, section2_path)

['section2_extracted_data_0-3157.pkl']