**Data Extraction and Cleaning from 'Project Description' of each Kickstarter Campaign**

This notebook has functions for extracting various features from the project description of each campaign.



In [1]:
import nltk
from bs4 import BeautifulSoup
import re
import lxml
import pandas as pd
import numpy as np
from sklearn.externals import joblib

**Helper Functions for text cleaning**

In [1]:
def parse_html(scraped_html):
    """Use the BeautifulSoup library to parse the scraped HTML of a project 
    using an lxml parser"""
    
    return BeautifulSoup(scraped_html.text, 'lxml')


def cleaning(text):    
    
    # Remove line breaks, leading and trailing whitespace, and compress all
    # whitespace to a single space
    text_cleaned = ' '.join(text.split()).strip()
    
    # Remove the HTML5 warning for videos
    return text_cleaned.replace("You'll need an HTML5 capable browser to see this content. " + \
        "Play Replay with sound Play with sound 00:00 00:00",' ')


def campaign_details(soup):
    
    # Collect the "About this project" section if available
    try:
        section1 = soup.find(
            'div',
            class_='full-description js-full-description responsive-media ' + \
                'formatted-lists'
        ).get_text(' ')
    except AttributeError:
        section1 = 'section_not_found'
    
    # Collect the "Risks and challenges" section if available, and remove all
    # unnecessary text
    try:
        section2 = soup.find(
            'div', 
            class_='mb3 mb10-sm mb3 js-risks'
        ) \
            .get_text(' ') \
            .replace('Risks and challenges',' ') \
            .replace('Learn about accountability on Kickstarter',' ')
    except AttributeError:
        section2 = 'section_not_found'
    
    # Clean both campaign sections
    return {'about': cleaning(section1), 'risks': cleaning(section2)}

def normalize(text):
    
    # Tag email addresses with regex
    normalized = re.sub(
        r'\b[\w\-.]+?@\w+?\.\w{2,4}\b',
        'emailaddr',
        text
    )
    
    # Tag hyperlinks with regex
    normalized = re.sub(
        r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)',
        'httpaddr',
        normalized
    )
    
    # Tag money amounts with regex
    normalized = re.sub(r'\$\d+(\.\d+)?', 'dollramt', normalized)
    
    # Tag percentages with regex
    normalized = re.sub(r'\d+(\.\d+)?\%', 'percntg', normalized)
    
    # Tag phone numbers with regex
    normalized = re.sub(
        r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
        'phonenumbr',
        normalized
    )
    
    # Tag remaining numbers with regex
    return re.sub(r'\d+(\.\d+)?', 'numbr', normalized)





**Helper Functions for extracting imp features from Campaign Descriptions**

In [3]:
def sentences_token(text):
     # Tokenize the text into sentences
    return nltk.sent_tokenize(text)


def punc_cleaning(text):
 
    # Remove punctuation with regex
    return re.sub(r'[^\w\d\s]|\_', ' ', text)


def token_words(text):
    
    # Remove punctuation and then tokenize the text into words
    return [word for word in nltk.word_tokenize(punc_cleaning(text))]


def search_allcaps(text):
        
    # Identify all-caps words with regex
    return re.findall(r'\b[A-Z]{2,}', text)


def exclamations_count(text):
    
    # Count the number of exclamation marks in the text
    return text.count('!')


def imp_words_count(text):
    # Define a set of adjectives used commonly by project writers
    imp_words = frozenset(
        ['revolutionary', 'breakthrough', 'beautiful', 'magical', 
        'gorgeous', 'amazing', 'incredible', 'awesome']
    )
    
    # Count total number of imp_words in the text
    return sum(1 for word in words_token(text) if word in imp_words)



def avg_words_count(text):
    
    # Compute the average number of words in each sentence
    return pd.Series(
        [len(words_token(sentence)) for sentence in sentences_token(text)]
    ).mean()



def paragraphs_count(soup, section):    
    
    # Use tree parsing to count the number of paragraphs depending on which
    # section is requested
    if section == 'about':
        return len(soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
                '-media formatted-lists'
        ).find_all('p'))
    elif section == 'risks':
        return len(soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
        ).find_all('p'))
    
def avg_sents_paragraph(soup, section):
    #look at 'about' section
    if section == 'about':
        paragraphs = soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
                '-media formatted-lists'
        ).find_all('p')
    elif section == 'risks':
        paragraphs = soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
        ).find_all('p')
    
    # Compute the average number of sentences in each paragraph    
    return pd.Series(
        [len(sentences_token(paragraph.get_text(' '))) for paragraph in \
         paragraphs]
    ).mean()


def avg_words_paragraph(soup, section):

    # Use tree parsing to identify all paragraphs depending on which section
    # is requested
    if section == 'about':
        paragraphs = soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
                '-media formatted-lists'
        ).find_all('p')
    elif section == 'risks':
        paragraphs = soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
        ).find_all('p')
    
    # Compute the average number of words in each paragraph
    return pd.Series(
        [len(words_token(paragraph.get_text(' '))) for paragraph in paragraphs]
    ).mean()

def images_count(soup, section):    
    
    # Use tree parsing to identify all image tags depending on which section
    # is requested
    if section == 'about':
        return len(soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
                '-media formatted-lists'
        ).find_all('img'))
    elif section == 'risks':
        return len(soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
        ).find_all('img'))
    
def videos_count(soup, section):    
    
    # Use tree parsing to count all non-YouTube video tags depending on which
    # section is requested
    if section == 'about':
        return len(soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
                '-media formatted-lists'
        ).find_all('div', class_='video-player'))
    elif section == 'risks':
        return len(soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
         ).find_all('div', class_='video-player'))

def youtube_count(soup, section):    
    
    # Initialize total number of YouTube videos
    youtube_count = 0

    # Use tree parsing to identify all iframe tags depending on which section
    # is requested
    if section == 'about':
        iframes = soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
            '-media formatted-lists'
        ).find_all('iframe')
    elif section == 'risks':
        iframes = soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
        ).find_all('iframe')
    
    # Since YouTube videos are contained only in iframe tags, determine which
    # iframe tags contain YouTube videos and count them
    for iframe in iframes:
        # Catch any iframes that fail to include a YouTube source link
        try:
            if 'youtube' in iframe.get('src'):
                youtube_count += 1
        except TypeError:
            pass
    
    return youtube_count


def hyperlinks_count(soup, section):    
    """Count the number of hyperlink tags in a campaign section"""
    # Use tree parsing to compute number of hyperlink tags depending on the
    # section requested
    if section == 'about':
        return len(soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
                '-media formatted-lists'
        ).find_all('a'))
    elif section == 'risks':
        return len(soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
        ).find_all('a'))
    
def count_bold_tags(soup, section):    
    """Count the number of bold tags in a campaign section"""
    
    # Use tree parsing to compute number of bolded text tags depending on which
    # section is requested
    if section == 'about':
        return len(soup.find(
            'div',
            class_='full-description js-full-description responsive' + \
                '-media formatted-lists'
        ).find_all('b'))
    elif section == 'risks':
        return len(soup.find(
            'div',
            class_='mb3 mb10-sm mb3 js-risks'
        ).find_all('b'))

def preprocess_text(text):
    """Perform text preprocessing such as removing punctuation, lowercasing all
    words, removing stop words and stemming remaining words"""
    
    # Access stop word dictionary
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Initialize the Porter stemmer
    porter = nltk.PorterStemmer()
    
    # Remove punctuation and lowercase each word
    text = remove_punc(text).lower()
    
    # Remove stop words and stem each word
    return ' '.join(
        porter.stem(term )
        for term in text.split()
        if term not in set(stop_words)
    )

**Loading html scrapped data**

In [14]:
# Load scraped data
filename1 = '/Users/shwetapai/Desktop/Project5/Data/scraped_collection_0-4999.pkl'
filename2='/Users/shwetapai/Desktop/Project5/Data/scraped_collection_5001-6015.pkl'
scraped_data1 = joblib.load(filename1)
scarped_data2=joblib.load(filename2)

**Super function to extract all features from the text of a campaign**

In [9]:
def feature_extraction(soup, campaign, section):
    """Extract all the features of the text of campaign section"""
   
    
    # Compute the number of words in the section
    num_words = len(words_token(campaign[section]))
    
    # If the section contains no words, assign NaN to 'num_words' to avoid
    # potential division by zero
    if num_words == 0:
        num_words = np.nan
        
    #If the section isn't available, then return NaN for each meta feature.
    if campaign[section] == 'section_not_found':
        return([np.nan] * 19)
    else:
        return (
            len(sentences_token(campaign[section])),  #number of the sentence
            num_words,                                # number of words
            len(search_allcaps(campaign[section])), # number of all_caps
            len(search_allcaps(campaign[section])) / num_words,  #% of all caps
            exclamations_count(campaign[section]),              #number of exclamations
            exclamations_count(campaign[section]) / num_words,    #% of exclamations
            imp_words_count(campaign[section]),                   #number of buzz words
            imp_words_count(campaign[section]) / num_words,     #% of buzz words
            avg_words_count(campaign[section]),                #number of avg words
            paragraphs_count(soup, section),                     #number of paragraphs
            avg_sents_paragraph(soup, section),          #number of sentences per paragraph
            avg_words_paragraph(soup, section),          #number of words per paragraph
            images_count(soup, section),                         #number of images
            videos_count(soup, section),                        # number of videos
            youtube_count(soup, section),                       #number of youtube videos
            hyperlinks_count(soup, section),                    #number of hyperlinks
            count_bold_tags(soup, section),                      #number of bold tag
            count_bold_tags(soup, section) / num_words,          #%of bold tags
            campaign[section]
        )

In [12]:
# Initialize empty DataFrames of features for each section
features = ['num_sents', 'num_words', 'num_all_caps', 'percent_all_caps',
            'num_exclms', 'percent_exclms', 'num_imp_words',
            'percent_imp_words', 'avg_words_per_sent', 'num_paragraphs',
            'avg_sents_per_paragraph', 'avg_words_per_paragraph',
            'num_images', 'num_videos', 'num_youtubes',
            'num_hyperlinks', 'num_bolded', 'percent_bolded',
            'normalized_text']
df_description = pd.DataFrame(columns=features)

df_description1 = pd.DataFrame(columns=features)


In [23]:
def parse(scraped_html):
    """Use the BeautifulSoup library to parse the scraped HTML of a project 
    using an lxml parser"""

    # Parse the HTML content using an lxml parser
    return BeautifulSoup(scraped_html.text, 'lxml')


#parsing scrapped html data
for index, row in scraped_data1.iterrows():
    
    soup = parse(row[0])

    
    # Normalize campaign sections ('About and 'Risks')
    campaign = campaign_details(soup)
    campaign['about'] = normalize(campaign['about'])
    campaign['risks'] = normalize(campaign['risks'])
    
    
    # Extract meta features for each section
    df_description.loc[index] = feature_extraction(soup, campaign, 'about')
    

In [16]:
def parse(scraped_html):
    """Use the BeautifulSoup library to parse the scraped HTML of a project 
    using an lxml parser"""

    # Parse the HTML content using an lxml parser
    return BeautifulSoup(scraped_html.text, 'lxml')


#parsing scrapped html data
for index, row in scarped_data2.iterrows():
    
    soup = parse(row[0])

    
    # Normalize campaign sections ('About and 'Risks')
    campaign = campaign_details(soup)
    campaign['about'] = normalize(campaign['about'])
    
    
    # Extract meta features for each section
    df_description1.loc[index] = feature_extraction(soup, campaign, 'about')
    

In [24]:
df_description1.head()

Unnamed: 0,num_sents,num_words,num_all_caps,percent_all_caps,num_exclms,percent_exclms,num_imp_words,percent_imp_words,avg_words_per_sent,num_paragraphs,avg_sents_per_paragraph,avg_words_per_paragraph,num_images,num_videos,num_youtubes,num_hyperlinks,num_bolded,percent_bolded,normalized_text
6568,65,1248,24,0.019231,15,0.012019,1,0.000801,19.2,16,2.6875,55.3125,21,0,0,0,10,0.008013,We've reached our first stretch goal of dollra...
6569,48,916,20,0.021834,11,0.012009,1,0.001092,19.083333,17,2.588235,49.235294,7,0,0,5,10,0.010917,You can fund the next generation of entreprene...
6571,101,1845,63,0.034146,0,0.0,0,0.0,18.267327,40,1.9,30.2,11,1,0,1,52,0.028184,"STRETCH GOAL dollramt,numbr. All backers will ..."
6572,34,532,22,0.041353,0,0.0,0,0.0,15.647059,31,1.774194,15.83871,15,0,0,1,22,0.041353,Fast percntg Charging (numbrV numbrA Input ) I...
6573,32,551,14,0.025408,5,0.009074,1,0.001815,17.21875,17,2.058824,29.470588,13,0,0,7,8,0.014519,The Purpose: The purpose of The Sirens Project...


In [26]:
df_description.tail()

Unnamed: 0,num_sents,num_words,num_all_caps,percent_all_caps,num_exclms,percent_exclms,num_imp_words,percent_imp_words,avg_words_per_sent,num_paragraphs,avg_sents_per_paragraph,avg_words_per_paragraph,num_images,num_videos,num_youtubes,num_hyperlinks,num_bolded,percent_bolded,normalized_text
6561,48,833,6,0.007203,7,0.008403,0,0.0,17.354167,16,3.1875,50.8125,0,0,0,4,5,0.006002,Imagine a world where there are people that ha...
6562,28,420,0,0.0,0,0.0,1,0.002381,15.0,12,2.666667,35.0,0,0,0,1,4,0.009524,"Synopsis Set in Orlando, Florida, Blood Child ..."
6563,42,851,21,0.024677,2,0.00235,2,0.00235,20.261905,14,3.285714,58.357143,3,0,0,3,9,0.010576,Click the image to read about these books The ...
6564,10,149,0,0.0,0,0.0,0,0.0,14.9,1,10.0,149.0,0,0,0,0,0,0.0,I originally planned to quietly publish this n...
6566,120,2032,17,0.008366,26,0.012795,1,0.000492,16.933333,12,3.75,67.0,3,0,0,3,30,0.014764,Thank You! To get in touch with me about my pr...


**Concatenating both df_description and  df_description1**

In [27]:
frames = [df_description,df_description1]

combined_df = pd.concat(frames)

In [29]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5938 entries, 2 to 7883
Data columns (total 19 columns):
num_sents                  5922 non-null object
num_words                  5900 non-null object
num_all_caps               5922 non-null object
percent_all_caps           5900 non-null float64
num_exclms                 5922 non-null object
percent_exclms             5900 non-null float64
num_imp_words              5922 non-null object
percent_imp_words          5900 non-null float64
avg_words_per_sent         5902 non-null float64
num_paragraphs             5922 non-null object
avg_sents_per_paragraph    5832 non-null float64
avg_words_per_paragraph    5832 non-null float64
num_images                 5922 non-null object
num_videos                 5922 non-null object
num_youtubes               5922 non-null object
num_hyperlinks             5922 non-null object
num_bolded                 5922 non-null object
percent_bolded             5900 non-null float64
normalized_text       

**Joining extracted features from the'project description' with other features from the Web Robots data**

The Web Robots data contains the target variable, in addition to other interesting features, let's join these data with the extracted meta features and normalized text to complete the dataset.

In [30]:
# Load Web Robots data
web_robots_data = joblib.load('/Users/shwetapai/Desktop/Project5/Data/testing_1.pk')

In [31]:
# Turn the index labels into a new column
web_robots_data = web_robots_data.reset_index()

combined_df = combined_df.reset_index()


Now let's join the extracted meta features and normalized text with the Web Robots data, containing the target variable, for each campaign section using only the projects whose features have been processed.

In [32]:
# Join Web Robots data with extracted features for each section
final_df = combined_df.merge(web_robots_data, how='left', on='index')


In [33]:
#pickling the datasets

joblib.dump(final_df, 'final_all_features.pkl')


['final_all_features.pkl']

In [None]:
#pickling the datasets

joblib.dump(section1_merged, 'finalsection1_all_features.pkl')



In [18]:
joblib.dump(section2_merged, 'finalsection2_all_features.pkl')

['finalsection2_all_features.pkl']