In [7]:
# Description of steps:
# 1. Import the existing Indiegogo dataset and create a new dataset which will be used
#    to store the scraped information.
# 2. Load each unique project URL in chromedriver and let Beautiful Soup scrape
#    the information we want. The classes to look for were found by inspecting
#    a random Indiegogo project webpage (CTRL + SHIFT + I in Chrome).
# 3. Repeat the above process for all available projects, using the project ID.
# 4. Create the updated dataset by adding the newly extracted information to the old dataset.
# 5. Export an Excel file with the new information.

# All packages were installed beforehand from the Anaconda Prompt.
# Language: Python 3.6.

# Import all libraries.
from selenium import webdriver
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import xlsxwriter

# Import the dataset - it needs to be in the same folder as this notebook.
old_indiegogo = pd.read_excel('MERGED_FINAL1.xlsx')

# Create the dataset that will receive the scraped information.
# .copy() gives an independent frame: a plain assignment would only create a
# second name for old_indiegogo, so the columns added below (and the ProjectID
# cast) would silently modify the original data as well.
new_indiegogo = old_indiegogo.copy()

# Placeholder columns, pre-filled with the existing title; they are meant to
# be overwritten with the scraped story and title in the scraping cell.
new_indiegogo['Project_Story'] = new_indiegogo['Project_Title']
new_indiegogo['Project_Title_Scrap'] = new_indiegogo['Project_Title']

# The project ID is concatenated into the project URL later, so store it as text.
new_indiegogo['ProjectID'] = new_indiegogo.ProjectID.astype(str)

# Location of the chromedriver executable on this computer.
# NOTE(review): absolute, machine-specific path - adjust it before running elsewhere.
driverPath = 'C:/Users/nassi/Desktop/chromedriver.exe'

# Chrome options used when starting the driver.
options = webdriver.ChromeOptions()

# Run Chrome headless, i.e. do not open a browser window while scraping.
options.add_argument("headless")

# Start the driver that will load the project pages.
# NOTE(review): the executable_path / chrome_options keywords are deprecated in
# newer Selenium releases - confirm against the installed version.
driver = webdriver.Chrome(executable_path=driverPath, chrome_options=options)

new_indiegogo.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6704 entries, 0 to 6703
Data columns (total 6 columns):
Unnamed: 0             6704 non-null int64
Unnamed: 0.1           6704 non-null int64
ProjectID              6704 non-null object
Project_Title          6703 non-null object
Project_Story          6703 non-null object
Project_Title_Scrap    6703 non-null object
dtypes: int64(2), object(4)
memory usage: 314.3+ KB


In [8]:
# Visit the Indiegogo page of every project in new_indiegogo, scrape its story
# and title, store them in the 'Project_Story' and 'Project_Title_Scrap'
# columns, and export the result to MERGED_FINAL1_1.xlsx.


def _element_text(element):
    """Return the text contained in a BeautifulSoup element.

    Returns '' when the element is None (soup.find() found no match), so a
    missing section is stored as an empty string rather than the text 'None'.
    get_text() replaces manual '<' / '>' slicing, which never terminated when
    a '>' character appeared before the first '<'.
    """
    if element is None:
        return ''
    return element.get_text()


# Implicit wait of 10 seconds. It is a driver-wide setting, so it is set once
# here instead of on every iteration.
# NOTE(review): an implicit wait presumably only affects element lookups, not
# page_source; content rendered after driver.get() returns may be missed -
# confirm, and consider an explicit wait on the story element.
driver.implicitly_wait(10)

# Scraped values, collected in row order. Writing to the `row` objects yielded
# by DataFrame.iterrows() has no effect on the DataFrame (they are copies), so
# the results are gathered here and assigned as whole columns after the loop.
scraped_stories = []
scraped_titles = []

try:
    for project_id in new_indiegogo['ProjectID']:

        # The ID needs to be a string in order to be used as part of the URL.
        url = 'https://www.indiegogo.com/projects/' + str(project_id)

        # Load the project page with the driver.
        driver.get(url)

        # Create the Beautiful Soup object for scraping.
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # Keep only the text under the story and title containers.
        story = _element_text(
            soup.find('div', {'class': 'routerContentStory-storyBody'}))
        title = _element_text(
            soup.find('div', {'class': 'basicsSection-title fullhd t-h3--sansSerif'}))

        # Further "prettify" the story by removing unnecessary characters.
        # NOTE(review): removing ', ', '[' and ']' also deletes genuine
        # punctuation from the story, and removing double spaces can join
        # words; kept unchanged so the output format stays the same - confirm
        # this is intended.
        story = story.replace(', ', '')
        story = story.replace('[', '')
        story = story.replace(']', '')
        story = story.replace('  ', '')
        story = story.replace('\n', ' ')

        title = title.replace('  ', '').strip()

        scraped_stories.append(story)
        scraped_titles.append(title)
finally:
    # Always shut the headless browser down, even if a page fails to load.
    # The driver must be re-created (setup cell) before running this cell again.
    driver.quit()

# Add the scraped content to each project's row.
new_indiegogo['Project_Story'] = scraped_stories
new_indiegogo['Project_Title_Scrap'] = scraped_titles

# Write the final data for all projects to MERGED_FINAL1_1.xlsx. The context
# manager saves and closes the workbook when the block exits.
with pd.ExcelWriter('MERGED_FINAL1_1.xlsx', engine='xlsxwriter') as writer:
    new_indiegogo.to_excel(writer)