In [49]:
# I used anaconda prompt for python to download all packages.
# Import all libraries.
from selenium import webdriver
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import xlsxwriter

# Load the source workbook. The path is relative, so the file has to sit in
# the same folder as this notebook.
old_indiegogo = pd.read_excel('Nassia1.xlsx')

# Build the working frame that will receive the scraped text. Project_Title is
# carried over only as a placeholder and is renamed to Project_Story, the
# column the scraping step overwrites.
new_indiegogo = old_indiegogo.filter(items=['ProjectID', 'Project_Title'], axis=1)
new_indiegogo.columns = ['ProjectID', 'Project_Story']

# Project IDs are concatenated into URLs later on, so keep them as strings.
new_indiegogo['ProjectID'] = new_indiegogo['ProjectID'].astype(str)
new_indiegogo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
ProjectID        4 non-null object
Project_Story    4 non-null object
dtypes: object(2)
memory usage: 144.0+ bytes


In [50]:
# Scrape the story text of every project and store it in new_indiegogo.
# Each ProjectID is appended to the Indiegogo base URL to form the address of
# the page to scrape.

# Path to the ChromeDriver executable on this machine.
driverPath = 'C:/Users/nassi/Desktop/chromedriver.exe'

# Run Chrome headless so that no browser window opens while scraping.
options = webdriver.ChromeOptions()
options.add_argument("headless")

# Start ONE browser session for the whole run. Creating a new driver for every
# row without ever quitting it leaves a Chrome process behind per project.
driver = webdriver.Chrome(executable_path=driverPath, chrome_options=options)

try:
    # Implicit wait of up to 4 seconds.
    # NOTE(review): Selenium documents this as the timeout for element
    # lookups; no lookups are made below, so it likely has no effect on
    # page_source - confirm whether an explicit wait is needed instead.
    driver.implicitly_wait(4)

    for index, project_id in new_indiegogo['ProjectID'].items():

        # The ID needs to be a string in order to be used as part of the URL.
        project_id = str(project_id)
        url = 'https://www.indiegogo.com/projects/' + project_id

        # Load the project page and parse the rendered HTML.
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # Keep only the markup under the class routerContentStory-storyBody.
        # The matches are joined directly rather than via str(list), so no
        # list punctuation ('[', ', ', ']') is mixed into the text and the
        # story's own commas and brackets are left intact.
        story_divs = soup.find_all('div', {'class': 'routerContentStory-storyBody'})
        content = ''.join(str(div) for div in story_divs)

        # Remove all HTML tags. The closing '>' is searched for only AFTER the
        # opening '<'; pairing the first '<' with an earlier '>' would make
        # the string grow on every pass and never terminate.
        while True:
            tag_start = content.find('<')
            if tag_start == -1:
                break
            tag_end = content.find('>', tag_start + 1)
            if tag_end == -1:
                break
            content = content[:tag_start] + content[tag_end + 1:]

        # Further "prettify": drop double spaces, then turn newlines into
        # single spaces.
        content_new = content.replace('  ', '')
        content_new = content_new.replace('\n', ' ')

        # Write the result into the DataFrame itself. Assigning to the row
        # object yielded by iterrows() may only modify a copy.
        new_indiegogo.at[index, 'Project_Story'] = content_new
finally:
    # Always shut the browser down, even if a page fails to load.
    driver.quit()



In [51]:
# Show the working frame to check that Project_Story now holds the scraped text.
print(new_indiegogo)

  ProjectID                                      Project_Story
0    261303   Rains In LA tells the cute love story depicte...
1    261305   Gosu's overarching goal is to be Seattle's fi...
2    261310  FIRST Robotics Competition Team 980 ThunderBot...
3    261312   Short Summary My Father never Loved Me… How d...


In [52]:
# Attach the scraped stories to the original dataset. new_indiegogo was
# derived from old_indiegogo, so the assignment lines up on the shared row index.
old_indiegogo['Project_Story'] = new_indiegogo.Project_Story
print(old_indiegogo)

                           Hard link  ProjectID  \
0  www.indiegogo.com/projects/261303     261303   
1  www.indiegogo.com/projects/261305     261305   
2  www.indiegogo.com/projects/261310     261310   
3  www.indiegogo.com/projects/261312     261312   

                               Project_Title  \
0                                Rains In LA   
1       Gosu: Seattle's First eSports Lounge   
2  FRC Team 980 Thunderbots - FIRST Robotics   
3                            A Father's Love   

                                    Project_Subtitle Project_Category  \
0  A creative and exciting non profit short film ...             Film   
1  Gosu is Seattle's first eSports lounge, with c...           Gaming   
2  FIRST Robotics Competition Team 980 ThunderBot...        Education   
3  The Most Incredible Way to Spread the Love tha...          Writing   

  Project_City Project_Country                    Project_Location  \
0       London              GB              London, United Kingdom 

In [53]:
# Write the enriched dataset to a new xlsx workbook using the XlsxWriter engine.
# The writer is used as a context manager so the file is saved and closed when
# the block exits, even if to_excel raises. This also avoids ExcelWriter.save(),
# which is deprecated and no longer available in recent pandas releases.
with pd.ExcelWriter('final_indiegogo.xlsx', engine='xlsxwriter') as writer:
    # Convert the dataframe to an XlsxWriter Excel object.
    old_indiegogo.to_excel(writer)