In [1]:
import pandas as pd

import json
import re
import os
import time
import pickle

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

import requests
from bs4 import BeautifulSoup

# Path to the chromedriver executable. The original macOS location remains the
# default; set the CHROMEDRIVER_PATH environment variable to run the notebook on
# a machine where the binary lives elsewhere.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

# Start a Chrome session driven through that executable
driver = webdriver.Chrome(chromedriver)

### Launch the Kickstarter website

In [2]:
# Starting point of the scrape: Kickstarter's advanced "discover" page
KICKSTARTER_URL = 'https://www.kickstarter.com/discover/advanced'
# Load that page in the Selenium-controlled browser
driver.get(KICKSTARTER_URL)

### Select the location as United States from the pull-down menu

In [3]:
# Click the element with id "location_filter", then the link labelled 'United States'
driver.find_element_by_id("location_filter").click()
driver.find_element_by_link_text('United States').click()

### Sort the entries on the page based on Most Funded

In [4]:
# Click the element with id "sorts", then the link labelled 'Most Funded'
driver.find_element_by_id("sorts").click()
driver.find_element_by_link_text('Most Funded').click()

### Find the names and ids of the various categories of the projects

In [5]:
# Click the category filter and collect every <li> of the root-category menu
driver.find_element_by_id("category_filter").click()
categories_menu = driver.find_element_by_class_name('js-root-categories')
categories_list = categories_menu.find_elements_by_tag_name('li')

# Parallel lists: the visible label of each entry and the data-id of its link
categories_name = []
categories_id = []
for entry in categories_list:
    categories_name.append(entry.text)
    categories_id.append(entry.find_element_by_tag_name('a').get_attribute('data-id'))

### Ignore the first entry

In [6]:
# Discard the leading menu entry from both parallel lists
categories_name, categories_id = categories_name[1:], categories_id[1:]

# Print every remaining category as "<id>---<name>"
for id_and_name in zip(categories_id, categories_name):
    print('---'.join(id_and_name))

1---Art
3---Comics
26---Crafts
6---Dance
7---Design
9---Fashion
11---Film & Video
10---Food
12---Games
13---Journalism
14---Music
15---Photography
18---Publishing
16---Technology
17---Theater


### Get the URL of the first page of the first category 

In [7]:
# Remember the URL before navigating so the change can be detected afterwards
prev_url = driver.current_url
driver.find_element_by_link_text(categories_name[0]).click()
driver.find_element_by_id('category_'+categories_id[0]).click()

# Poll until the browser URL changes. Sleeping between polls avoids hammering
# the driver, and the deadline stops the cell from spinning forever if the
# clicks never trigger a navigation.
url_wait_deadline = time.time() + 30
category_url = driver.current_url
while prev_url==category_url:
    if time.time() > url_wait_deadline:
        raise RuntimeError('URL did not change within 30 seconds of selecting the category')
    time.sleep(0.25)
    category_url = driver.current_url

print(category_url)

https://www.kickstarter.com/discover/advanced?category_id=1&woe_id=23424977&sort=most_funded&seed=2529653&page=1


### Generate the URLs of the other categories using the above URL as the base

In [8]:
# Build one listing URL per category from the first category's URL.
# Only the value of the `category_id` query parameter is swapped. Anchoring on
# the parameter name (instead of on the first "1" character in the URL) keeps
# the result correct when the base URL's id is not 1 or when another "1"
# appears earlier in the URL.
category_url_list = [
    re.sub(r'category_id=\d+', 'category_id=' + cid, category_url, count=1)
    for cid in categories_id
]

In [9]:
# Display the generated per-category URLs for a visual check
category_url_list

['https://www.kickstarter.com/discover/advanced?category_id=1&woe_id=23424977&sort=most_funded&seed=2529653&page=1',
 'https://www.kickstarter.com/discover/advanced?category_id=3&woe_id=23424977&sort=most_funded&seed=2529653&page=1',
 'https://www.kickstarter.com/discover/advanced?category_id=26&woe_id=23424977&sort=most_funded&seed=2529653&page=1',
 'https://www.kickstarter.com/discover/advanced?category_id=6&woe_id=23424977&sort=most_funded&seed=2529653&page=1',
 'https://www.kickstarter.com/discover/advanced?category_id=7&woe_id=23424977&sort=most_funded&seed=2529653&page=1',
 'https://www.kickstarter.com/discover/advanced?category_id=9&woe_id=23424977&sort=most_funded&seed=2529653&page=1',
 'https://www.kickstarter.com/discover/advanced?category_id=11&woe_id=23424977&sort=most_funded&seed=2529653&page=1',
 'https://www.kickstarter.com/discover/advanced?category_id=10&woe_id=23424977&sort=most_funded&seed=2529653&page=1',
 'https://www.kickstarter.com/discover/advanced?category_id=1

In [10]:
# Close the Selenium browser session; the cells below fetch pages with requests
driver.quit()

### Get the project info for all the projects in each category 

### Using BeautifulSoup

In [None]:
# Using BeautifulSoup
# Walk the result pages of every category and collect one flat record (dict)
# per project card into all_proj_list.
all_proj_list = []

for category, url in zip(categories_name, category_url_list):
    page_num = 1
    while True:
        # Assumes the category URL ends in the single digit of "page=1": drop
        # that last character and append the page number wanted.
        curr_url = url[:-1]+str(page_num)
        # The timeout turns a stalled connection into an exception instead of
        # letting the cell hang indefinitely.
        response = requests.get(curr_url, timeout=30)
        if response.status_code!=200:
            print('Category: %s has no page: %d' %(category, page_num))
            break

        print(curr_url)

        page_soup = BeautifulSoup(response.text, "html5lib")
        div_tags = page_soup.findAll('div', {'class':'js-react-proj-card col-full col-sm-12-24 col-lg-8-24'})

        # A 200 response without any project card would otherwise keep this
        # loop running forever, so an empty page also ends the category.
        if not div_tags:
            print('Category: %s has no projects on page: %d' %(category, page_num))
            break

        for tag in div_tags:
            # Each card carries the project's data as JSON in its data-project attribute
            proj = json.loads(tag['data-project'])
            proj_dict = {'category': category,
                         'project_id': proj['id'],
                         'project_name': proj['name'],
                         'blurb':proj['blurb'],
                         'goal_amount':proj['goal'],
                         'pledged_amount':proj['pledged'],
                         'fx_rate':proj['fx_rate'],
                         'currency':proj['currency'],
                         'project_status':proj['state'],
                         'location':proj['location']['short_name'],
                         'city':proj['location']['localized_name'],
                         'state':proj['location']['state'],
                         'country':proj['country'],
                         'subcategory':proj['category']['name'],
                         'percent_funded':proj['percent_funded'],
                         'state_changed_at':proj['state_changed_at'],
                         'created_at':proj['created_at'],
                         'launched_at':proj['launched_at'],
                         'deadline':proj['deadline'],
                         'backers':proj['backers_count'],
                         'project_url':proj['urls']['web']['project'],
                         'rewards_url':proj['urls']['web']['rewards']
                        }
            all_proj_list.append(proj_dict)

        page_num += 1
        

### Tried the same with Selenium !!!

### Create a pandas dataframe

In [None]:
# Field names of a scraped record (inspects the last dict built by the scraping loop)
proj_dict.keys()

In [None]:
# Column order for the project table
project_columns = [
    'category', 'subcategory', 'project_id', 'project_name', 'blurb',
    'goal_amount', 'pledged_amount', 'percent_funded', 'currency', 'fx_rate',
    'project_status', 'location', 'city', 'state', 'country',
    'state_changed_at', 'created_at', 'launched_at', 'deadline',
    'backers', 'project_url', 'rewards_url',
]

# One row per scraped project record, columns arranged as listed above
project_df = pd.DataFrame(all_proj_list, columns=project_columns)

In [None]:
# Persist the raw scrape to CSV, without the DataFrame index column
project_df.to_csv('KickstarterScrapeDataBS.csv', index=False)

## Initial cleaning of data

In [None]:
# Load the raw scrape back from the CSV file written by the scraping step
project_df = pd.read_csv('KickstarterScrapeDataBS.csv')

### Convert dates to datetime objects

In [None]:
# Interpret each timestamp column as seconds since the epoch and convert it to datetime
for date_col in ('state_changed_at', 'created_at', 'launched_at', 'deadline'):
    project_df[date_col] = pd.to_datetime(project_df[date_col], unit='s')

In [None]:
# Campaign length in whole days: deadline minus launch time
campaign_span = project_df['deadline'] - project_df['launched_at']
project_df['duration'] = campaign_span.dt.days

In [None]:
# Rows sharing both project id and project URL are duplicates; drop them in place
duplicate_keys = ['project_id', 'project_url']
project_df.drop_duplicates(subset=duplicate_keys, inplace=True)
project_df.shape

In [None]:
# Keep only rows whose country code is exactly 'US'
is_us_project = project_df['country'].eq('US')
project_df = project_df[is_us_project]
project_df.shape

In [None]:
# Drop projects whose status is 'suspended' or 'canceled'
excluded_statuses = ['suspended', 'canceled']
has_excluded_status = project_df['project_status'].isin(excluded_statuses)
project_df = project_df[~has_excluded_status]
project_df.shape

In [None]:
# Preview the first rows of the cleaned table
project_df.head()

### Get the reward levels for all the projects

In [None]:
def getRewardsBackers(rewards_url):
    """Scrape the reward amounts and backer counts listed on a rewards page.

    Parameters
    ----------
    rewards_url : str
        URL of the page to fetch.

    Returns
    -------
    tuple of (list of str, list of str), or None
        ``(rewards_list, backers_list)``: the raw text of the ``money`` span
        and of the ``pledge__backer-count`` span for every ``pledge__info``
        block that contains both. The two lists are index-aligned.
        ``None`` when the response status is not 200.
    """
    # Fetch the URL passed in as the argument, not a notebook-level global.
    # The timeout turns a stalled connection into an exception instead of a hang.
    response = requests.get(rewards_url, timeout=30)
    if response.status_code!=200:
        print('request failed at url: %s' %(rewards_url,))
        return None
    soup = BeautifulSoup(response.text, "html5lib")

    rewards_list = []
    backers_list = []

    for pledge in soup.findAll('div',{'class':'pledge__info'}):
        amount_tag = pledge.find('span',{'class':'money'})
        count_tag = pledge.find('span',{'class':'pledge__backer-count'})
        # Record a tier only when both pieces exist, so the lists stay aligned
        if amount_tag is None or count_tag is None:
            continue
        rewards_list.append(amount_tag.text)
        backers_list.append(count_tag.text)

    return rewards_list, backers_list

In [None]:
# Raw reward / backer strings, one inner list per row of project_df
all_proj_rewards = []
all_proj_backers = []
# Series.as_matrix() was removed in pandas 1.0; .values yields the same ndarray
rewards_url_arr = project_df['rewards_url'].values

# idx is the 1-based position of the URL being processed
for idx, url in enumerate(rewards_url_arr, start=1):
    scraped = getRewardsBackers(url)
    if scraped is None:
        # A failed request returns None. Store empty lists so both outputs keep
        # exactly one entry per project and still line up with project_df rows.
        rewards_list, backers_list = [], []
    else:
        rewards_list, backers_list = scraped

    all_proj_rewards.append(rewards_list)
    all_proj_backers.append(backers_list)

    # Lightweight progress indicator: print every 200th URL
    if (idx%200==0):
        print(url)


In [None]:
# Pickle each raw list to its own file
pickle_targets = (
    ("all_proj_rewards2.txt", all_proj_rewards),
    ("all_proj_backers2.txt", all_proj_backers),
)
for pickle_path, payload in pickle_targets:
    with open(pickle_path, "wb") as pickle_file:
        pickle.dump(payload, pickle_file)

### create the rewards_tier and backers_tier columns

In [None]:
# Reward strings: strip every non-digit character
proj_rewards = [
    [re.sub('[^0-9]', '', raw_amount) for raw_amount in rewards]
    for rewards in all_proj_rewards
]

# Backer strings: trim whitespace, then remove the " backer"/" backers" suffix
proj_backers = [
    [re.sub(' backers*', '', raw_count.strip()) for raw_count in backers]
    for backers in all_proj_backers
]

In [None]:
# Attach the cleaned per-project lists as new columns (one list per row)
for tier_col, tier_values in (('rewards_tier', proj_rewards),
                              ('backers_tier', proj_backers)):
    project_df[tier_col] = tier_values

### Save the dataframe to file

In [None]:
# Write the final table to CSV, without the DataFrame index column
project_df.to_csv('KickstarterScrapeDataBS_final.csv', index=False)

In [None]:
# Also pickle the final table (note: the file is a pickle despite the .txt extension)
project_df.to_pickle('KickstarterScrapeDataBS_final.txt')

### Tried with Selenium 