In [1]:

import requests
import numpy as np
import pandas as pd
from sklearn.externals import joblib
import time
import random
from IPython.core.display import clear_output

**Loading File with Scrapped Data**

In [3]:
df = joblib.load('/Users/shwetapai/Desktop/testing_1.pk')

In [4]:
df

Unnamed: 0,name,category,hyperlink,currency,pledged,goal,location,funded
0,Titan P-1 (Underwater Maned Ship),Robots,https://www.kickstarter.com/projects/160940677...,GBP,0.00,200000.0,"West Yorkshire, UK",False
1,Electric Coconut Scraper,Robots,https://www.kickstarter.com/projects/oritha/el...,GBP,688.00,15000.0,"Swansea City and County, UK",False
2,LoadAUF - The Autonomous User Follower,Robots,https://www.kickstarter.com/projects/154469338...,USD,49.00,3000.0,"Miami, FL",False
3,Project RoBro- The learning raspberry pi brain!,Robots,https://www.kickstarter.com/projects/192623886...,USD,5.50,25000.0,"San Angelo, TX",False
4,Jerry: An Affordable Personal Robot,Robots,https://www.kickstarter.com/projects/slantrobo...,USD,3290.00,20000.0,"Boise, ID",False
5,Airstring. GSM Telemetry for your drone,Robots,https://www.kickstarter.com/projects/215849561...,EUR,4887.00,30000.0,"Barcelona, Spain",False
6,Winston v2 - The robotic bartender!,Robots,https://www.kickstarter.com/projects/945840898...,EUR,1201.00,9990.0,"Merelbeke, Belgium",False
7,DonkiBot,Robots,https://www.kickstarter.com/projects/235508607...,USD,6835.00,500000.0,"Germantown, MD",False
8,The G.E.N.E.S.I.S. Robotics Platform,Robots,https://www.kickstarter.com/projects/rickrobot...,USD,156.00,50000.0,"Waverly, IA",False
9,OpenBook,Robots,https://www.kickstarter.com/projects/214341171...,EUR,0.00,12000.0,"Madrid, Spain",False


In [6]:
# Select projects described in American English

df_USD = df[df['currency'] == 'USD']

In [7]:
df_USD

Unnamed: 0,name,category,hyperlink,currency,pledged,goal,location,funded
2,LoadAUF - The Autonomous User Follower,Robots,https://www.kickstarter.com/projects/154469338...,USD,49.00,3000.0,"Miami, FL",False
3,Project RoBro- The learning raspberry pi brain!,Robots,https://www.kickstarter.com/projects/192623886...,USD,5.50,25000.0,"San Angelo, TX",False
4,Jerry: An Affordable Personal Robot,Robots,https://www.kickstarter.com/projects/slantrobo...,USD,3290.00,20000.0,"Boise, ID",False
7,DonkiBot,Robots,https://www.kickstarter.com/projects/235508607...,USD,6835.00,500000.0,"Germantown, MD",False
8,The G.E.N.E.S.I.S. Robotics Platform,Robots,https://www.kickstarter.com/projects/rickrobot...,USD,156.00,50000.0,"Waverly, IA",False
12,HydroActive & Flex Waterproof Headphones They ...,Sound,https://www.kickstarter.com/projects/494246809...,USD,35925.00,10000.0,"Corvallis, OR",True
14,Sound Affections - old fashion Greeting Cards ...,Sound,https://www.kickstarter.com/projects/206159179...,USD,19120.00,18000.0,"Louisville, KY",True
15,Obscura MIDI 8-bit NES & C64 Chiptune Synthesizer,Sound,https://www.kickstarter.com/projects/599725696...,USD,9477.00,2000.0,"Orlando, FL",True
16,VFE Pedals Live Series - guitar pedals and DIY...,Sound,https://www.kickstarter.com/projects/vfepedals...,USD,4232.00,2500.0,"Puyallup, WA",True
17,Revols - Premium Quick Custom-Fit Wireless Ear...,Sound,https://www.kickstarter.com/projects/revols/re...,USD,2530756.27,100000.0,"Montreal, Canada",True


In [8]:
len(df_USD['hyperlink'])

6016

**Extracting 'Project Description ' from the hyperlinks**

In [None]:
# Initalize an empty DataFrame to store scraped HTML
scraped_collection = pd.DataFrame(columns=['scraped_HTML'])

# Record the start time
start_time = time.time()

# Initialize the number of requests
request_count = 0
failed_count=0

# Select which projects to scrape via its index. This is used for starting
# at a position other than the beginning in case the scraper stopped 
# unexpectedly.
starting_point =0
ending_point = 5000

for index, row in df_USD[starting_point:ending_point].iterrows():
    #print(request_count)
    # Perform a request and timeout after 20 seconds since some pages may take
    # longer to scrape
    url=row['hyperlink']
    try:
        scraped_html = requests.get(row['hyperlink'], timeout=10)
        print ("OK -"), url
    except requests.exceptions.ReadTimeout:
        print ("READ TIMED OUT -"), url
        failed_count+=1
        continue
    except requests.exceptions.ConnectionError:
        print ("CONNECT ERROR -"), url
        failed_count+=1
        continue
        
    
    # Pause the loop for a random amount of time
    if request_count%10==0:
        time.sleep(random.uniform(2, 4))
    
    # Monitor the requests by clearing the output and displaying current 
    # progress
    elapsed_time = time.time() - start_time
    clear_output(wait = True)
    print(
        'Request: {}; Row ID: {}; Frequency: {} requests/sec'.format(
            request_count + starting_point,
            index,
            (request_count + 1) / elapsed_time
        )
    )
    request_count += 1
    
    
    # Record scraped HTML
    scraped_collection.loc[index, 'scraped_HTML'] = scraped_html
    
# Display the overall time, average scraping speed and total number of scraped
# project pages
run_time = time.time() - start_time
print()
print('Run time:', run_time)
print('Average rate:', len(scraped_collection) / run_time)
print('# of projects scraped:', len(scraped_collection))
print(failed_count)


Request: 4095; Row ID: 5376; Frequency: 0.12343572677935061 requests/sec


In [11]:
print(scraped_html)

<Response [200]>


In [7]:
# Serialize the data table containing the scraped HTML for each project
joblib.dump(
    scraped_collection, 'scraped_collection_{}-{}.pkl'.format(
        starting_point,
        ending_point - 1
    )
)

['scraped_collection_0-142.pkl']

**Scrapping Rest of the Data**

In [9]:
#scrapping rest of the data

# Initalize an empty DataFrame to store scraped HTML
scraped_collection = pd.DataFrame(columns=['scraped_HTML'])

# Record the start time
start_time = time.time()

# Initialize the number of requests
request_count = 0
failed_count=0

# Select which projects to scrape via its index. This is used for starting
# at a position other than the beginning in case the scraper stopped 
# unexpectedly.
starting_point =5001
ending_point = 6016

for index, row in df_USD[starting_point:ending_point].iterrows():
    #print(request_count)
    # Perform a request and timeout after 20 seconds since some pages may take
    # longer to scrape
    url=row['hyperlink']
    try:
        scraped_html = requests.get(row['hyperlink'], timeout=10)
        print ("OK -"), url
    except requests.exceptions.ReadTimeout:
        print ("READ TIMED OUT -"), url
        failed_count+=1
        continue
    except requests.exceptions.ConnectionError:
        print ("CONNECT ERROR -"), url
        failed_count+=1
        continue
        
    
    # Pause the loop for a random amount of time
    if request_count%10==0:
        time.sleep(random.uniform(2, 4))
    
    # Monitor the requests by clearing the output and displaying current 
    # progress
    elapsed_time = time.time() - start_time
    clear_output(wait = True)
    print(
        'Request: {}; Row ID: {}; Frequency: {} requests/sec'.format(
            request_count + starting_point,
            index,
            (request_count + 1) / elapsed_time
        )
    )
    request_count += 1
    
    
    # Record scraped HTML
    scraped_collection.loc[index, 'scraped_HTML'] = scraped_html
    
# Display the overall time, average scraping speed and total number of scraped
# project pages
run_time = time.time() - start_time
print()
print('Run time:', run_time)
print('Average rate:', len(scraped_collection) / run_time)
print('# of projects scraped:', len(scraped_collection))
print(failed_count)


Request: 6007; Row ID: 7883; Frequency: 0.47360519810264723 requests/sec

Run time: 2126.2467997074127
Average rate: 0.473604475331166
# of projects scraped: 1007
8


In [10]:
# Pickling the data table containing the scraped HTML for each project
joblib.dump(
    scraped_collection, 'scraped_collection_{}-{}.pkl'.format(
        starting_point,
        ending_point - 1
    )
)

['scraped_collection_5001-6015.pkl']