In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import multiprocessing
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from multiprocessing import Pool

from selenium.common.exceptions import TimeoutException

import pickle
import random
import numpy as np

In [2]:
import sys
from pathlib import Path

# Put the sibling ``lib`` directory (resolved against the current working
# directory) on the import path so modules stored there can be imported.
sys.path.extend([str(Path("../lib/").absolute())])

In [18]:
# Starting frontier: the ids the first crawl round will visit.
seeds_to_visit = set((
    503443009,
    482158197,
    482141037,
    663930464,
    743478194,
    670283087,
))

# Crawl results keyed by id; starts empty and is filled in by later cells.
scraped = dict()


In [24]:
def _load_pickle_or(path, default):
    """Return the unpickled contents of ``path``, or ``default`` if the file is missing.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising.
    Only point this at checkpoint files written by this notebook itself.
    """
    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        # First run: no checkpoint has been written yet, so fall back to the default.
        return default


# Merge any previously saved frontier into the seeds defined above. Calling
# ``union`` on the in-memory set accepts whatever iterable the file held.
seeds_to_visit = seeds_to_visit.union(_load_pickle_or('seeds_to_visit.pkl', set()))

# Merge previously scraped results into the in-memory dict (saved entries
# overwrite in-memory entries with the same key, as ``dict.update`` does).
loaded = _load_pickle_or('wattpad_scraped.pkl', {})
scraped.update(loaded)

In [25]:
def open_headless():
    """Start a Firefox WebDriver session without a visible window.

    Returns:
        The newly created ``webdriver.Firefox`` instance. The caller owns it
        and is responsible for calling ``quit()`` when finished.
    """
    # Imported locally, as in the original cell, so this function is
    # self-contained when shipped to a worker process.
    from selenium.webdriver.firefox.options import Options

    options = Options()
    # Pass Firefox's own command-line flag instead of the ``headless``
    # property setter: the setter is deprecated and absent from newer Selenium
    # releases, where assigning to it would silently do nothing and a visible
    # browser window would open.
    options.add_argument("-headless")
    return webdriver.Firefox(options=options)

def scrape_pages(shared_scraped, shared_next, pages):
    """Visit every id in ``pages`` with one headless browser and record the results.

    For each page:

    * every 9-digit number that follows ``story/`` or ``wattpad.com/`` in the
      page source is treated as a candidate id; candidates that are neither
      the current page nor already a key of ``shared_scraped`` are appended
      to ``shared_next``;
    * the text of the first ``<pre>`` element is stored in ``shared_scraped``
      under the page's id, or ``None`` if the page has no ``<pre>`` element.

    Args:
        shared_scraped: dict-like mapping id -> text (or ``None``); updated in place.
        shared_next: list-like; newly discovered ids are appended as ``int``.
        pages: iterable of ids to visit.
    """
    import re

    driver = open_headless()
    try:
        for story_id in pages:
            # Callers may hand over numpy integers (e.g. chunks produced by
            # np.array_split); normalise so dict keys are plain ints.
            story_id = int(story_id)
            driver.get('https://www.wattpad.com/' + str(story_id))

            # scrape the page to look for any stories we could navigate to.
            # The regex yields strings, so convert to int *before* comparing
            # against the integer keys and the integer story_id -- comparing
            # str with int never matches and would re-queue every id.
            matches = re.findall('(?:story|wattpad.com)/([0-9]{9})', driver.page_source)
            new_stories = {int(match) for match in matches}
            print('On', story_id, 'Found', len(new_stories))
            for new_id in new_stories:
                if new_id not in shared_scraped and new_id != story_id:
                    shared_next.append(new_id)

            # Locator-based lookup works on both Selenium 3 and 4; the
            # find_elements_by_* helpers no longer exist in Selenium 4.
            elements = driver.find_elements(By.CSS_SELECTOR, "pre")
            if len(elements) > 0:
                if story_id in shared_scraped:
                    print('Story Id', story_id, "already parsed!")
                shared_scraped[story_id] = elements[0].text
                print('Added story', len(shared_scraped))
            else:
                shared_scraped[story_id] = None
                print('Found Nothing on', str(story_id))
    finally:
        # Always shut the browser down, otherwise every task leaves a
        # Firefox process behind.
        driver.quit()
    print('Finished')
    

In [26]:
def scrape_pool(already_scraped, to_visit, num_processes=2):
    """Run one crawl round over ``to_visit`` using a pool of worker processes.

    The ids in ``to_visit`` are split into ``num_processes`` chunks and each
    non-empty chunk is handed to ``scrape_pages`` in a worker. Workers share
    their results through manager-backed proxies.

    Args:
        already_scraped: dict of results from earlier rounds; copied into the
            shared dict and not modified.
        to_visit: collection of ids to scrape in this round.
        num_processes: number of worker processes (and chunks).

    Returns:
        A ``(scraped, remaining)`` tuple: ``scraped`` is a plain dict holding
        the earlier results plus this round's, and ``remaining`` is the set
        of queued ids that are not keys of ``scraped``.
    """
    def _report_failure(exc):
        # apply_async drops worker exceptions unless the result is fetched;
        # surface them so a failed chunk is visible, without aborting the round.
        print('Worker failed:', repr(exc))

    pool = Pool(num_processes)
    try:
        # The context manager shuts the manager's server process down on
        # exit; without it every crawl round leaks one process.
        with multiprocessing.Manager() as manager:
            shared_scraped = manager.dict(already_scraped)
            shared_next = manager.list(to_visit)

            for id_chunk in np.array_split(np.array(list(to_visit)), num_processes):
                # With fewer ids than processes some chunks are empty; skip
                # them rather than starting a browser that has nothing to do.
                if len(id_chunk) == 0:
                    continue
                # .tolist() turns numpy integers back into plain ints.
                pool.apply_async(
                    scrape_pages,
                    args=(shared_scraped, shared_next, id_chunk.tolist()),
                    error_callback=_report_failure,
                )

            pool.close()
            pool.join()

            # Copy out of the proxies while the manager is still running.
            scraped_now = shared_scraped.copy()
            remaining = set(shared_next) - set(scraped_now.keys())
    finally:
        # No-op after a clean close/join; reclaims workers if anything above raised.
        pool.terminate()
    return scraped_now, remaining


In [27]:
# Crawl in rounds: each round scrapes the current frontier and returns the
# next one. Stop at 1000 recorded ids, or as soon as the frontier is empty --
# with nothing left to visit ``scraped`` can no longer grow, so the length
# check alone would loop forever.
while len(scraped) < 1000 and seeds_to_visit:
    scraped, seeds_to_visit = scrape_pool(scraped, seeds_to_visit, num_processes=10)
    print(len(scraped), len(seeds_to_visit))

On 153022603 Found 11
On 152980148 Found 11
Added story 296
Added story 297
Finished
On 152943867 Found 11
On 152966821 Found 11
Added story 298
Finished
Added story 299
On 153416193 Found 11
Added story 300
On 152998429 Found 11
Added story 301
On 153038268 Found 11
On 743478194 Found 40
Added story 302
Finished
Added story 303
On 153266681 Found 11
Added story 304
Finished
On 670283087 Found 41
Added story 305
Finished
On 482158197 Found 104
Story Id 482158197 already parsed!
Added story 305
Finished
On 152946739 Found 12
Added story 306
Finished
On 503443009 Found 28
Story Id 503443009 already parsed!
Added story 306
Finished
On 663930464 Found 23
Added story 307
Finished
On 482141037 Found 103
Story Id 482141037 already parsed!
Added story 307
Finished
307 99
On 669991298 Found 40
Added story 308
On 674125094 Found 42
On 620139650 Found 37
On 643351254 Found 40
Added story 309
Added story 310
Added story 311
On 617962940 Found 38
On 639316318 Found 37
Added story 312
Added story 31

Added story 492
On 142618137 Found 19
Added story 493
On 192835465 Found 14
Added story 494
On 141418918 Found 19
Added story 495
On 214273530 Found 14
Added story 496
On 128920880 Found 2
Added story 497
On 193251496 Found 13
On 126531100 Found 18
On 193886669 Found 14
On 118048688 Found 14
On 175320692 Found 39
Added story 498
Added story 499
On 119465356 Found 15
Added story 500
Added story 501
Added story 502
Added story 503
On 178692129 Found 2
Found Nothing on 178692129
On 194244567 Found 13
On 193172095 Found 2
Found Nothing on 193172095
Added story 506
On 153565618 Found 5
Added story 507
Finished
On 191719309 Found 14
Added story 508
Finished
On 116523048 Found 2
Found Nothing on 116523048
Finished
On 129821824 Found 17
Added story 510
On 179357274 Found 13
On 188647597 Found 192
Added story 511
Added story 512
On 166165631 Found 108
Added story 513
On 183232314 Found 14
Added story 514
Finished
On 195827888 Found 2
Found Nothing on 195827888
On 177347204 Found 2
Found Nothing

On 198831158 Found 107
On 209072973 Found 39
Added story 699
Added story 700
Added story 701
On 195994921 Found 39
On 385797596 Found 44
Added story 702
Added story 703
On 155927182 Found 201
On 210057662 Found 201
Added story 704
On 155193381 Found 107
Added story 705
Added story 706
On 194580366 Found 201
Added story 707
On 183357166 Found 191
Added story 708
On 155050431 Found 108
On 214006414 Found 192
Added story 709
Added story 710
On 107966193 Found 2
Found Nothing on 107966193
On 162788409 Found 202
Added story 712
On 116714131 Found 2
Found Nothing on 116714131
On 184763691 Found 192
On 155050258 Found 106
On 154997595 Found 107
Added story 714
Added story 715
Added story 716
On 198876636 Found 107
On 194405434 Found 107
Added story 717
Added story 718
On 236941972 Found 201
Added story 719
On 155880285 Found 201
On 155885454 Found 200
Added story 720
On 155025449 Found 106
Added story 721
On 183357171 Found 192
Added story 722
Added story 723
On 218990868 Found 202
On 2190137

On 225539616 Found 192
Added story 909
On 185182735 Found 192
Added story 910
On 171835339 Found 200
On 194538842 Found 191
On 195681186 Found 191
On 155876022 Found 201
Added story 911
Added story 912
Added story 913
Added story 914
On 189159202 Found 192
Added story 915
On 189771308 Found 191
Added story 916
On 178180424 Found 38
Added story 917
On 170534054 Found 191
On 200626268 Found 192
Added story 918
On 192371218 Found 201
Added story 919
Added story 920
On 171837389 Found 200
Added story 921
On 186406236 Found 191
On 176511071 Found 5
Added story 922
Added story 923
On 168975273 Found 191
On 155032082 Found 108
On 190358199 Found 192
Added story 924
On 189159207 Found 191
Added story 925
Added story 926
Added story 927
On 173827241 Found 39
Added story 928
On 185271855 Found 107
On 155885898 Found 202
Added story 929
Added story 930
On 155925012 Found 201
Added story 931
On 155877161 Found 201
Added story 932
On 188312248 Found 191
Added story 933
On 155017546 Found 107
On 155

Added story 1114
On 155015768 Found 108
Added story 1115
On 192110768 Found 201
Added story 1116
On 161466972 Found 42
On 219002266 Found 201
Added story 1117
Finished
Added story 1118
On 598286513 Found 42
Added story 1119
On 155889845 Found 201
Added story 1120
Finished
On 184319387 Found 192
Added story 1121
Finished
1121 189


In [124]:
# Number of ids recorded in `scraped` so far (this counts ids stored with a
# None value because nothing was found on their page).
len(scraped)

295

In [125]:
# Number of ids still queued for the next crawl round.
len(seeds_to_visit)

9

In [123]:
# Checkpoint the crawl state to disk: first the pending frontier, then the
# scraped results, each in its own pickle file.
for pkl_path, payload in (
    ('seeds_to_visit.pkl', seeds_to_visit),
    ('wattpad_scraped.pkl', scraped),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)