# Data Acquisition: Cancer Digital Slide Archive

This notebook illustrates the steps used to retrieve data from http://cancer.digitalslidearchive.net/. This process was mostly automated, and primarily relied on web scraping methods.

In [6]:
from bs4 import BeautifulSoup as BS
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
import urllib.request
import re
import joblib
num_cores = 8

ModuleNotFoundError: No module named 'bs4'

In [7]:
!sudo pip install BeautifulSoup4

[sudo] password for raj: 


Initialize empty dataframe:

In [3]:
pdf = pd.DataFrame(columns = ['url', 'collection'])

Read text files from directory. These text files were created by manually browsing each collection and copying the data into the respective .txt files.

In [4]:
mypath = '/Users/aadi/Google Drive/School/MS Data Analytics/Master\'s Project/tcga_scraping'
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [5]:
onlyfiles

['acc.txt',
 'blca.txt',
 'brca.txt',
 'cesc.txt',
 'chol.txt',
 'coad.txt',
 'dlbc.txt',
 'esca.txt',
 'gbm.txt',
 'hnsc.txt',
 'kich.txt',
 'kirp.txt',
 'lgg.txt',
 'lihc.txt',
 'luad.txt',
 'lusc.txt',
 'meso.txt',
 'ov.txt',
 'paad.txt',
 'pcpg.txt',
 'prad.txt',
 'read.txt',
 'sarc.txt',
 'skcm.txt',
 'stad.txt',
 'tgct.txt',
 'thca.txt',
 'thym.txt',
 'ucec.txt',
 'ucs.txt',
 'uvm.txt']

Retrieve img tags from the html documents. These tags correspond to thumbnails displayed for each slide:

In [8]:
for file in onlyfiles:
    print(file)
    with open(mypath+'/'+file, 'r') as myfile:
        data=myfile.read().replace('\n', '')

    soup = BS(data, "lxml")

    for imgtag in soup.find_all('img'):
        pdf = pdf.append({"url": imgtag, "collection":file}, ignore_index=True)
pdf_bk = pdf

acc.txt
blca.txt
brca.txt
cesc.txt
chol.txt
coad.txt
dlbc.txt
esca.txt
gbm.txt
hnsc.txt
kich.txt
kirp.txt
lgg.txt
lihc.txt
luad.txt
lusc.txt
meso.txt
ov.txt
paad.txt
pcpg.txt
prad.txt
read.txt
sarc.txt
skcm.txt
stad.txt
tgct.txt
thca.txt
thym.txt
ucec.txt
ucs.txt
uvm.txt


Data Cleaning: Convert BS4 objects to strings, and change thumbnail width from 200 to 500:

In [9]:
pdf['collection'] = pdf['collection'].astype(str)
pdf['collection'] = pdf['collection'].str.replace('.txt', '')

In [10]:
pdf['url'] = pdf['url'].astype(str)
pdf = pdf[pdf['url'].str.contains("emory")]
pdf['url'] = pdf['url'].str.replace('WID=200', 'WID=500')

Unnamed: 0,url,collection
0,"<img src=""local_images/CDSA_Slide_50.png"" styl...",acc
1,"<img height=""85px"" src=""imgs/Winship_06-2011/W...",acc
2,"<img class=""dhx_combo_img"" src=""dsa-common-fil...",acc
3,"<img src=""http://node15.cci.emory.edu/cgi-bin/...",acc
4,"<img src=""http://node15.cci.emory.edu/cgi-bin/...",acc
5,"<img src=""http://node15.cci.emory.edu/cgi-bin/...",acc
6,"<img src=""http://node15.cci.emory.edu/cgi-bin/...",acc
7,"<img src=""http://node15.cci.emory.edu/cgi-bin/...",acc
8,"<img src=""http://node15.cci.emory.edu/cgi-bin/...",acc
9,"<img src=""http://node15.cci.emory.edu/cgi-bin/...",acc


Gather urls from between quotation marks, and construct slide names:

In [13]:
pdf['url'] = pdf['url'].str.extract('"([^"]*)"')
pdf_bk = pdf
pdf['url'] = pdf['url'].str.replace('&amp;', '&')
pdf['name'] = 'TCGA'+pdf['url'].str.extract('TCGA([^\.]*)\.')

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


'http://node15.cci.emory.edu/cgi-bin/iipsrv.fcgi?FIF=/bigdata2/PYRAMIDS/CDSA/BRCA_Frozen/nationwidechildrens.org_BRCA.tissue_images.Level_1.147.1.0/TCGA-D8-A27N-01A-01-TS1.f4371b61-6f8f-48c6-9493-d9c96402634f.svs.dzi.tif&amp;WID=500&amp;CVT=jpeg'

#### Download Thumbnails:

In [22]:
folder_names = pdf['collection'].unique()
img_path = '/Users/aadi/Documents/tcga_imgs/thumbnails/'

def process_download(url_idx, df_slice, category):
    img_filename = img_path + category + '/' + df_slice['name'].iloc[url_idx] + ".jpg"
    img_url = df_slice.iloc[url_idx, 0]
    try:
        urllib.request.urlretrieve(img_url, img_filename)
    except:
        pass
    print(category+': '+str(url_idx+1)+'/'+str(len(df_slice['url'])+1))
    
for category in folder_names[13:]:
    category_dir = img_path+category
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)
    df_slice = pdf[pdf['collection'].str.contains(category)]
    joblib.Parallel(n_jobs=num_cores)(joblib.delayed(process_download)(i, df_slice, category) for i, image in enumerate(df_slice['url']))


lihc: 4/1028
lihc: 6/1028
lihc: 2/1028
lihc: 1/1028
lihc: 8/1028
lihc: 3/1028
lihc: 10/1028
lihc: 12/1028
lihc: 5/1028
lihc: 7/1028
lihc: 11/1028
lihc: 9/1028
lihc: 14/1028
lihc: 16/1028
lihc: 15/1028
lihc: 25/1028
lihc: 13/1028
lihc: 23/1028
lihc: 27/1028
lihc: 17/1028
lihc: 29/1028
lihc: 31/1028
lihc: 19/1028
lihc: 21/1028
lihc: 18/1028
lihc: 26/1028
lihc: 24/1028
lihc: 28/1028
lihc: 20/1028
lihc: 33/1028
lihc: 30/1028
lihc: 35/1028
lihc: 32/1028
lihc: 37/1028
lihc: 22/1028
lihc: 39/1028
lihc: 45/1028
lihc: 40/1028
lihc: 43/1028
lihc: 34/1028
lihc: 38/1028
lihc: 41/1028
lihc: 36/1028
lihc: 46/1028
lihc: 47/1028
lihc: 42/1028
lihc: 44/1028
lihc: 49/1028
lihc: 48/1028
lihc: 51/1028
lihc: 55/1028
lihc: 53/1028
lihc: 50/1028
lihc: 57/1028
lihc: 61/1028
lihc: 52/1028
lihc: 59/1028
lihc: 63/1028
lihc: 65/1028
lihc: 54/1028
lihc: 56/1028
lihc: 67/1028
lihc: 66/1028
lihc: 62/1028
lihc: 58/1028
lihc: 60/1028
lihc: 64/1028
lihc: 71/1028
lihc: 73/1028
lihc: 77/1028
lihc: 79/1028
lihc: 69/1028
l

lihc: 556/1028
lihc: 557/1028
lihc: 554/1028
lihc: 559/1028
lihc: 558/1028
lihc: 561/1028
lihc: 560/1028
lihc: 563/1028
lihc: 562/1028
lihc: 566/1028
lihc: 567/1028
lihc: 564/1028
lihc: 569/1028
lihc: 571/1028
lihc: 565/1028
lihc: 574/1028
lihc: 576/1028
lihc: 577/1028
lihc: 572/1028
lihc: 578/1028
lihc: 570/1028
lihc: 573/1028
lihc: 575/1028
lihc: 568/1028
lihc: 584/1028
lihc: 585/1028
lihc: 580/1028
lihc: 579/1028
lihc: 583/1028
lihc: 587/1028
lihc: 589/1028
lihc: 591/1028
lihc: 588/1028
lihc: 592/1028
lihc: 593/1028
lihc: 594/1028
lihc: 590/1028
lihc: 582/1028
lihc: 599/1028
lihc: 586/1028
lihc: 596/1028
lihc: 595/1028
lihc: 581/1028
lihc: 601/1028
lihc: 600/1028
lihc: 603/1028
lihc: 602/1028
lihc: 605/1028
lihc: 604/1028
lihc: 607/1028
lihc: 609/1028
lihc: 606/1028
lihc: 608/1028
lihc: 611/1028
lihc: 598/1028
lihc: 610/1028
lihc: 617/1028
lihc: 612/1028
lihc: 614/1028
lihc: 615/1028
lihc: 619/1028
lihc: 618/1028
lihc: 621/1028
lihc: 620/1028
lihc: 613/1028
lihc: 623/1028
lihc: 616/

luad: 80/1447
luad: 78/1447
luad: 82/1447
luad: 86/1447
luad: 81/1447
luad: 83/1447
luad: 84/1447
luad: 85/1447
luad: 88/1447
luad: 87/1447
luad: 92/1447
luad: 90/1447
luad: 89/1447
luad: 91/1447
luad: 98/1447
luad: 93/1447
luad: 94/1447
luad: 95/1447
luad: 96/1447
luad: 97/1447
luad: 100/1447
luad: 99/1447
luad: 102/1447
luad: 101/1447
luad: 103/1447
luad: 107/1447
luad: 106/1447
luad: 105/1447
luad: 104/1447
luad: 108/1447
luad: 114/1447
luad: 111/1447
luad: 112/1447
luad: 113/1447
luad: 109/1447
luad: 110/1447
luad: 115/1447
luad: 119/1447
luad: 116/1447
luad: 123/1447
luad: 124/1447
luad: 120/1447
luad: 118/1447
luad: 122/1447
luad: 117/1447
luad: 126/1447
luad: 128/1447
luad: 125/1447
luad: 121/1447
luad: 130/1447
luad: 131/1447
luad: 132/1447
luad: 133/1447
luad: 129/1447
luad: 127/1447
luad: 136/1447
luad: 134/1447
luad: 135/1447
luad: 141/1447
luad: 137/1447
luad: 138/1447
luad: 143/1447
luad: 142/1447
luad: 139/1447
luad: 144/1447
luad: 140/1447
luad: 148/1447
luad: 147/1447
l

luad: 627/1447
luad: 629/1447
luad: 630/1447
luad: 628/1447
luad: 635/1447
luad: 631/1447
luad: 632/1447
luad: 633/1447
luad: 634/1447
luad: 636/1447
luad: 639/1447
luad: 638/1447
luad: 637/1447
luad: 640/1447
luad: 642/1447
luad: 643/1447
luad: 644/1447
luad: 641/1447
luad: 646/1447
luad: 645/1447
luad: 647/1447
luad: 648/1447
luad: 650/1447
luad: 651/1447
luad: 654/1447
luad: 652/1447
luad: 649/1447
luad: 655/1447
luad: 653/1447
luad: 658/1447
luad: 656/1447
luad: 660/1447
luad: 659/1447
luad: 662/1447
luad: 657/1447
luad: 663/1447
luad: 661/1447
luad: 664/1447
luad: 665/1447
luad: 666/1447
luad: 669/1447
luad: 670/1447
luad: 672/1447
luad: 667/1447
luad: 676/1447
luad: 671/1447
luad: 668/1447
luad: 673/1447
luad: 677/1447
luad: 675/1447
luad: 674/1447
luad: 679/1447
luad: 678/1447
luad: 680/1447
luad: 681/1447
luad: 684/1447
luad: 682/1447
luad: 683/1447
luad: 687/1447
luad: 688/1447
luad: 685/1447
luad: 689/1447
luad: 686/1447
luad: 690/1447
luad: 692/1447
luad: 691/1447
luad: 693/

luad: 1163/1447
luad: 1165/1447
luad: 1164/1447
luad: 1170/1447
luad: 1169/1447
luad: 1167/1447
luad: 1166/1447
luad: 1168/1447
luad: 1171/1447
luad: 1172/1447
luad: 1173/1447
luad: 1175/1447
luad: 1174/1447
luad: 1180/1447
luad: 1177/1447
luad: 1178/1447
luad: 1176/1447
luad: 1181/1447
luad: 1179/1447
luad: 1183/1447
luad: 1186/1447
luad: 1182/1447
luad: 1185/1447
luad: 1184/1447
luad: 1188/1447
luad: 1187/1447
luad: 1189/1447
luad: 1190/1447
luad: 1191/1447
luad: 1192/1447
luad: 1194/1447
luad: 1193/1447
luad: 1195/1447
luad: 1196/1447
luad: 1197/1447
luad: 1198/1447
luad: 1201/1447
luad: 1200/1447
luad: 1199/1447
luad: 1202/1447
luad: 1203/1447
luad: 1204/1447
luad: 1205/1447
luad: 1206/1447
luad: 1209/1447
luad: 1207/1447
luad: 1211/1447
luad: 1208/1447
luad: 1212/1447
luad: 1210/1447
luad: 1214/1447
luad: 1216/1447
luad: 1213/1447
luad: 1217/1447
luad: 1215/1447
luad: 1219/1447
luad: 1220/1447
luad: 1218/1447
luad: 1222/1447
luad: 1221/1447
luad: 1223/1447
luad: 1224/1447
luad: 12

lusc: 249/1500
lusc: 255/1500
lusc: 254/1500
lusc: 256/1500
lusc: 253/1500
lusc: 257/1500
lusc: 259/1500
lusc: 258/1500
lusc: 261/1500
lusc: 260/1500
lusc: 262/1500
lusc: 265/1500
lusc: 263/1500
lusc: 268/1500
lusc: 264/1500
lusc: 266/1500
lusc: 267/1500
lusc: 270/1500
lusc: 269/1500
lusc: 272/1500
lusc: 271/1500
lusc: 273/1500
lusc: 274/1500
lusc: 276/1500
lusc: 277/1500
lusc: 275/1500
lusc: 278/1500
lusc: 279/1500
lusc: 280/1500
lusc: 281/1500
lusc: 283/1500
lusc: 282/1500
lusc: 284/1500
lusc: 289/1500
lusc: 287/1500
lusc: 285/1500
lusc: 288/1500
lusc: 290/1500
lusc: 286/1500
lusc: 291/1500
lusc: 295/1500
lusc: 292/1500
lusc: 293/1500
lusc: 297/1500
lusc: 296/1500
lusc: 294/1500
lusc: 300/1500
lusc: 299/1500
lusc: 302/1500
lusc: 301/1500
lusc: 298/1500
lusc: 307/1500
lusc: 308/1500
lusc: 305/1500
lusc: 303/1500
lusc: 311/1500
lusc: 306/1500
lusc: 304/1500
lusc: 310/1500
lusc: 313/1500
lusc: 314/1500
lusc: 312/1500
lusc: 309/1500
lusc: 317/1500
lusc: 316/1500
lusc: 318/1500
lusc: 321/

lusc: 799/1500
lusc: 800/1500
lusc: 801/1500
lusc: 807/1500
lusc: 804/1500
lusc: 806/1500
lusc: 805/1500
lusc: 802/1500
lusc: 803/1500
lusc: 809/1500
lusc: 808/1500
lusc: 811/1500
lusc: 810/1500
lusc: 814/1500
lusc: 812/1500
lusc: 818/1500
lusc: 813/1500
lusc: 815/1500
lusc: 817/1500
lusc: 816/1500
lusc: 821/1500
lusc: 819/1500
lusc: 820/1500
lusc: 822/1500
lusc: 823/1500
lusc: 824/1500
lusc: 825/1500
lusc: 829/1500
lusc: 826/1500
lusc: 832/1500
lusc: 827/1500
lusc: 828/1500
lusc: 833/1500
lusc: 830/1500
lusc: 831/1500
lusc: 835/1500
lusc: 836/1500
lusc: 840/1500
lusc: 834/1500
lusc: 838/1500
lusc: 839/1500
lusc: 837/1500
lusc: 843/1500
lusc: 841/1500
lusc: 842/1500
lusc: 846/1500
lusc: 844/1500
lusc: 848/1500
lusc: 845/1500
lusc: 847/1500
lusc: 850/1500
lusc: 849/1500
lusc: 851/1500
lusc: 852/1500
lusc: 853/1500
lusc: 857/1500
lusc: 859/1500
lusc: 855/1500
lusc: 854/1500
lusc: 858/1500
lusc: 860/1500
lusc: 862/1500
lusc: 864/1500
lusc: 861/1500
lusc: 863/1500
lusc: 868/1500
lusc: 856/

lusc: 1321/1500
lusc: 1327/1500
lusc: 1325/1500
lusc: 1331/1500
lusc: 1328/1500
lusc: 1334/1500
lusc: 1332/1500
lusc: 1329/1500
lusc: 1330/1500
lusc: 1336/1500
lusc: 1326/1500
lusc: 1333/1500
lusc: 1337/1500
lusc: 1342/1500
lusc: 1335/1500
lusc: 1338/1500
lusc: 1346/1500
lusc: 1339/1500
lusc: 1344/1500
lusc: 1340/1500
lusc: 1350/1500
lusc: 1345/1500
lusc: 1351/1500
lusc: 1354/1500
lusc: 1349/1500
lusc: 1352/1500
lusc: 1353/1500
lusc: 1356/1500
lusc: 1358/1500
lusc: 1341/1500
lusc: 1348/1500
lusc: 1355/1500
lusc: 1357/1500
lusc: 1359/1500
lusc: 1360/1500
lusc: 1363/1500
lusc: 1361/1500
lusc: 1362/1500
lusc: 1364/1500
lusc: 1343/1500
lusc: 1366/1500
lusc: 1370/1500
lusc: 1371/1500
lusc: 1365/1500
lusc: 1369/1500
lusc: 1367/1500
lusc: 1372/1500
lusc: 1368/1500
lusc: 1376/1500
lusc: 1375/1500
lusc: 1374/1500
lusc: 1382/1500
lusc: 1373/1500
lusc: 1378/1500
lusc: 1347/1500
lusc: 1379/1500
lusc: 1377/1500
lusc: 1385/1500
lusc: 1383/1500
lusc: 1381/1500
lusc: 1380/1500
lusc: 1384/1500
lusc: 13

ov: 236/1462
ov: 240/1462
ov: 237/1462
ov: 243/1462
ov: 238/1462
ov: 241/1462
ov: 239/1462
ov: 242/1462
ov: 245/1462
ov: 244/1462
ov: 247/1462
ov: 249/1462
ov: 246/1462
ov: 251/1462
ov: 250/1462
ov: 248/1462
ov: 254/1462
ov: 256/1462
ov: 255/1462
ov: 252/1462
ov: 253/1462
ov: 259/1462
ov: 260/1462
ov: 258/1462
ov: 263/1462
ov: 261/1462
ov: 257/1462
ov: 265/1462
ov: 268/1462
ov: 262/1462
ov: 266/1462
ov: 264/1462
ov: 272/1462
ov: 267/1462
ov: 273/1462
ov: 269/1462
ov: 270/1462
ov: 271/1462
ov: 275/1462
ov: 274/1462
ov: 276/1462
ov: 277/1462
ov: 278/1462
ov: 281/1462
ov: 282/1462
ov: 279/1462
ov: 284/1462
ov: 285/1462
ov: 280/1462
ov: 287/1462
ov: 286/1462
ov: 283/1462
ov: 290/1462
ov: 292/1462
ov: 289/1462
ov: 291/1462
ov: 294/1462
ov: 288/1462
ov: 293/1462
ov: 296/1462
ov: 295/1462
ov: 298/1462
ov: 300/1462
ov: 297/1462
ov: 301/1462
ov: 302/1462
ov: 299/1462
ov: 305/1462
ov: 303/1462
ov: 304/1462
ov: 307/1462
ov: 310/1462
ov: 306/1462
ov: 311/1462
ov: 308/1462
ov: 309/1462
ov: 314/1462

ov: 868/1462
ov: 865/1462
ov: 867/1462
ov: 870/1462
ov: 871/1462
ov: 869/1462
ov: 873/1462
ov: 872/1462
ov: 877/1462
ov: 875/1462
ov: 878/1462
ov: 883/1462
ov: 874/1462
ov: 876/1462
ov: 879/1462
ov: 881/1462
ov: 884/1462
ov: 882/1462
ov: 880/1462
ov: 889/1462
ov: 888/1462
ov: 887/1462
ov: 885/1462
ov: 886/1462
ov: 891/1462
ov: 890/1462
ov: 894/1462
ov: 893/1462
ov: 897/1462
ov: 895/1462
ov: 896/1462
ov: 892/1462
ov: 901/1462
ov: 900/1462
ov: 904/1462
ov: 899/1462
ov: 905/1462
ov: 902/1462
ov: 898/1462
ov: 903/1462
ov: 907/1462
ov: 908/1462
ov: 909/1462
ov: 906/1462
ov: 910/1462
ov: 912/1462
ov: 911/1462
ov: 917/1462
ov: 913/1462
ov: 915/1462
ov: 914/1462
ov: 918/1462
ov: 916/1462
ov: 919/1462
ov: 924/1462
ov: 920/1462
ov: 921/1462
ov: 926/1462
ov: 922/1462
ov: 927/1462
ov: 923/1462
ov: 928/1462
ov: 929/1462
ov: 931/1462
ov: 930/1462
ov: 934/1462
ov: 932/1462
ov: 936/1462
ov: 925/1462
ov: 938/1462
ov: 937/1462
ov: 935/1462
ov: 940/1462
ov: 933/1462
ov: 945/1462
ov: 939/1462
ov: 944/1462

ov: 1456/1462
paad: 3/217
paad: 6/217
paad: 2/217
paad: 4/217
paad: 9/217
paad: 8/217
paad: 11/217
paad: 1/217
paad: 13/217
paad: 10/217
paad: 5/217
paad: 14/217
paad: 12/217
paad: 15/217
paad: 17/217
paad: 7/217
paad: 20/217
paad: 19/217
paad: 18/217
paad: 22/217
paad: 23/217
paad: 21/217
paad: 26/217
paad: 25/217
paad: 27/217
paad: 24/217
paad: 29/217
paad: 30/217
paad: 28/217
paad: 33/217
paad: 32/217
paad: 31/217
paad: 36/217
paad: 37/217
paad: 39/217
paad: 34/217
paad: 38/217
paad: 35/217
paad: 42/217
paad: 41/217
paad: 40/217
paad: 44/217
paad: 43/217
paad: 48/217
paad: 47/217
paad: 46/217
paad: 45/217
paad: 52/217
paad: 50/217
paad: 55/217
paad: 53/217
paad: 49/217
paad: 51/217
paad: 59/217
paad: 54/217
paad: 57/217
paad: 56/217
paad: 64/217
paad: 63/217
paad: 58/217
paad: 60/217
paad: 62/217
paad: 69/217
paad: 61/217
paad: 66/217
paad: 65/217
paad: 67/217
paad: 71/217
paad: 68/217
paad: 70/217
paad: 72/217
paad: 74/217
paad: 73/217
paad: 75/217
paad: 77/217
paad: 78/217
paad: 7

pcpg: 384/392
pcpg: 385/392
pcpg: 383/392
pcpg: 388/392
pcpg: 391/392
pcpg: 390/392
pcpg: 348/392
prad: 1/1263
prad: 3/1263
prad: 7/1263
prad: 5/1263
prad: 6/1263
prad: 9/1263
prad: 4/1263
prad: 11/1263
prad: 2/1263
prad: 13/1263
prad: 15/1263
prad: 8/1263
prad: 17/1263
prad: 10/1263
prad: 19/1263
prad: 14/1263
prad: 21/1263
prad: 12/1263
prad: 16/1263
prad: 27/1263
prad: 18/1263
prad: 25/1263
prad: 23/1263
prad: 22/1263
prad: 31/1263
prad: 20/1263
prad: 29/1263
prad: 28/1263
prad: 24/1263
prad: 33/1263
prad: 30/1263
prad: 39/1263
prad: 41/1263
prad: 35/1263
prad: 37/1263
prad: 26/1263
prad: 34/1263
prad: 32/1263
prad: 43/1263
prad: 47/1263
prad: 40/1263
prad: 42/1263
prad: 45/1263
prad: 49/1263
prad: 53/1263
prad: 50/1263
prad: 48/1263
prad: 51/1263
prad: 44/1263
prad: 57/1263
prad: 36/1263
prad: 59/1263
prad: 54/1263
prad: 55/1263
prad: 58/1263
prad: 52/1263
prad: 56/1263
prad: 65/1263
prad: 61/1263
prad: 60/1263
prad: 62/1263
prad: 63/1263
prad: 66/1263
prad: 67/1263
prad: 73/1263
p

prad: 553/1263
prad: 550/1263
prad: 547/1263
prad: 549/1263
prad: 551/1263
prad: 556/1263
prad: 552/1263
prad: 554/1263
prad: 557/1263
prad: 555/1263
prad: 562/1263
prad: 560/1263
prad: 561/1263
prad: 558/1263
prad: 563/1263
prad: 559/1263
prad: 564/1263
prad: 568/1263
prad: 569/1263
prad: 565/1263
prad: 567/1263
prad: 571/1263
prad: 572/1263
prad: 570/1263
prad: 576/1263
prad: 575/1263
prad: 566/1263
prad: 573/1263
prad: 574/1263
prad: 577/1263
prad: 581/1263
prad: 584/1263
prad: 580/1263
prad: 579/1263
prad: 587/1263
prad: 590/1263
prad: 585/1263
prad: 583/1263
prad: 586/1263
prad: 588/1263
prad: 589/1263
prad: 593/1263
prad: 578/1263
prad: 592/1263
prad: 596/1263
prad: 591/1263
prad: 594/1263
prad: 599/1263
prad: 597/1263
prad: 598/1263
prad: 602/1263
prad: 603/1263
prad: 601/1263
prad: 605/1263
prad: 604/1263
prad: 600/1263
prad: 606/1263
prad: 607/1263
prad: 610/1263
prad: 609/1263
prad: 608/1263
prad: 612/1263
prad: 611/1263
prad: 595/1263
prad: 615/1263
prad: 613/1263
prad: 617/

prad: 1089/1263
prad: 1092/1263
prad: 1087/1263
prad: 1093/1263
prad: 1091/1263
prad: 1094/1263
prad: 1098/1263
prad: 1096/1263
prad: 1095/1263
prad: 1103/1263
prad: 1097/1263
prad: 1102/1263
prad: 1101/1263
prad: 1104/1263
prad: 1105/1263
prad: 1100/1263
prad: 1108/1263
prad: 1111/1263
prad: 1107/1263
prad: 1110/1263
prad: 1109/1263
prad: 1106/1263
prad: 1113/1263
prad: 1114/1263
prad: 1116/1263
prad: 1099/1263
prad: 1120/1263
prad: 1119/1263
prad: 1115/1263
prad: 1117/1263
prad: 1123/1263
prad: 1124/1263
prad: 1121/1263
prad: 1122/1263
prad: 1129/1263
prad: 1127/1263
prad: 1128/1263
prad: 1130/1263
prad: 1118/1263
prad: 1125/1263
prad: 1134/1263
prad: 1133/1263
prad: 1131/1263
prad: 1138/1263
prad: 1137/1263
prad: 1132/1263
prad: 1139/1263
prad: 1143/1263
prad: 1142/1263
prad: 1140/1263
prad: 1141/1263
prad: 1144/1263
prad: 1145/1263
prad: 1146/1263
prad: 1148/1263
prad: 1150/1263
prad: 1135/1263
prad: 1112/1263
prad: 1152/1263
prad: 1147/1263
prad: 1151/1263
prad: 1155/1263
prad: 11

read: 402/546
read: 397/546
read: 401/546
read: 399/546
read: 400/546
read: 404/546
read: 403/546
read: 358/546
read: 406/546
read: 405/546
read: 411/546
read: 408/546
read: 407/546
read: 412/546
read: 413/546
read: 414/546
read: 416/546
read: 409/546
read: 410/546
read: 415/546
read: 420/546
read: 417/546
read: 422/546
read: 421/546
read: 418/546
read: 423/546
read: 425/546
read: 428/546
read: 419/546
read: 431/546
read: 427/546
read: 426/546
read: 429/546
read: 435/546
read: 432/546
read: 430/546
read: 433/546
read: 434/546
read: 436/546
read: 438/546
read: 437/546
read: 439/546
read: 440/546
read: 441/546
read: 442/546
read: 444/546
read: 443/546
read: 447/546
read: 446/546
read: 445/546
read: 449/546
read: 451/546
read: 448/546
read: 450/546
read: 424/546
read: 452/546
read: 453/546
read: 454/546
read: 457/546
read: 458/546
read: 455/546
read: 459/546
read: 460/546
read: 461/546
read: 456/546
read: 463/546
read: 464/546
read: 462/546
read: 466/546
read: 467/546
read: 468/546
read: 

sarc: 450/891
sarc: 446/891
sarc: 447/891
sarc: 448/891
sarc: 453/891
sarc: 452/891
sarc: 451/891
sarc: 449/891
sarc: 455/891
sarc: 454/891
sarc: 459/891
sarc: 457/891
sarc: 456/891
sarc: 458/891
sarc: 461/891
sarc: 460/891
sarc: 466/891
sarc: 463/891
sarc: 465/891
sarc: 462/891
sarc: 470/891
sarc: 468/891
sarc: 467/891
sarc: 472/891
sarc: 464/891
sarc: 469/891
sarc: 471/891
sarc: 473/891
sarc: 478/891
sarc: 474/891
sarc: 477/891
sarc: 479/891
sarc: 475/891
sarc: 482/891
sarc: 476/891
sarc: 483/891
sarc: 383/891
sarc: 480/891
sarc: 484/891
sarc: 487/891
sarc: 486/891
sarc: 481/891
sarc: 490/891
sarc: 493/891
sarc: 489/891
sarc: 485/891
sarc: 492/891
sarc: 491/891
sarc: 494/891
sarc: 498/891
sarc: 495/891
sarc: 488/891
sarc: 500/891
sarc: 503/891
sarc: 501/891
sarc: 502/891
sarc: 499/891
sarc: 497/891
sarc: 507/891
sarc: 506/891
sarc: 505/891
sarc: 511/891
sarc: 508/891
sarc: 510/891
sarc: 514/891
sarc: 513/891
sarc: 512/891
sarc: 515/891
sarc: 504/891
sarc: 516/891
sarc: 520/891
sarc: 

skcm: 151/980
skcm: 153/980
skcm: 150/980
skcm: 155/980
skcm: 156/980
skcm: 152/980
skcm: 147/980
skcm: 157/980
skcm: 161/980
skcm: 159/980
skcm: 154/980
skcm: 158/980
skcm: 164/980
skcm: 163/980
skcm: 160/980
skcm: 167/980
skcm: 162/980
skcm: 165/980
skcm: 170/980
skcm: 169/980
skcm: 166/980
skcm: 171/980
skcm: 172/980
skcm: 175/980
skcm: 177/980
skcm: 174/980
skcm: 178/980
skcm: 168/980
skcm: 176/980
skcm: 179/980
skcm: 183/980
skcm: 180/980
skcm: 181/980
skcm: 185/980
skcm: 186/980
skcm: 187/980
skcm: 184/980
skcm: 182/980
skcm: 191/980
skcm: 188/980
skcm: 192/980
skcm: 189/980
skcm: 190/980
skcm: 194/980
skcm: 195/980
skcm: 193/980
skcm: 173/980
skcm: 201/980
skcm: 199/980
skcm: 202/980
skcm: 200/980
skcm: 196/980
skcm: 198/980
skcm: 197/980
skcm: 204/980
skcm: 203/980
skcm: 205/980
skcm: 209/980
skcm: 206/980
skcm: 210/980
skcm: 211/980
skcm: 207/980
skcm: 208/980
skcm: 214/980
skcm: 213/980
skcm: 212/980
skcm: 215/980
skcm: 217/980
skcm: 218/980
skcm: 216/980
skcm: 221/980
skcm: 

skcm: 737/980
skcm: 736/980
skcm: 733/980
skcm: 739/980
skcm: 738/980
skcm: 742/980
skcm: 741/980
skcm: 743/980
skcm: 746/980
skcm: 744/980
skcm: 740/980
skcm: 745/980
skcm: 748/980
skcm: 747/980
skcm: 752/980
skcm: 753/980
skcm: 750/980
skcm: 751/980
skcm: 749/980
skcm: 754/980
skcm: 758/980
skcm: 755/980
skcm: 756/980
skcm: 757/980
skcm: 763/980
skcm: 759/980
skcm: 760/980
skcm: 761/980
skcm: 765/980
skcm: 762/980
skcm: 764/980
skcm: 769/980
skcm: 767/980
skcm: 766/980
skcm: 768/980
skcm: 770/980
skcm: 771/980
skcm: 775/980
skcm: 772/980
skcm: 774/980
skcm: 777/980
skcm: 773/980
skcm: 778/980
skcm: 776/980
skcm: 780/980
skcm: 779/980
skcm: 783/980
skcm: 781/980
skcm: 787/980
skcm: 786/980
skcm: 789/980
skcm: 790/980
skcm: 782/980
skcm: 791/980
skcm: 794/980
skcm: 785/980
skcm: 788/980
skcm: 793/980
skcm: 792/980
skcm: 798/980
skcm: 799/980
skcm: 784/980
skcm: 800/980
skcm: 802/980
skcm: 796/980
skcm: 803/980
skcm: 797/980
skcm: 801/980
skcm: 805/980
skcm: 795/980
skcm: 806/980
skcm: 

stad: 325/1092
stad: 327/1092
stad: 329/1092
stad: 291/1092
stad: 328/1092
stad: 331/1092
stad: 332/1092
stad: 330/1092
stad: 333/1092
stad: 336/1092
stad: 334/1092
stad: 337/1092
stad: 297/1092
stad: 335/1092
stad: 340/1092
stad: 339/1092
stad: 341/1092
stad: 338/1092
stad: 342/1092
stad: 343/1092
stad: 346/1092
stad: 349/1092
stad: 347/1092
stad: 344/1092
stad: 345/1092
stad: 350/1092
stad: 351/1092
stad: 352/1092
stad: 354/1092
stad: 355/1092
stad: 348/1092
stad: 353/1092
stad: 359/1092
stad: 356/1092
stad: 363/1092
stad: 364/1092
stad: 358/1092
stad: 357/1092
stad: 360/1092
stad: 361/1092
stad: 362/1092
stad: 365/1092
stad: 366/1092
stad: 370/1092
stad: 372/1092
stad: 374/1092
stad: 367/1092
stad: 371/1092
stad: 369/1092
stad: 368/1092
stad: 373/1092
stad: 377/1092
stad: 376/1092
stad: 378/1092
stad: 375/1092
stad: 380/1092
stad: 379/1092
stad: 382/1092
stad: 381/1092
stad: 383/1092
stad: 385/1092
stad: 387/1092
stad: 386/1092
stad: 389/1092
stad: 384/1092
stad: 390/1092
stad: 391/

stad: 875/1092
stad: 874/1092
stad: 870/1092
stad: 878/1092
stad: 880/1092
stad: 879/1092
stad: 876/1092
stad: 881/1092
stad: 873/1092
stad: 885/1092
stad: 886/1092
stad: 883/1092
stad: 882/1092
stad: 888/1092
stad: 889/1092
stad: 887/1092
stad: 884/1092
stad: 890/1092
stad: 892/1092
stad: 893/1092
stad: 891/1092
stad: 896/1092
stad: 894/1092
stad: 895/1092
stad: 900/1092
stad: 899/1092
stad: 872/1092
stad: 897/1092
stad: 898/1092
stad: 901/1092
stad: 903/1092
stad: 902/1092
stad: 905/1092
stad: 904/1092
stad: 907/1092
stad: 912/1092
stad: 911/1092
stad: 906/1092
stad: 914/1092
stad: 910/1092
stad: 913/1092
stad: 908/1092
stad: 916/1092
stad: 915/1092
stad: 918/1092
stad: 920/1092
stad: 919/1092
stad: 917/1092
stad: 921/1092
stad: 924/1092
stad: 925/1092
stad: 923/1092
stad: 928/1092
stad: 929/1092
stad: 926/1092
stad: 927/1092
stad: 922/1092
stad: 931/1092
stad: 933/1092
stad: 932/1092
stad: 877/1092
stad: 930/1092
stad: 935/1092
stad: 934/1092
stad: 938/1092
stad: 936/1092
stad: 939/

thca: 195/1244
thca: 190/1244
thca: 191/1244
thca: 194/1244
thca: 168/1244
thca: 192/1244
thca: 196/1244
thca: 198/1244
thca: 203/1244
thca: 200/1244
thca: 201/1244
thca: 199/1244
thca: 204/1244
thca: 202/1244
thca: 197/1244
thca: 205/1244
thca: 208/1244
thca: 207/1244
thca: 213/1244
thca: 149/1244
thca: 206/1244
thca: 212/1244
thca: 209/1244
thca: 215/1244
thca: 218/1244
thca: 211/1244
thca: 216/1244
thca: 210/1244
thca: 221/1244
thca: 222/1244
thca: 217/1244
thca: 219/1244
thca: 220/1244
thca: 224/1244
thca: 214/1244
thca: 227/1244
thca: 223/1244
thca: 229/1244
thca: 225/1244
thca: 232/1244
thca: 230/1244
thca: 231/1244
thca: 226/1244
thca: 236/1244
thca: 228/1244
thca: 234/1244
thca: 240/1244
thca: 238/1244
thca: 235/1244
thca: 237/1244
thca: 233/1244
thca: 243/1244
thca: 239/1244
thca: 242/1244
thca: 241/1244
thca: 245/1244
thca: 247/1244
thca: 248/1244
thca: 249/1244
thca: 244/1244
thca: 252/1244
thca: 251/1244
thca: 250/1244
thca: 253/1244
thca: 256/1244
thca: 257/1244
thca: 254/

thca: 739/1244
thca: 742/1244
thca: 738/1244
thca: 743/1244
thca: 741/1244
thca: 731/1244
thca: 745/1244
thca: 744/1244
thca: 748/1244
thca: 746/1244
thca: 751/1244
thca: 747/1244
thca: 740/1244
thca: 753/1244
thca: 750/1244
thca: 755/1244
thca: 752/1244
thca: 754/1244
thca: 756/1244
thca: 757/1244
thca: 759/1244
thca: 762/1244
thca: 639/1244
thca: 758/1244
thca: 761/1244
thca: 760/1244
thca: 769/1244
thca: 765/1244
thca: 763/1244
thca: 764/1244
thca: 766/1244
thca: 767/1244
thca: 771/1244
thca: 768/1244
thca: 770/1244
thca: 772/1244
thca: 775/1244
thca: 776/1244
thca: 774/1244
thca: 781/1244
thca: 778/1244
thca: 779/1244
thca: 777/1244
thca: 780/1244
thca: 782/1244
thca: 786/1244
thca: 783/1244
thca: 784/1244
thca: 788/1244
thca: 789/1244
thca: 785/1244
thca: 790/1244
thca: 787/1244
thca: 792/1244
thca: 791/1244
thca: 794/1244
thca: 796/1244
thca: 799/1244
thca: 798/1244
thca: 793/1244
thca: 795/1244
thca: 797/1244
thca: 803/1244
thca: 802/1244
thca: 800/1244
thca: 801/1244
thca: 804/

thym: 18/318
thym: 22/318
thym: 28/318
thym: 35/318
thym: 24/318
thym: 30/318
thym: 37/318
thym: 32/318
thym: 43/318
thym: 33/318
thym: 36/318
thym: 39/318
thym: 41/318
thym: 45/318
thym: 47/318
thym: 38/318
thym: 34/318
thym: 40/318
thym: 49/318
thym: 44/318
thym: 42/318
thym: 53/318
thym: 51/318
thym: 55/318
thym: 48/318
thym: 46/318
thym: 56/318
thym: 57/318
thym: 50/318
thym: 54/318
thym: 52/318
thym: 59/318
thym: 63/318
thym: 61/318
thym: 58/318
thym: 65/318
thym: 67/318
thym: 71/318
thym: 69/318
thym: 73/318
thym: 62/318
thym: 60/318
thym: 64/318
thym: 66/318
thym: 74/318
thym: 70/318
thym: 68/318
thym: 72/318
thym: 77/318
thym: 79/318
thym: 75/318
thym: 83/318
thym: 85/318
thym: 87/318
thym: 81/318
thym: 76/318
thym: 78/318
thym: 88/318
thym: 86/318
thym: 80/318
thym: 84/318
thym: 89/318
thym: 82/318
thym: 91/318
thym: 92/318
thym: 93/318
thym: 94/318
thym: 96/318
thym: 90/318
thym: 98/318
thym: 95/318
thym: 97/318
thym: 99/318
thym: 100/318
thym: 101/318
thym: 102/318
thym: 108

ucec: 291/1540
ucec: 290/1540
ucec: 286/1540
ucec: 294/1540
ucec: 295/1540
ucec: 298/1540
ucec: 292/1540
ucec: 293/1540
ucec: 296/1540
ucec: 302/1540
ucec: 299/1540
ucec: 301/1540
ucec: 297/1540
ucec: 304/1540
ucec: 300/1540
ucec: 307/1540
ucec: 305/1540
ucec: 308/1540
ucec: 303/1540
ucec: 313/1540
ucec: 311/1540
ucec: 306/1540
ucec: 310/1540
ucec: 315/1540
ucec: 317/1540
ucec: 312/1540
ucec: 309/1540
ucec: 316/1540
ucec: 319/1540
ucec: 318/1540
ucec: 314/1540
ucec: 323/1540
ucec: 320/1540
ucec: 322/1540
ucec: 326/1540
ucec: 324/1540
ucec: 321/1540
ucec: 327/1540
ucec: 325/1540
ucec: 328/1540
ucec: 333/1540
ucec: 329/1540
ucec: 334/1540
ucec: 330/1540
ucec: 332/1540
ucec: 331/1540
ucec: 339/1540
ucec: 336/1540
ucec: 338/1540
ucec: 335/1540
ucec: 337/1540
ucec: 340/1540
ucec: 346/1540
ucec: 341/1540
ucec: 342/1540
ucec: 344/1540
ucec: 244/1540
ucec: 349/1540
ucec: 343/1540
ucec: 348/1540
ucec: 347/1540
ucec: 353/1540
ucec: 345/1540
ucec: 355/1540
ucec: 351/1540
ucec: 359/1540
ucec: 352/

ucec: 837/1540
ucec: 840/1540
ucec: 844/1540
ucec: 836/1540
ucec: 842/1540
ucec: 841/1540
ucec: 843/1540
ucec: 839/1540
ucec: 838/1540
ucec: 847/1540
ucec: 834/1540
ucec: 848/1540
ucec: 846/1540
ucec: 849/1540
ucec: 854/1540
ucec: 851/1540
ucec: 852/1540
ucec: 850/1540
ucec: 857/1540
ucec: 853/1540
ucec: 856/1540
ucec: 861/1540
ucec: 862/1540
ucec: 859/1540
ucec: 863/1540
ucec: 855/1540
ucec: 865/1540
ucec: 858/1540
ucec: 867/1540
ucec: 864/1540
ucec: 866/1540
ucec: 870/1540
ucec: 860/1540
ucec: 868/1540
ucec: 875/1540
ucec: 871/1540
ucec: 869/1540
ucec: 874/1540
ucec: 872/1540
ucec: 876/1540
ucec: 879/1540
ucec: 873/1540
ucec: 877/1540
ucec: 880/1540
ucec: 878/1540
ucec: 881/1540
ucec: 884/1540
ucec: 883/1540
ucec: 882/1540
ucec: 885/1540
ucec: 890/1540
ucec: 886/1540
ucec: 889/1540
ucec: 845/1540
ucec: 891/1540
ucec: 887/1540
ucec: 895/1540
ucec: 888/1540
ucec: 897/1540
ucec: 893/1540
ucec: 899/1540
ucec: 894/1540
ucec: 892/1540
ucec: 900/1540
ucec: 898/1540
ucec: 903/1540
ucec: 902/

ucec: 1357/1540
ucec: 1363/1540
ucec: 1362/1540
ucec: 1365/1540
ucec: 1359/1540
ucec: 1364/1540
ucec: 1369/1540
ucec: 1354/1540
ucec: 1366/1540
ucec: 1370/1540
ucec: 1367/1540
ucec: 1368/1540
ucec: 1376/1540
ucec: 1371/1540
ucec: 1372/1540
ucec: 1373/1540
ucec: 1375/1540
ucec: 1378/1540
ucec: 1380/1540
ucec: 1379/1540
ucec: 1377/1540
ucec: 1383/1540
ucec: 1384/1540
ucec: 1381/1540
ucec: 1386/1540
ucec: 1385/1540
ucec: 1387/1540
ucec: 1390/1540
ucec: 1382/1540
ucec: 1389/1540
ucec: 1388/1540
ucec: 1393/1540
ucec: 1392/1540
ucec: 1391/1540
ucec: 1396/1540
ucec: 1394/1540
ucec: 1398/1540
ucec: 1397/1540
ucec: 1401/1540
ucec: 1395/1540
ucec: 1400/1540
ucec: 1403/1540
ucec: 1374/1540
ucec: 1399/1540
ucec: 1407/1540
ucec: 1402/1540
ucec: 1405/1540
ucec: 1406/1540
ucec: 1411/1540
ucec: 1409/1540
ucec: 1404/1540
ucec: 1413/1540
ucec: 1408/1540
ucec: 1410/1540
ucec: 1414/1540
ucec: 1417/1540
ucec: 1419/1540
ucec: 1416/1540
ucec: 1415/1540
ucec: 1412/1540
ucec: 1418/1540
ucec: 1423/1540
ucec: 14

### Download Tiles

Make adjustments to urls to allow tiles to be requested:

In [35]:
dz_pdf = pdf
dz_pdf['url'] = dz_pdf['url'].str.replace('FIF=', 'DeepZoom=')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


The following is the code used to download tile data. Small adjustments were made to this code to allow multiple instances of python, and to allow simultaneous download on a Google Compute Engine VM -- modifications that allowed parallelization and significant speed-ups in data acquisition:

In [94]:
img_path = '/Users/aadi/Documents/tcga_imgs/tiles/'


def process_download_tile(url_idx, df_slice, category, save_dir, x, y):
    img_filename = save_dir + df_slice['name'].iloc[url_idx] + '_' + str(x) + '_' + str(y) + ".jpg"
    img_url = df_slice['url'].iloc[url_idx]
    img_url = img_url[:-17]
    img_url = img_url+"_files/15/"+str(x)+'_'+str(y)+'.jpg'
    #print(img_url)
    try:
        urllib.request.urlretrieve(img_url, img_filename)
    except:
        pass
    #print(category+': '+str(url_idx+1)+'/'+str(len(df_slice['url'])+1)+' -- '+str(x)+','+str(y))


In [96]:
for category in folder_names:
    category_dir = img_path+category
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)
    df_slice = dz_pdf[dz_pdf['collection'].str.contains(category)]
    for url_idx in range(0, len(df_slice['url'])):
        print(category+': '+str(url_idx))
        save_dir = '/Users/aadi/Documents/tcga_imgs/tiles/'+category+'/'+df_slice['name'].iloc[url_idx]+'/'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
#        for x in range(0, 100):
        joblib.Parallel(n_jobs=16)(joblib.delayed(process_download_tile)(url_idx, df_slice, category, save_dir, x, y) for y in range(0,30) for x in range(0,100))


brca: 1000
brca: 1001
brca: 1002
brca: 1003
brca: 1004
brca: 1005
brca: 1006
brca: 1007
brca: 1008
brca: 1009
brca: 1010
brca: 1011
brca: 1012
brca: 1013
brca: 1014
brca: 1015
brca: 1016
brca: 1017
brca: 1018
brca: 1019
brca: 1020
brca: 1021
brca: 1022
brca: 1023
brca: 1024
brca: 1025
brca: 1026
brca: 1027
brca: 1028
brca: 1029
brca: 1030
brca: 1031
brca: 1032
brca: 1033
brca: 1034
brca: 1035
brca: 1036
brca: 1037
brca: 1038
brca: 1039
brca: 1040
brca: 1041
brca: 1042
brca: 1043
brca: 1044
brca: 1045
brca: 1046
brca: 1047
brca: 1048
brca: 1049


KeyboardInterrupt: 