In [1]:
import phyloGraph as ph

from plotly.offline import download_plotlyjs, init_notebook_mode
init_notebook_mode(connected=True)

#vim ~/.Plotly/.Credentials

import requests
import collections
import json
import pandas as pd
import numpy as np
import re
import sys
from time import time

import wikipedia
#https://stackoverflow.com/questions/8088226/content-of-infobox-of-wikipedia



In [2]:
GEO_TIME = "geological_time.csv"
DATA_FILE = "data/Mammalia-15040-df-fixedage.csv"
OUT_FILE = 'data/Mammalia-15040-df-images.csv'

#DATA_FILE = "data/Chordata-2499-df-fixedage.csv"
#OUT_FILE = 'data/Chordata-2499-df-images.csv'

In [3]:
gt = pd.read_csv(GEO_TIME)

def trynum(n):
    try:
        return float(n)
    except:
        return None

In [4]:
df = pd.read_csv(DATA_FILE)
print(df.shape)
print(df.columns)

(1688, 11)
Index(['ancestor', 'depth', 'extinct', 'id', 'name', 'num_kids', 'phylesis',
       'x', 'y', 'Begin', 'End'],
      dtype='object')


In [5]:
this_query = df.name[0]
this_query

'Mammalia'

In [6]:
start = time()

In [7]:
def parse_png_from_box(box):
    png_str = [b for b in box.split('|') if (('.png' in b) or ('.jpg' in b)) and ('image' in b)][0]
    #clean up string
    png_str = re.sub("\\n", '', png_str)
    png_str = re.sub("^(.+?)image +?= +?", '', png_str)
    png_str = re.sub("^(.+?)File:", '', png_str)
    png_str = png_str.replace(" ", "_")
    return png_str

def get_image_url(this_query, 
                  base_url = "https://en.wikipedia.org/wiki/File:",
                  fallback = "https://en.wikipedia.org/wiki/File:Wikipedia-logo-v2.svg"):
    """
    """
    try:
        # get wikipedia page
        this_page_id = wikipedia.search(this_query, results=1)[0]
    
        # fetch infobox
        wiki_url = 'http://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&titles={}&rvsection=0'.format(this_page_id)
        box_raw = json.loads(requests.get(wiki_url).content.decode('utf-8'))
        box = next(iter(box_raw['query']['pages'].values()))['revisions'][0]['*']
        
        png_str = parse_png_from_box(box)
        
        return base_url + png_str
    except:
        return fallback

In [8]:
get_image_url(df.name[0])

'https://en.wikipedia.org/wiki/File:image=Kryptopterus.jpg'

In [9]:
img_list = []
for i, row in df.iterrows():
    img_list.append(get_image_url(row['name']))
    if i % 50 == 0:
        print("{}::{}".format(i, int(time()-start)), end= ' ', flush=True)

0::1 50::36 100::72 150::111 200::152 250::189 300::239 350::277 400::313 450::348 500::384 550::422 600::457 650::489 700::523 750::560 800::594 850::626 900::662 950::693 1000::729 1050::770 1100::806 1150::838 1200::874 1250::908 1300::944 1350::982 1400::1016 1450::1048 1500::1085 1550::1126 1600::1161 1650::1196 1700::1227 1750::1266 1800::1301 1850::1338 1900::1375 1950::1412 2000::1447 2050::1482 2100::1521 2150::1558 2200::1593 2250::1629 2300::1668 2350::1701 2400::1737 2450::1776 2500::1814 2550::1850 2600::1886 2650::1922 2700::1958 2750::1998 2800::2030 2850::2065 2900::2104 2950::2136 3000::2168 3050::2199 3100::2232 3150::2268 3200::2303 3250::2332 3300::2365 3350::2401 3400::2434 3450::2469 3500::2504 3550::2536 3600::2569 3650::2604 3700::2643 3750::2677 3800::2713 3850::2749 3900::2784 3950::2823 4000::2865 4050::2904 4100::2938 4150::2971 4200::3010 4250::3047 4300::3084 4350::3122 4400::3164 4450::3197 4500::3235 4550::3275 4600::3309 4650::3344 4700::3384 4750::3422

In [10]:
df['img_url'] = img_list
df.tail()

Unnamed: 0,ancestor,depth,extinct,id,name,num_kids,phylesis,x,y,Begin,End,img_url
10390,15088,9,0,15091,Polymixiiformes,0,0,-1.329644,1.238158,34.0,0.0,https://en.wikipedia.org/wiki/File:Polymixia_n...
10391,15088,9,0,15089,Lampridiformes,0,0,-1.387254,0.381895,0.0,0.0,https://en.wikipedia.org/wiki/File:Lophotus_la...
10392,14843,5,2,14924,Acanthodii,0,0,-1.555681,0.007475,188.0,0.0,https://en.wikipedia.org/wiki/File:image_=_Aca...
10393,14843,5,0,14925,Chondrichthyes,0,0,-2.013908,0.478732,188.0,0.0,https://en.wikipedia.org/wiki/File:White_shark...
10394,14843,5,2,14926,Placodermi,0,0,-1.105787,0.408763,188.0,0.0,https://en.wikipedia.org/wiki/File:Bothriolepi...


In [11]:
print(df.shape)
print(df.groupby('img_url').count().sort_values(by='name', ascending=False).head())

(10395, 12)
                                                    ancestor  depth  extinct  \
img_url                                                                        
https://en.wikipedia.org/wiki/File:Wikipedia-lo...      3048   3048     3048   
https://en.wikipedia.org/wiki/File:Stone_Sheep_...        35     35       35   
https://en.wikipedia.org/wiki/File:MoreporkMaun...        21     21       21   
https://en.wikipedia.org/wiki/File:Family_Cervi...        20     20       20   
https://en.wikipedia.org/wiki/File:White-bellie...        19     19       19   

                                                      id  name  num_kids  \
img_url                                                                    
https://en.wikipedia.org/wiki/File:Wikipedia-lo...  3048  3048      3048   
https://en.wikipedia.org/wiki/File:Stone_Sheep_...    35    35        35   
https://en.wikipedia.org/wiki/File:MoreporkMaun...    21    21        21   
https://en.wikipedia.org/wiki/File:Family_Cervi

In [12]:
#df.to_csv(OUT_FILE, index=False)

In [13]:
#get_image_url("Bill Murray")

In [14]:
df = pd.read_csv(OUT_FILE)
df.head()

Unnamed: 0,ancestor,depth,extinct,id,name,num_kids,phylesis,x,y,Begin,End,img_url
0,14973,1,0,15040,Mammalia,2,0,0.985482,-0.609871,225.0,0.0,https://en.wikipedia.org/wiki/File:Mammal_Dive...
1,15040,2,2,15989,Triconodonts,0,0,0.626042,-1.029422,190.0,70.0,https://en.wikipedia.org/wiki/File:Blank_page_...
2,15040,2,0,15991,Monotremata,2,0,1.577066,-1.15783,210.0,0.0,https://en.wikipedia.org/wiki/File:Prototheria...
3,15991,3,0,16250,Tachyglossidae,4,0,1.934838,-1.035496,23.03,0.0,https://en.wikipedia.org/wiki/File:Short-beake...
4,16250,4,0,16251,Tachyglossus aculeatus,0,0,2.682316,-1.617251,11.5,0.0,https://en.wikipedia.org/wiki/File:Wild_shortb...


In [15]:
#print(df.shape)
#df.groupby('img_url').count().sort_values(by='name', ascending=False)

In [15]:
wiki_img_url = df.img_url[0]
wiki_img_url

'https://en.wikipedia.org/wiki/File:Mammal_Diversity_2011.png'

In [16]:
def parse_wiki_pic(wiki_img_url, telltale="//upload.wikimedia.org/wikipedia/commons"):
    wiki_pic = requests.get(wiki_img_url).content.decode("utf-8")
    png_str = [b for b in wiki_pic.split('|') if ('.png' in b) and (telltale in b)][0]
    #clean up string
    png_str = re.sub("\\n", '', png_str)
    #png_str = re.sub('^(.+?)href ?= ?\"', '', png_str)
    png_str = re.sub('^(.+?)href ?= ?\"'+telltale, telltale, png_str)
    png_str = re.sub('\"(.+?)$', '', png_str)
    return "https:"+png_str

In [17]:
import base64
import requests
from resizeimage import resizeimage
from PIL import Image
import io

def bytes_to_img(img_bytes, resize=[]):
    image = Image.open(io.BytesIO(img_bytes))
    
    if len(resize) == 2:
        image = resizeimage.resize_contain(image, resize)
    elif len(resize) != 0:
        raise ValueError("resize argument takes a list of length 2 (height x width)")
    
    return image
    
def get_image_base64(url, resize=[]):
    img_bytes = requests.get(url).content
    
    if len(resize) == 2:
        img = bytes_to_img(img_bytes, resize=resize)
        buffered = io.BytesIO()
        img.save(buffered, format="JPEG")
        img_bytes = buffered.getvalue()
    elif len(resize) != 0:
        raise ValueError("resize argument takes a list of length 2 (height x width)")
        
    return base64.b64encode(img_bytes)



In [21]:
#df.img_url[1]

In [25]:
#parse_wiki_pic(wiki_img_url)
wiki_img_url = df.img_url[1]
wiki_img_url
requests.get(real_img_url).content

b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x03 \x00\x00\x02A\x08\x06\x00\x00\x00\xbe\xaf\xd1#\x00\x00\x00\x04gAMA\x00\x00\xaf\xc87\x05\x8a\xe9\x00\x00\x00\tpHYs\x00\x00\x0e\xc4\x00\x00\x0e\xc4\x01\x95+\x0e\x1b\x00\x00\x00\x19tEXtSoftware\x00Adobe ImageReadyq\xc9e<\x00\x00\xff\x8dIDATx^\xec\xfd\x07\xbce\xd7y\xde\x07?\xa7\x9f{\xee\xb9\xbd\xd7\x99\xb9\xd3+:\x88F\x80\x04\xbbD\xb1\xa9Rr\xec$N\xe4\xd8\x8e\xed/\x8e\xe3XI\xec\xcfv\xf2\xfb\\\xe28V\x149\xb6\x14\xc9\x89EI\x14%\x91\x14;H\x82\xe8}0\xc0\xf4\xde\xe7\xf6^\xce=\xfd\x9c\xef\xfd\xaf}\xd7\xcc\xc5\x10\x04\xc1\x82\xe1\x80\xd8\xcf\xcc\xba{\x9f\xbdWyW\xd9oY5R7(D\x88\x10!B\x84\x08\x11"D\x88\x10!n\x00\xa2k\xd7\x10!B\x84\x08\x11"D\x88\x10!B\x84x\xcb\x11\x1a !B\x84\x08\x11"D\x88\x10!B\x84\xb8a\x08\r\x90\x10!B\x84\x08\x11"D\x88\x10!B\xdc0\x84\x06H\x88\x10!B\x84\x08\x11"D\x88\x10!n\x18B\x03$D\x88\x10!B\x84\x08\x11"D\x88\x107\x0c\xa1\x01\x12"D\x88\x10!B\x84\x08\x11"D\x88\x1b\x86\xd0\x00\t\x11"D\x88\x10!B\x84\x08\x11"\xc4\rCh\x80\x84\x08\x11"D\x88

In [23]:
real_img_url = parse_wiki_pic(wiki_img_url)
print(real_img_url)
bytes_to_img(requests.get(real_img_url).content, resize=[150,150]).show()
#get_image_base64(real_img_url, resize=[100,100])

https://upload.wikimedia.org/wikipedia/commons/3/30/Blank_page_by_neoslashott.png


In [23]:
###num_pics = 25
###for i in list(np.random.choice(list(range(df.shape[0])), num_pics)):
###    print(i, end=' ', flush=True)
###    try:
###        wiki_img_url = df.img_url[i]
###        print(df.name[i], end=' ', flush=True)
###        real_img_url = parse_wiki_pic(wiki_img_url)
###        get_image(requests.get(real_img_url).content, resize=[150,150]).show()
###        print("showing "+real_img_url)
###    except:
###        print("-- Bill!")
###        #wiki_img_url = get_image_url("Bill Murray")
###        #real_img_url = parse_wiki_pic(wiki_img_url)
###        #get_image(requests.get(real_img_url).content, resize=[150,150]).show()

In [24]:
wiki_img_url = get_image_url("Bill Murray")
real_img_url = parse_wiki_pic(wiki_img_url)
fallback_bill = get_image_base64(real_img_url, resize=[100,100])

In [25]:
img_list = []
for i, row in df.iterrows():
    try:
        wiki_img_url = row['img_url']
        real_img_url = parse_wiki_pic(wiki_img_url)
        img_list.append(get_image_base64(real_img_url, resize=[100,100]))
    except:
        #print("-- Bill!")
        img_list.append(fallback_bill)
    if i % 50 == 0:
        print("{}::{}".format(i, int(time()-start)), end= ' ', flush=True)

0::7910 50::7955 100::7981 


Corrupt EXIF data.  Expecting to read 4 bytes but only got 0. 



150::8012 200::8059 250::8099 300::8144 350::8173 400::8202 450::8224 500::8271 550::8361 600::8410 650::8475 700::8629 750::8773 800::8842 850::8898 900::8918 950::8959 1000::9081 1050::9165 1100::9190 1150::9246 1200::9302 1250::9459 1300::9543 1350::9631 1400::9734 1450::9785 1500::9892 1550::9962 1600::9991 1650::10051 1700::10102 1750::10154 1800::10215 1850::10267 1900::10335 1950::10406 2000::10511 2050::10573 2100::10619 2150::10717 2200::10792 2250::10854 2300::10919 2350::10990 2400::11074 2450::11137 2500::11279 2550::11377 2600::11457 2650::11535 2700::11622 2750::11672 2800::11717 2850::11784 2900::11846 2950::11885 3000::11940 3050::12013 3100::12070 3150::12118 3200::12167 3250::12201 3300::12253 3350::12302 3400::12355 3450::12402 3500::12465 3550::12556 3600::12620 3650::12659 3700::12710 3750::12763 3800::12793 3850::12860 3900::12927 3950::12975 4000::13030 4050::13111 4100::13195 4150::13265 4200::13335 4250::13412 4300::13491 4350::13543 4400::13616 4450::13691 450


Possibly corrupt EXIF data.  Expecting to read 32 bytes but only got 0. Skipping tag 270


Possibly corrupt EXIF data.  Expecting to read 24 bytes but only got 0. Skipping tag 271


Possibly corrupt EXIF data.  Expecting to read 17 bytes but only got 0. Skipping tag 272


Possibly corrupt EXIF data.  Expecting to read 8 bytes but only got 0. Skipping tag 282


Possibly corrupt EXIF data.  Expecting to read 8 bytes but only got 0. Skipping tag 283


Possibly corrupt EXIF data.  Expecting to read 32 bytes but only got 0. Skipping tag 305


Possibly corrupt EXIF data.  Expecting to read 20 bytes but only got 0. Skipping tag 306


Possibly corrupt EXIF data.  Expecting to read 528 bytes but only got 0. Skipping tag 50341


Possibly corrupt EXIF data.  Expecting to read 8 bytes but only got 0. Skipping tag 33434


Possibly corrupt EXIF data.  Expecting to read 8 bytes but only got 0. Skipping tag 33437


Possibly corrupt EXIF data.  Expecting to read 20 bytes but only got 0. Skipping tag 3

6600::16749 6650::16912 6700::17045 6750::17123 6800::17193 6850::17271 6900::17332 6950::17383 7000::17434 7050::17502 7100::17574 7150::17667 7200::17726 7250::17779 


Image size (177053184 pixels) exceeds limit of 89478485 pixels, could be decompression bomb DOS attack.



7300::17926 7350::18007 7400::18101 7450::18185 7500::18290 7550::18427 7600::18567 7650::18661 7700::18745 7750::18855 7800::18921 7850::19005 7900::19072 7950::19141 8000::19216 8050::19308 8100::19368 8150::19445 8200::19528 8250::19574 8300::19604 8350::19641 8400::19670 8450::19705 8500::19741 8550::19764 8600::19799 8650::19829 8700::19873 8750::19907 8800::19943 8850::19969 8900::19994 8950::20020 9000::20083 9050::20099 9100::20121 9150::20173 9200::20233 9250::20255 9300::20296 9350::20321 9400::20363 9450::20402 9500::20438 9550::20469 9600::20497 9650::20527 9700::20588 9750::20616 9800::20669 9850::20699 9900::20734 9950::20763 10000::20789 10050::20815 10100::20844 10150::20855 10200::20880 10250::20907 10300::20946 10350::20986 

In [26]:
df['img_base64'] = img_list
df.tail()

Unnamed: 0,ancestor,depth,extinct,id,name,num_kids,phylesis,x,y,Begin,End,img_url,img_base64
10390,15088,9,0,15091,Polymixiiformes,0,0,-1.329644,1.238158,34.0,0.0,https://en.wikipedia.org/wiki/File:Polymixia_n...,b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...
10391,15088,9,0,15089,Lampridiformes,0,0,-1.387254,0.381895,0.0,0.0,https://en.wikipedia.org/wiki/File:Lophotus_la...,b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...
10392,14843,5,2,14924,Acanthodii,0,0,-1.555681,0.007475,188.0,0.0,https://en.wikipedia.org/wiki/File:image_=_Aca...,b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...
10393,14843,5,0,14925,Chondrichthyes,0,0,-2.013908,0.478732,188.0,0.0,https://en.wikipedia.org/wiki/File:White_shark...,b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...
10394,14843,5,2,14926,Placodermi,0,0,-1.105787,0.408763,188.0,0.0,https://en.wikipedia.org/wiki/File:Bothriolepi...,b'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgH...


In [27]:
df.to_csv(OUT_FILE.replace(".csv", "-base64.csv"), index=False)