# Download the data

## Tables

In [1]:
import json
import requests
import os
import pandas as pd
from multiprocessing import Pool
import tqdm
import time
import logging

# Write every log record (DEBUG and above) to a file.
logging.basicConfig(filename='api_stress.log', level=logging.DEBUG)

# One CSV per class; the `id` column of each file is read below.
f_df = pd.read_csv('featured.csv')
nf_df = pd.read_csv('not_featured.csv')

featured_ids = list(f_df.id.values)
non_featured_ids = list(nf_df.id.values)
print(len(featured_ids), len(non_featured_ids))

def get_remote_data(pic_id, timeout=30):
    """Fetch the JSON record of one picture from the PicsArt API.

    The start/end of the call, the response headers and the raw body are
    logged together with UNIX timestamps.

    Args:
        pic_id: Picture id; converted with ``str`` and inserted into the
            ``photos/show/<id>.json`` URL.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the caller forever.

    Returns:
        The decoded JSON document, or ``None`` when the request or the
        decoding failed (the failure is logged with its traceback).
    """
    try:
        # Messages are logged as text; a bytes message would be rendered as "b'...'".
        logging.info('PIC_ID = %s UNIX_TS_START = %s', pic_id, time.time())

        api_url = 'https://api.picsart.com/photos/show/%s.json' % str(pic_id)
        response = requests.get(api_url, timeout=timeout)

        # `response.encoding` may be None (no charset announced by the server);
        # bytes.decode(None) raises TypeError, so fall back to UTF-8.
        body = response.content.decode(response.encoding or 'utf-8')

        logging.debug('PIC_ID = %s UNIX_TS = %s HEADERS = %s', pic_id, time.time(), response.headers)
        logging.debug('PIC_ID = %s UNIX_TS = %s RESPONSE = %s', pic_id, time.time(), body.rstrip())

        data = json.loads(body)

        logging.info('PIC_ID = %s UNIX_TS_END = %s', pic_id, time.time())

        return data

    except Exception:
        # Best effort: one bad picture must not stop a bulk download. Only
        # Exception is caught so that Ctrl-C / SystemExit still propagate.
        logging.exception('PIC_ID = %s UNIX_EXCEPTION_TS_END = %s', pic_id, time.time())
        return None

def parse_data_item(data, entities_list, user_entities_list, location_entities_list,
                    tags_entities_list=None):
    """Flatten one API record into a single-level dict.

    Args:
        data: Decoded API record; anything `catch` can index (a failed
            download passes ``None`` and yields a row of empty strings).
        entities_list: Top-level keys to copy.
        user_entities_list: Keys to copy from ``data['user']``.
        location_entities_list: Keys to copy from ``data['location']``.
        tags_entities_list: Further top-level keys to copy; defaults to the
            module-level ``tags_entities`` list.

    Returns:
        A dict with one entry per requested key; missing values are ``''``.
        A nested key whose name is already taken (e.g. the user's ``id`` next
        to the picture's ``id``) is stored as ``'<parent>_<key>'``, such as
        ``'user_id'``, so it no longer overwrites the top-level value.
    """
    if tags_entities_list is None:
        tags_entities_list = tags_entities

    entities = {key: catch(data, key) for key in entities_list}
    entities.update((key, catch(data, key)) for key in tags_entities_list)

    nested_groups = (('user', user_entities_list), ('location', location_entities_list))
    for parent, keys in nested_groups:
        nested = catch(data, parent)
        for key in keys:
            # Keep the value that is already there; prefix the newcomer.
            column = key if key not in entities else '{}_{}'.format(parent, key)
            entities[column] = catch(nested, key)

    return entities
def catch(data,key):
    """Return ``data[key]``, or ``''`` when the lookup is not possible.

    The empty string is returned for a missing key/index and for containers
    that cannot be indexed with ``key`` (``None``, numbers, a string indexed
    by a string, ...).
    """
    try:
        return data[key]
    except (LookupError, TypeError):
        # Only lookup failures are swallowed; KeyboardInterrupt and friends propagate.
        return ''
def formatted_api_response(pic_id):
    """Download the record of `pic_id` and flatten it with the module-level key lists."""
    raw_record = get_remote_data(pic_id)
    return parse_data_item(
        raw_record,
        entities_list,
        user_entities_list,
        location_entities_list,
    )

# Top-level keys copied from every API record.
entities_list = [
    'comments_count', 'views_count', 'reposts_count',
    'created', 'public',
    'sources_count', 'streams_count',
    'height', 'width',
    'type', 'is_reposted', 'url',
    'likes_count', 'status', 'mature',
    'id', 'has_similars', 'forks_count',
]

# Keys copied from the nested 'user' object.
user_entities_list = [
    'photo',
    'followers_count',
    'photos_count',
    'username',
    'is_verified',
    'name',
    'id',
]

# Keys copied from the nested 'location' object.
location_entities_list = ['country_code']

# Top-level keys holding the tags.
tags_entities = ['tags']


99687 100039


In [2]:
def _fetch_and_save(pic_ids, csv_path, n_workers=16):
    """Query the API for every id in parallel and store the rows as CSV.

    Args:
        pic_ids: List of picture ids handed to `formatted_api_response`.
        csv_path: File the resulting frame is written to.
        n_workers: Number of worker processes in the pool.

    Returns:
        ``(records, frame)``: the list of flattened records and the
        DataFrame built from it.
    """
    with Pool(n_workers) as pool:
        # tqdm wraps the lazy imap iterator, so the bar advances per finished id.
        records = list(tqdm.tqdm(pool.imap(formatted_api_response, pic_ids), total=len(pic_ids)))

    frame = pd.DataFrame(records)
    frame.to_csv(csv_path)
    return records, frame


pic_data, f_p_df = _fetch_and_save(featured_ids, 'f_p_df_w_tags.csv')
pic_data, nf_p_df = _fetch_and_save(non_featured_ids, 'nf_p_df_w_tags.csv')

100%|██████████| 99687/99687 [34:56<00:00, 47.55it/s]
100%|██████████| 100039/100039 [32:13<00:00, 51.74it/s]


## Pics

In [2]:
import pandas as pd
import numpy as np
import collections


# Load the featured / non-featured picture tables from their CSV files.
# NOTE(review): the non-featured table is read from 'nf_p_df_wo_tags.csv', while the
# download cell of this notebook writes 'nf_p_df_w_tags.csv' - confirm the intended file.
f_p_df, nf_p_df = (
    pd.read_csv(csv_name)
    for csv_name in ('f_p_df_w_tags.csv', 'nf_p_df_wo_tags.csv')
)

def split_string(string):
    """Return the fourth '/'-separated piece of `string`.

    For a url such as 'http://cdn61.picsart.com/185271245000201.jpg' that
    piece is the file name.

    Returns:
        The piece as ``str``, or ``np.nan`` when `string` is not a string
        (e.g. NaN for a missing url) or has fewer than four pieces.
    """
    try:
        return string.split('/')[3]
    except (AttributeError, IndexError, TypeError):
        # The missing-value marker has to be returned explicitly.
        return np.nan

# File name of every picture, derived from its url.
f_p_df['filename'] = f_p_df['url'].apply(split_string)
nf_p_df['filename'] = nf_p_df['url'].apply(split_string)

# url -> filename, restricted to the rows that have a url.
f_p_dict = collections.OrderedDict(dict(zip(
    f_p_df[pd.notnull(f_p_df.url)].url,
    f_p_df[pd.notnull(f_p_df.url)].filename,
)))
nf_p_dict = collections.OrderedDict(dict(zip(
    nf_p_df[pd.notnull(nf_p_df.url)].url,
    nf_p_df[pd.notnull(nf_p_df.url)].filename,
)))

In [18]:
# Target folders for featured (f) and non-featured (nf) pictures;
# -p also creates the shared parent ../data/aesthetics.
!mkdir -p ../data/aesthetics/f ../data/aesthetics/nf

In [24]:
# Peek at the first (url, filename) pair of the featured mapping.
list(f_p_dict.items())[0]

('http://cdn61.picsart.com/185271245000201.jpg', '185271245000201.jpg')

In [3]:
import pandas as pd
import numpy as np
import collections
import tqdm
import requests
import shutil
from multiprocessing import Pool

# Load the featured / non-featured picture tables from their CSV files.
# NOTE(review): the non-featured table is read from 'nf_p_df_wo_tags.csv', while the
# download cell of this notebook writes 'nf_p_df_w_tags.csv' - confirm the intended file.
f_p_df, nf_p_df = (
    pd.read_csv(csv_name)
    for csv_name in ('f_p_df_w_tags.csv', 'nf_p_df_wo_tags.csv')
)

def split_string(string):
    """Return the fourth '/'-separated piece of `string`.

    For a url such as 'http://cdn61.picsart.com/185271245000201.jpg' that
    piece is the file name.

    Returns:
        The piece as ``str``, or ``np.nan`` when `string` is not a string
        (e.g. NaN for a missing url) or has fewer than four pieces.
    """
    try:
        return string.split('/')[3]
    except (AttributeError, IndexError, TypeError):
        # The missing-value marker has to be returned explicitly.
        return np.nan

# File name of every picture, derived from its url.
f_p_df['filename'] = f_p_df['url'].apply(split_string)
nf_p_df['filename'] = nf_p_df['url'].apply(split_string)

# url -> filename, restricted to the rows that have a url.
f_p_dict = dict(zip(
    f_p_df[pd.notnull(f_p_df.url)].url,
    f_p_df[pd.notnull(f_p_df.url)].filename,
))
nf_p_dict = dict(zip(
    nf_p_df[pd.notnull(nf_p_df.url)].url,
    nf_p_df[pd.notnull(nf_p_df.url)].filename,
))



def _download_file(url_path, target_dir, timeout=30):
    """Stream one picture into `target_dir`.

    Args:
        url_path: ``(url, filename)`` pair.
        target_dir: Directory prefix (including the trailing slash) that the
            file name is appended to.
        timeout: Seconds handed to ``requests.get``.

    Returns:
        ``True`` when the server answered 200 and the body was written,
        ``False`` when the request failed or another status came back.
    """
    try:
        r = requests.get(url_path[0], stream=True, timeout=timeout)
    except requests.RequestException:
        # Unreachable host / timeout: report it instead of aborting a bulk run.
        return False

    try:
        if r.status_code != 200:
            return False
        with open(target_dir + url_path[1], 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
        return True
    finally:
        # A streamed response is closed explicitly once the body has been handled.
        r.close()


def dowload_file_f(url_path):
    """Download one featured picture; see `_download_file` for the return value."""
    return _download_file(url_path, '../data/aesthetics/f/')


def dowload_file_nf(url_path):
    """Download one non-featured picture; see `_download_file` for the return value."""
    return _download_file(url_path, '../data/aesthetics/nf/')
            
            
# Download with 16 worker processes: non-featured pictures first, then featured ones.
nf_items = list(nf_p_dict.items())
with Pool(16) as p:
    nf_progress = tqdm.tqdm(p.imap(dowload_file_nf, nf_items), total=len(nf_items))
    pic_data = list(nf_progress)

f_items = list(f_p_dict.items())
with Pool(16) as p:
    f_progress = tqdm.tqdm(p.imap(dowload_file_f, f_items), total=len(f_items))
    pic_data = list(f_progress)

        

  0%|          | 154/99899 [00:13<2:21:48, 11.72it/s]Process ForkPoolWorker-15:
Process ForkPoolWorker-13:
Process ForkPoolWorker-14:
Process ForkPoolWorker-16:
Process ForkPoolWorker-12:
Process ForkPoolWorker-11:
Process ForkPoolWorker-9:
Process ForkPoolWorker-10:


KeyboardInterrupt: 

In [14]:
# Number of featured pictures that have a url (the recorded output of this cell is a count).
len(f_p_dict)

98939

In [8]:
# Fourth key of nf_p_dict (its keys are the picture urls).
list(nf_p_dict)[3]

'http://cdn20.picsart.com/162526344000202.jpeg'

In [1]:
import subprocess

# Position of the first entry to fetch; raise it to resume an interrupted run.
START_AT = 0


def _wget_all(url_to_file, target_dir, start_at=START_AT):
    """Download every url of `url_to_file` into `target_dir` with wget.

    Args:
        url_to_file: Mapping url -> file name.
        target_dir: Directory prefix (including the trailing slash).
        start_at: Entries before this position are skipped.

    Relies on `tqdm` having been imported by an earlier cell.
    """
    for i, (url, file) in enumerate(tqdm.tqdm(url_to_file.items())):
        if i < start_at:
            continue
        file_path = target_dir + file
        # Arguments are passed as a list, so no shell parses the url or the path.
        # NOTE(review): --no-check-certificate switches off TLS verification;
        # it is kept from the original command - confirm it is still needed.
        subprocess.call(['wget', '--quiet', '--continue', '--no-check-certificate',
                         '--no-proxy', '-O', file_path, url])


_wget_all(f_p_dict, '../data/aesthetics/f/')
_wget_all(nf_p_dict, '../data/aesthetics/nf/')
        

NameError: name 'tqdm' is not defined

In [10]:
import requests
import shutil

# Download a single non-featured picture as a smoke test.
url = 'http://cdn20.picsart.com/162526344000202.jpeg'
path = '../data/aesthetics/nf/162526344000202.jpeg'

# The timeout keeps the cell from hanging on a stalled connection.
r = requests.get(url, stream=True, timeout=30)
try:
    if r.status_code == 200:
        with open(path, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
finally:
    # A streamed response is closed explicitly once the body has been handled.
    r.close()

## AVA dataset

In [None]:
# AVA torrent link
http://academictorrents.com/download/71631f83b11d3d79d8f84efe0a7e12f0ac001460.torrent

# Original threads - they seem to be obsolete / unmaintained / behind a paywall
https://www.reddit.com/r/MachineLearning/comments/5sa3ag/r_image_aestetics_dataset_avadpchallenge/
https://github.com/mtobeiyf/ava_downloader

# Transmission cli
https://help.ubuntu.com/community/TransmissionHowTo

sudo apt-get install transmission-cli transmission-common transmission-daemon
sudo service transmission-daemon start

# sudo service transmission-daemon stop

sudo nano /var/lib/transmission-daemon/info/settings.json
"rpc-whitelist": "127.0.0.1,192.168.*.*",

# Do some EDA

In [13]:
# Number of featured picture ids.
len(featured_ids)

99687

In [60]:
from random import randint

# Fetch the raw API record of one randomly chosen featured picture.
# randint includes both bounds, so the largest valid index is len - 1.
api_url = 'https://api.picsart.com/photos/show/%s.json' % str(featured_ids[randint(0, len(featured_ids) - 1)])
response = requests.get(api_url)
# `response.encoding` may be None when the server announces no charset; fall back to UTF-8.
data = json.loads(response.content.decode(response.encoding or 'utf-8'))
data

{'comments_count': 6,
 'created': '2014-12-27T14:16:01.904Z',
 'forks_count': 0,
 'has_similars': False,
 'height': 639,
 'id': 157385761000201,
 'is_reposted': False,
 'likes_count': 111,
 'location': None,
 'mature': False,
 'public': True,
 'reposts_count': 1,
 'sources_count': 0,
 'status': 'success',
 'streams_count': 2,
 'tags': ['music', 'blackandandwhite', 'photography', 'hands'],
 'title': '"Taal" or Rythem ....this one goes to @carlos111 and his music series my friends \n#music  #blackandandwhite #photography #hands ',
 'type': 'photo',
 'url': 'http://cdn23.picsart.com/157385761000201.jpeg',
 'user': {'followers_count': 312659,
  'id': 48249099,
  'is_verified': False,
  'name': 'Prithwiraj Sarker',
  'photo': 'http://cdn35.picsart.com/136072804001201.jpeg',
  'photos_count': 1393,
  'username': 'new2me'},
 'views_count': 7214,
 'width': 826}

In [2]:
import pandas as pd

In [6]:
# Column names of the featured-pictures frame.
f_p_df.columns.values

array(['Unnamed: 0', 'comments_count', 'country_code', 'created',
       'followers_count', 'forks_count', 'has_similars', 'height', 'id',
       'is_reposted', 'is_verified', 'likes_count', 'mature', 'name',
       'photo', 'photos_count', 'public', 'reposts_count', 'sources_count',
       'status', 'streams_count', 'type', 'url', 'username', 'views_count'], dtype=object)

In [3]:
# Load the featured-pictures table from 'f_p_df.csv' and preview its first rows.
f_p_df = pd.read_csv('f_p_df.csv')
f_p_df.head()

Unnamed: 0.1,Unnamed: 0,comments_count,country_code,created,followers_count,forks_count,has_similars,height,id,is_reposted,...,photos_count,public,reposts_count,sources_count,status,streams_count,type,url,username,views_count
0,0,9.0,RU,2017-07-02T04:30:40.201Z,815.0,4.0,True,1447.0,204794600000000.0,False,...,1332.0,True,9.0,3.0,success,22.0,photo,https://cdn131.picsart.com/236665883091202.jpg,primachuk71,2828.0
1,1,8.0,US,2017-06-28T02:13:06.502Z,358.0,0.0,True,1095.0,212855600000000.0,False,...,590.0,True,7.0,0.0,success,18.0,photo,https://cdn141.picsart.com/236312023077202.jpg,isabellamelo7,3887.0
2,2,12.0,US,2017-05-14T20:52:13.009Z,6179.0,21.0,True,1080.0,160674400000000.0,False,...,776.0,True,38.0,1.0,success,31.0,photo,https://cdn171.picsart.com/232491133001202.jpg,williamld05,37668.0
3,3,14.0,IT,2015-12-24T18:42:35.433Z,107364.0,0.0,False,1280.0,188678400000000.0,False,...,104.0,True,62.0,0.0,success,10.0,photo,http://cdn78.picsart.com/188678555000202.jpg,riccardonosvelli,70063.0
4,4,15.0,,2014-06-03T10:41:18.435Z,83.0,0.0,False,696.0,133348300000000.0,False,...,67.0,True,,0.0,success,2.0,photo,http://cdn29.picsart.com/139488078000202.jpeg,jeanettebau,491.0


In [4]:
# Load the non-featured-pictures table from 'nf_p_df.csv' and preview its first rows.
nf_p_df = pd.read_csv('nf_p_df.csv')
nf_p_df.head()

Unnamed: 0.1,Unnamed: 0,comments_count,country_code,created,followers_count,forks_count,has_similars,height,id,is_reposted,...,photos_count,public,reposts_count,sources_count,status,streams_count,type,url,username,views_count
0,0,0.0,,2015-05-10T17:08:14.297Z,0.0,0.0,False,1536.0,165105500000000.0,False,...,6.0,True,0.0,0.0,success,0.0,photo,http://cdn51.picsart.com/168973693005202.jpeg,renaehuff1,65.0
1,1,0.0,,2015-06-29T14:04:13.665Z,1375.0,0.0,False,2059.0,169560800000000.0,False,...,200.0,True,0.0,0.0,success,0.0,photo,http://cdn55.picsart.com/173282653001202.jpg,kairi_seila,68.0
2,2,0.0,,2015-06-20T04:04:51.923Z,1.0,0.0,False,3052.0,172377700000000.0,False,...,1.0,True,0.0,0.0,success,0.0,photo,http://cdn55.picsart.com/172469088003202.jpg,latashathompson1,57.0
3,3,1.0,US,2015-12-17T02:24:22.739Z,1.0,0.0,False,640.0,187802000000000.0,False,...,8.0,True,0.0,0.0,success,0.0,photo,http://cdn65.picsart.com/188015062000202.jpg,tunhtet,65.0
4,4,0.0,,2015-04-05T11:52:26.097Z,0.0,0.0,False,1024.0,11698180.0,False,...,4.0,True,0.0,0.0,success,0.0,photo,http://cdn46.picsart.com/165930745001201.jpeg,bruno-mesquita-393,63.0


In [12]:
# 'type' value of the row with index label 0 in the non-featured frame.
nf_p_df['type'][0]

'photo'

In [None]:
# Column groups of the picture tables, by the kind of value they hold.
numeric_columns = ['comments_count','followers_count', 'forks_count','likes_count','photos_count','reposts_count','sources_count', 'streams_count','views_count']
categorical_columns = ['country_code','type','status']
date_columns = ['created']
technical_data = ['height','id']
binary_vars = ['has_similars','is_verified', 'public','mature','is_reposted']
# 'username' is free text (e.g. 'new2me'), so it is grouped with 'name' rather than with the booleans.
other_data = ['name', 'username']