# Data Collection - Conceptual Captions
In this document, I'll download the images from Google's Conceptual Captions dataset. (https://ai.google.com/research/ConceptualCaptions/download)
There are a lot of images, but for this project sake, I will only get 30K images.

In [51]:
import pandas as pd
import requests
import sys
import time
from PIL import Image
import os
import joblib

In [52]:
train_df = pd.read_csv('../DATA/Train_GCC-training.tsv', delimiter='\t', header=None, names = ['description', 'url'])

### Download images
Now I'll make a call to individual url to download images to the local drive.

In [53]:
path = '../DATA/IMAGES/Conceptual_Captions'

In [54]:
flist = os.listdir(path)

In [None]:
to_delete = []

In [85]:
# downloading images
def download_img(ind, url, path, flist = None, pause = False): 
    status = None 
    
    if flist == None: 
        flist = os.listdir(path)
    
    print(f"Processing {ind} ", end = '...')
    
    if f"{ind}.jpg" in flist: 
        print(f"{ind}.jpg exists.", end = '.')
        status = 'saved'
    
    if url.startswith('http'):
        
        # save the file
        
        fpath = f"{path}/{ind}.jpg"

        if status != 'saved': 
            
            try:
                resp = requests.get(url, headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'})
                with open(fpath, 'wb') as fp:
                    fp.write(resp.content)

                status = 'saved'
                
            except:
                print(f'Failed to save. \n{sys.exc_info()[0]}')
                status = 'failed'
                return status

        try:
            # resizing image
            with Image.open(fpath) as img: 
                img.thumbnail((500, 500))
                img.save(fpath)
            
            status = 'complete'
            print('Complete')

        except: 
            print(f'Failed to resize. \n{sys.exc_info()[0]}')
            status = 'delete'

        


    else:
        status = 'failed'
    
    if pause: 
        pause = np.random.randint(1, 60)
        time.sleep(pause) # buffer
        
    return status

In [60]:
tmp = train_df.copy()

In [61]:
tmp['status'] = None

In [None]:
for ind, url in enumerate(tmp['url'][:30000]): 
    status = download_img(ind, url, path, flist = flist) 
    if status: 
        tmp['status'][ind] = status
    # save the log every 1000 runs
    if ind % 1000 == 0: 
        joblib.dump(tmp, '../TMP/tmp')

In [36]:
#tmp = joblib.load('../TMP/tmp')

In [87]:
# filtering out ones without images
tmp = tmp.dropna(subset = ['status'])

In [88]:
tmp.status.value_counts()

complete    26681
delete       2680
failed        639
Name: status, dtype: int64

Let's delete ones that did not get successfully resized.

In [92]:
for item in list(tmp[tmp.status == 'delete'].index):
    try:
        os.remove(f"{path}/{item}.jpg")
    except:
        print(f"{item}: {sys.exc_info()[0]}")

In [94]:
complete = tmp[tmp.status == 'complete']

In [96]:
# exporting file 
cc_reference = dict(zip(complete.index, complete.description))
joblib.dump(cc_reference, '../PKL/cc_reference')

['../PKL/cc_reference']