In [1]:
import struct
import binascii
import requests
import scipy

import scipy.misc
import scipy.cluster

import numpy as np
import pandas as pd

from PIL import Image, ImageChops
from io import BytesIO
from PIL import Image

### **1. Read in the link csv**

In [2]:
pwd

'/Users/rosiebabbra/Desktop/side-projects/hallmart-card-color-analysis/viz'

In [2]:
df = pd.read_csv('../csvs/hallmark_image_s3_urls.csv')

### **2. Get the most common color in each card**

In [3]:
def get_pil_img(url):
    
    """ Returns the image from the URL in bytes form """
    
    response = requests.get(url)
    im = Image.open(BytesIO(response.content))
    
    return im

### **3. Get rid of whitespace in all cards**

In [5]:
def get_rid_of_whitespace(im):
    
    """ Gets rid of any white in an image, returns the image """
    
#     im = get_pil_img(url)

    bg = Image.new(im.mode, im.size, im.getpixel((0,0)))
    diff = ImageChops.difference(im, bg)
    diff = ImageChops.add(diff, diff, 2.0, -100)
    bbox = diff.getbbox()
    
    if bbox:
        return im.crop(bbox)

In [None]:
im = get_pil_img(df['s3_link'][39])
get_rid_of_whitespace(im)

Check that white space was removed...

In [None]:
get_pil_img(df['s3_link'][3]).size

In [None]:
get_rid_of_whitespace(im).size

### **4. Convert de-whitespaced photo to JPEG to feed into color extractor**

In [6]:
def convert_to_jpeg_img_plug_in(removed_white_space_img):   

#     img = get_rid_of_whitespace(url)
    img2 = removed_white_space_img.crop((1,20,50,80))

    b = BytesIO()
    im.save(b, format="jpeg")
    img3 = Image.open(b)
    
    return img3

In [None]:
convert_to_jpeg_img_plug_in(get_rid_of_whitespace(get_pil_img(df['s3_link'][39])))

### **5. Get colors from images**

In [7]:
from __future__ import print_function

def get_color_codes(im):
    
#     pil_img = get_pil_img(url)
#     white_space_removed_im = get_rid_of_whitespace(pil_img)
#     im = convert_to_jpeg_img_plug_in(white_space_removed_im)

    NUM_CLUSTERS = 5

#     print('reading image')
    im = im.resize((150, 150))      # optional, to reduce time
    ar = np.asarray(im)
    shape = ar.shape
    ar = ar.reshape(scipy.product(shape[:2]), shape[2]).astype(float)

#     print('finding clusters')
    codes, dist = scipy.cluster.vq.kmeans(ar, NUM_CLUSTERS)
#     print('cluster centres:\n', codes)

    vecs, dist = scipy.cluster.vq.vq(ar, codes)         # assign codes
    counts, bins = scipy.histogram(vecs, len(codes))    # count occurrences

    index_max = scipy.argmax(counts)                    # find most frequent
    peak = codes[index_max]
    colour = binascii.hexlify(bytearray(int(c) for c in peak)).decode('ascii')
#     print('most frequent is %s (#%s)' % (peak, colour))
    
#     data = {}
#     data[url] = {'hex': colour, 'rgb': peak}
    
    for x in peak:
        if x > 225:
            return codes[index_max - 1]
        else:
            return codes[index_max]

### **6. Run**

In [None]:
# D3: Filter to get the hex codes for the category, sort by them

In [8]:
def main(url):
    
    try:
        pil_image = get_pil_img(url)
#         im = get_rid_of_whitespace(pil_image)
#         converted_im = convert_to_jpeg_img_plug_in(im)

        color_code_dict = get_color_codes(pil_image)
    #     df = pd.DataFrame.from_dict(test_dict, orient='index')

        return color_code_dict
    
    except:
        pass

In [9]:
%time df['rgb'] = df['s3_link'].apply(main)

CPU times: user 13min 42s, sys: 14.6 s, total: 13min 57s
Wall time: 28min 44s


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4358 entries, 0 to 4357
Data columns (total 9 columns):
Unnamed: 0        4358 non-null int64
Unnamed: 0.1      4358 non-null int64
Unnamed: 0.1.1    4358 non-null int64
imageLink         4358 non-null object
category          4358 non-null object
cardName          4358 non-null object
s3_file_name      4358 non-null object
s3_link           4358 non-null object
rgb               4358 non-null object
dtypes: int64(3), object(6)
memory usage: 306.5+ KB


In [16]:
df.to_csv('hallmark_card_rgb_codes.csv')

In [20]:
df['rgb'][0]

array([234.01605314, 204.78272903,  73.98200941])

In [21]:
234+204+73

511

In [23]:
df['rgb_sum'] = df['rgb'].apply(lambda x: sum(x))

In [34]:
df_sorted = df.sort_values(by='rgb_sum', ascending=False).reset_index()

In [39]:
! pip install plotly --upgrade

Collecting plotly
[?25l  Downloading https://files.pythonhosted.org/packages/ff/75/3982bac5076d0ce6d23103c03840fcaec90c533409f9d82c19f54512a38a/plotly-3.10.0-py2.py3-none-any.whl (41.5MB)
[K    100% |████████████████████████████████| 41.5MB 92kB/s 
Collecting retrying>=1.3.3 (from plotly)
  Downloading https://files.pythonhosted.org/packages/44/ef/beae4b4ef80902f22e3af073397f079c96969c69b2c7d52a57ea9ae61c9d/retrying-1.3.3.tar.gz
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/rosiebabbra/Library/Caches/pip/wheels/d7/a9/33/acc7b709e2a35caa7d4cae442f6fe6fbf2c43f80823d46460c
Successfully built retrying
[31mError checking for conflicts.
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/pip/_vendor/pkg_resources/__init__.py", line 2897, in _dep_map
    return self.__dep_map
  File "/anaconda3/lib/python3.6/site-packages/pip/_vendor/pkg_resources/__init__.py", line 

In [59]:
#final_df = df_sorted.loc[:, df_sorted.columns != 'rgb_sum']
final_df.to_csv('hallmark_card_hex_codes_sorted.csv')