In [1]:
import cv2
import numpy as np
import pandas as pd
from scipy.stats import itemfreq
import os
from PIL import Image as IMG
from collections import defaultdict
import operator
from skimage import feature
import time
import gc
from colorthief import ColorThief

In [11]:
def get_dominant_color(img):
    path=images_dir+img 
    color_thief = ColorThief(path)
    dominant_color = color_thief.get_color(quality=3)
    return dominant_color

def get_dominant_color_plus(img):
    path=images_dir+img 
    img = cv2.imread(path)
    img=cv2.resize(img, (0,0), fx=0.20, fy=0.20,interpolation = cv2.INTER_AREA) 
    arr = np.float32(img)
    pixels = arr.reshape((-1, 3))

    n_colors = 5
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, .1)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)

    palette = np.uint8(centroids)
    quantized = palette[labels.flatten()]
    quantized = quantized.reshape(img.shape)

    dominant_color = palette[np.argmax(itemfreq(labels)[:, -1])]
    return dominant_color

In [2]:
def get_average_color(img):
    path=images_dir+img
    image = cv2.imread(path)
    avg_color_per_row = np.average(image, axis=0)
    avg_color = np.average(avg_color_per_row, axis=0)
    
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    fm = cv2.Laplacian(image, cv2.CV_64F).var()
    
    return (avg_color[0]/255,avg_color[1]/255,avg_color[2]/255,fm)

In [3]:
def color_analysis(img):
    # obtain the color palatte of the image 
    palatte = defaultdict(int)
    for pixel in img.getdata():
        palatte[pixel] += 1
    
    # sort the colors present in the image 
    sorted_x = sorted(palatte.items(), key=operator.itemgetter(1), reverse = True)
    light_shade, dark_shade, shade_count, pixel_limit = 0, 0, 0, 25
    for i, x in enumerate(sorted_x[:pixel_limit]):
        if all(xx <= 20 for xx in x[0][:3]): ## dull : too much darkness 
            dark_shade += x[1]
        if all(xx >= 240 for xx in x[0][:3]): ## bright : too much whiteness 
            light_shade += x[1]
        shade_count += x[1]
        
    light_percent = round((float(light_shade)/shade_count)*100, 2)
    dark_percent = round((float(dark_shade)/shade_count)*100, 2)
    return light_percent, dark_percent

def perform_color_analysis(img):
    path = images_dir + img 
    im = IMG.open(path) #.convert("RGB")
    
    # cut the images into two halves as complete average may give bias results
    size = im.size
    halves = (size[0]/2, size[1]/2)
    im1 = im.crop((0, 0, size[0], halves[1]))
    im2 = im.crop((0, halves[1], size[0], size[1]))

    
    light_percent1, dark_percent1 = color_analysis(im1)
    light_percent2, dark_percent2 = color_analysis(im2)
    
    

    light_percent = (light_percent1 + light_percent2)/2 
    dark_percent = (dark_percent1 + dark_percent2)/2 
    
    im_array = np.asarray(im.convert(mode='L'))
    edges_sigma1 = feature.canny(im_array, sigma=3)
    apw = (float(np.sum(edges_sigma1)) / (im.size[0]*im.size[1]))
    
    
    return (dark_percent,light_percent,apw*100,size[0],size[1])

In [4]:
def getSize(filename):
    filename = images_dir + filename
    st = os.stat(filename)
    return st.st_size

In [5]:
def get_blurrness_score(image):
    path =  images_dir + image 
    image = cv2.imread(path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    fm = cv2.Laplacian(image, cv2.CV_64F).var()
    return fm

In [6]:
folder='train_jpg_4'
images_dir='E:\\kaggle\\Avito Demand Prediction Challenge\\'+folder+'\\'
indexlist= os.listdir()
if 'index'+folder+'.csv' in indexlist:
    features=pd.read_csv('index'+folder+'.csv',header=0)
    print ('features exist!')
else:
    image_files = os.listdir(images_dir)
    features = pd.DataFrame(image_files, columns=['image'])
    features.to_csv('index'+folder+'.csv', index=False)   
    print ('features do not exist, creat one!' )

features exist!


In [None]:
splitnum=5
length=int(len(features)/splitnum)
binsize=[x*length for x in range(splitnum)]
binsize.append(len(features))
start_time = time.time()

for i in range(0,splitnum):
    count=0
    image=[]
    ave_r=[]
    ave_g=[]
    ave_b=[]
    blurr=[]
    dull=[]
    white=[]
    ave_pixel_wth=[]
    wth=[]
    hgt=[]
    size=[]
    dom_r=[]
    dom_g=[]
    dom_b=[]
    dom1_r=[]
    dom1_g=[]
    dom1_b=[]
    for n in range(binsize[i],binsize[i+1]):
        im=features.iloc[n]['image']
        image.append(im)
        
        average_red,average_green,average_blue,blurrness=get_average_color(im)
        ave_r.append(average_red)
        ave_g.append(average_green)
        ave_b.append(average_blue)
        blurr.append(blurrness)
        
        dullness,whiteness,average_pixel_width,width,height=perform_color_analysis(im)
        dull.append(dullness)
        white.append(whiteness)
        ave_pixel_wth.append(average_pixel_width)
        wth.append(width)
        hgt.append(height)
        
        image_size=getSize(im)
        size.append(image_size)
        
        red,green,blue=get_dominant_color(im)
        dom_r.append(red)
        dom_g.append(green)
        dom_b.append(blue)
        
        red,green,blue=get_dominant_color_plus(im)
        dom1_r.append(red)
        dom1_g.append(green)
        dom1_b.append(blue)
        
                
        if (count%1000)==0:
            print ('Folder is',i, 'progress is',int(count/1000),'k')
            elapsed_time = time.time() - start_time
            print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        count+=1
    
    filename=folder+'_%d.csv'%(i)
    result = {'image':image, 'ave_r': ave_r, 'ave_g': ave_g, 'ave_b':ave_b, 
              'dom_r':dom_r,'dom_g':dom_g, 'dom_b':dom_b,
              'dom1_r':dom_r,'dom1_g':dom_g, 'dom1_b':dom_b,
              'blurr':blurr, 'dull':dull, 'white':white, 
              'ave_pixel_wth':ave_pixel_wth, 'wth':wth, 'hgt':hgt, 'size':size}
    result = pd.DataFrame(data=result)
    col=['image','ave_r', 'ave_g', 'ave_b',
         'dom_r', 'dom_g', 'dom_b',
         'dom1_r', 'dom1_g', 'dom1_b',
         'blurr', 'dull', 'white', 
         'ave_pixel_wth', 'wth', 'hgt', 'size']
    result=result[col]
    result.to_csv(filename, index=False)    
    del result, ave_r, ave_g, ave_b, blurr, dull, white, ave_pixel_wth, wth, hgt, size, dom_r, 
    dom_g, dom_b, dom1_r, dom1_g, dom1_b, image
    gc.collect()
        
        

`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`


Folder is 0 progress is 0 k
00:00:00


In [7]:
#images_dir='E:\\kaggle\\Avito Demand Prediction Challenge\\image\\'
images_dir='E:\\kaggle\\Avito Demand Prediction Challenge\\train_jpg_0\\'
image_files = os.listdir(images_dir)
#image_files=image_files[:7]
features = pd.DataFrame(image_files, columns=['image'])
print(features)

                                                 image
0    00000a699c7995573766c5403c2428143e7f0c40b311da...
1    000012cc5c75ccd9dccc1e325d22edc9ec16b9926a0194...
2    000037a93d25a1c3f3066b33d667ed3c03d20a7159b32a...
3    00003f4d0f91fa03e947b568ef83e03295ac735f47a082...
4    00005eb46a94b7fbeaaec8f571bf6c09117329de5bf744...
5    00006bf12b2df373e4f918c4e5f39e8674b5620fd53711...
6    0000f99fa22d31d72c9e0de562879e36ed30d88bc22cbb...
7    000a355b9c6387892b72423587360c0f61faafcd694a59...
8    000a89bc439bf4bfe729cf95c26bd95656e4d8347d66b2...
9    000abf57cea73aab63fe1e7588398da2d80c164a31eef4...
10   000ae7eb0d3c4e4e344a15c5f550426057cbea0ad997ce...
11   000af87a0af53efe68abf88e86383205caf1a77c69924a...
12   000b47d7c13ea31a29571e23ade529cf1d0699a7a97179...
13   000b614c5a94d96d021732ec507f47c7c0d07fd3524672...
14   000b740db74624d188ea937028f9c5efb453bd0f0bc405...
15   000b878a30521a452aa603999142ad75f148afbc2b3c63...
16   000b97f1add5b134530fed0979fbb22786b4cf9d0cfa8a...
17   000bd

In [8]:
start_time = time.time()

features['average_color'] = features['image'].apply(get_average_color)
#features['average_red'] = features['average_color'].apply(lambda x: x[0])
#features['average_green'] = features['average_color'].apply(lambda x: x[1])
#features['average_blue'] = features['average_color'].apply(lambda x: x[2])
#features['blurrness'] = features['average_color'].apply(lambda x: x[3])
features[['average_red','average_green','average_blue','blurrness']] = pd.DataFrame(features.average_color.values.tolist(), 
                                                                                    index= features.index)
print('average_color finished!')

#features['dominant_color'] = features['image'].apply(get_dominant_color)
#features['dominant_red'] = features['dominant_color'].apply(lambda x: x[0]) / 255
#features['dominant_green'] = features['dominant_color'].apply(lambda x: x[1]) / 255
#features['dominant_blue'] = features['dominant_color'].apply(lambda x: x[2]) / 255
#print('dominant_color finished!')


features['color_analysis']=features['image'].apply(perform_color_analysis)
#features['dullness'] = features['color_analysis'].apply(lambda x : x[0])
#features['whiteness'] = features['color_analysis'].apply(lambda x : x[1])
#features['average_pixel_width']=features['color_analysis'].apply(lambda x : x[2])
#features['width'] = features['color_analysis'].apply(lambda x : x[3])
#features['height'] = features['color_analysis'].apply(lambda x : x[4])
features[['dullness','whiteness','average_pixel_width','width','height']] = pd.DataFrame(features.color_analysis.values.tolist(), 
                                                                                    index= features.index)
features = features.drop(['color_analysis','average_color'], axis=1)
print('color_analysis finished!')

features['image_size'] = features['image'].apply(getSize)
#features = features.drop(['temp_size','average_color','dominant_color'], axis=1)
print('size finished!')


elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

features.to_csv('imagefeature01.csv', index=False)
features

average_color finished!
color_analysis finished!
size finished!
00:01:15


Unnamed: 0,image,average_red,average_green,average_blue,blurrness,dullness,whiteness,average_pixel_width,width,height,image_size
0,00000a699c7995573766c5403c2428143e7f0c40b311da...,0.389782,0.376299,0.388127,181.965334,12.295,0.000,1.461227,288,480,26580
1,000012cc5c75ccd9dccc1e325d22edc9ec16b9926a0194...,0.435402,0.509505,0.563468,580.963294,22.910,9.160,1.827932,270,480,33147
2,000037a93d25a1c3f3066b33d667ed3c03d20a7159b32a...,0.446792,0.461440,0.455517,984.028037,0.000,0.000,3.920139,480,360,58595
3,00003f4d0f91fa03e947b568ef83e03295ac735f47a082...,0.495299,0.503907,0.475629,57.333156,0.000,0.000,0.817708,360,480,20041
4,00005eb46a94b7fbeaaec8f571bf6c09117329de5bf744...,0.440757,0.396534,0.291716,829.444957,0.000,6.950,3.243056,360,360,35086
5,00006bf12b2df373e4f918c4e5f39e8674b5620fd53711...,0.293706,0.373062,0.345208,10819.093185,46.175,1.660,4.102431,480,360,72299
6,0000f99fa22d31d72c9e0de562879e36ed30d88bc22cbb...,0.590418,0.616538,0.618698,344.739501,0.000,4.295,2.189994,360,362,34931
7,000a355b9c6387892b72423587360c0f61faafcd694a59...,0.444976,0.451058,0.440232,558.775942,46.200,43.360,2.817130,360,480,40937
8,000a89bc439bf4bfe729cf95c26bd95656e4d8347d66b2...,0.559711,0.596014,0.595554,1435.729149,0.000,0.000,5.381687,540,360,68378
9,000abf57cea73aab63fe1e7588398da2d80c164a31eef4...,0.374753,0.406857,0.413143,4147.993791,33.885,0.000,7.837064,360,417,64106


In [38]:
start_time = time.time()
features['dominant_color'] = features['image'].apply(get_dominant_color)
elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
features

`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`


00:00:35


Unnamed: 0,image,average_red,average_green,average_blue,blurrness,dullness,whiteness,average_pixel_width,width,height,image_size,dominant_color
0,00000a699c7995573766c5403c2428143e7f0c40b311da...,0.389782,0.376299,0.388127,181.965334,12.295,0.000,1.461227,288,480,26580,"[45, 29, 27]"
1,000012cc5c75ccd9dccc1e325d22edc9ec16b9926a0194...,0.435402,0.509505,0.563468,580.963294,22.910,9.160,1.827932,270,480,33147,"[100, 126, 147]"
2,000037a93d25a1c3f3066b33d667ed3c03d20a7159b32a...,0.446792,0.461440,0.455517,984.028037,0.000,0.000,3.920139,480,360,58595,"[89, 94, 91]"
3,00003f4d0f91fa03e947b568ef83e03295ac735f47a082...,0.495299,0.503907,0.475629,57.333156,0.000,0.000,0.817708,360,480,20041,"[150, 161, 162]"
4,00005eb46a94b7fbeaaec8f571bf6c09117329de5bf744...,0.440757,0.396534,0.291716,829.444957,0.000,6.950,3.243056,360,360,35086,"[124, 118, 79]"
5,00006bf12b2df373e4f918c4e5f39e8674b5620fd53711...,0.293706,0.373062,0.345208,10819.093185,46.175,1.660,4.102431,480,360,72299,"[128, 149, 139]"
6,0000f99fa22d31d72c9e0de562879e36ed30d88bc22cbb...,0.590418,0.616538,0.618698,344.739501,0.000,4.295,2.189994,360,362,34931,"[140, 152, 152]"
7,000a355b9c6387892b72423587360c0f61faafcd694a59...,0.444976,0.451058,0.440232,558.775942,46.200,43.360,2.817130,360,480,40937,"[80, 84, 85]"
8,000a89bc439bf4bfe729cf95c26bd95656e4d8347d66b2...,0.559711,0.596014,0.595554,1435.729149,0.000,0.000,5.381687,540,360,68378,"[252, 228, 202]"
9,000abf57cea73aab63fe1e7588398da2d80c164a31eef4...,0.374753,0.406857,0.413143,4147.993791,33.885,0.000,7.837064,360,417,64106,"[27, 32, 34]"


In [25]:
start_time = time.time()
features['dominant_color'] = features['image'].apply(get_dominant_color)
elapsed_time = time.time() - start_time
print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
features


`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`


00:01:00


Unnamed: 0,image,average_red,average_green,average_blue,blurrness,dullness,whiteness,average_pixel_width,width,height,image_size,dominant_color
0,00000a699c7995573766c5403c2428143e7f0c40b311da...,0.389782,0.376299,0.388127,181.965334,12.295,0.000,1.461227,288,480,26580,"[45, 29, 27]"
1,000012cc5c75ccd9dccc1e325d22edc9ec16b9926a0194...,0.435402,0.509505,0.563468,580.963294,22.910,9.160,1.827932,270,480,33147,"[99, 126, 147]"
2,000037a93d25a1c3f3066b33d667ed3c03d20a7159b32a...,0.446792,0.461440,0.455517,984.028037,0.000,0.000,3.920139,480,360,58595,"[52, 53, 51]"
3,00003f4d0f91fa03e947b568ef83e03295ac735f47a082...,0.495299,0.503907,0.475629,57.333156,0.000,0.000,0.817708,360,480,20041,"[150, 161, 162]"
4,00005eb46a94b7fbeaaec8f571bf6c09117329de5bf744...,0.440757,0.396534,0.291716,829.444957,0.000,6.950,3.243056,360,360,35086,"[123, 117, 77]"
5,00006bf12b2df373e4f918c4e5f39e8674b5620fd53711...,0.293706,0.373062,0.345208,10819.093185,46.175,1.660,4.102431,480,360,72299,"[14, 29, 22]"
6,0000f99fa22d31d72c9e0de562879e36ed30d88bc22cbb...,0.590418,0.616538,0.618698,344.739501,0.000,4.295,2.189994,360,362,34931,"[143, 155, 155]"
7,000a355b9c6387892b72423587360c0f61faafcd694a59...,0.444976,0.451058,0.440232,558.775942,46.200,43.360,2.817130,360,480,40937,"[80, 84, 86]"
8,000a89bc439bf4bfe729cf95c26bd95656e4d8347d66b2...,0.559711,0.596014,0.595554,1435.729149,0.000,0.000,5.381687,540,360,68378,"[253, 229, 203]"
9,000abf57cea73aab63fe1e7588398da2d80c164a31eef4...,0.374753,0.406857,0.413143,4147.993791,33.885,0.000,7.837064,360,417,64106,"[24, 29, 32]"


In [33]:
color_thief = ColorThief(images_dir+image_files[0])
# get the dominant color
dominant_color = color_thief.get_color(quality=1)
# build a color palette
palette = color_thief.get_palette(color_count=6)

In [18]:
def get_dominant_color(img):
    path=images_dir+img 
    color_thief = ColorThief(path)
    dominant_color = color_thief.get_color(quality=3)
    return dominant_color

In [36]:
features['dominant_color'] = features['image'].apply(get_dominant_color)
features

Unnamed: 0,image,dominant_color
0,00000a699c7995573766c5403c2428143e7f0c40b311da...,"(150, 162, 165)"
1,000012cc5c75ccd9dccc1e325d22edc9ec16b9926a0194...,"(146, 128, 104)"
2,000037a93d25a1c3f3066b33d667ed3c03d20a7159b32a...,"(82, 84, 81)"
3,00003f4d0f91fa03e947b568ef83e03295ac735f47a082...,"(159, 160, 149)"
4,00005eb46a94b7fbeaaec8f571bf6c09117329de5bf744...,"(57, 86, 101)"
5,00006bf12b2df373e4f918c4e5f39e8674b5620fd53711...,"(56, 63, 43)"
6,0000f99fa22d31d72c9e0de562879e36ed30d88bc22cbb...,"(191, 192, 182)"


In [36]:
def get_dominant_color(img):
    path=images_dir+img 
    img = cv2.imread(path)
    img=cv2.resize(img, (0,0), fx=0.20, fy=0.20,interpolation = cv2.INTER_AREA) 
    arr = np.float32(img)
    pixels = arr.reshape((-1, 3))

    n_colors = 5
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, .1)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)

    palette = np.uint8(centroids)
    quantized = palette[labels.flatten()]
    quantized = quantized.reshape(img.shape)

    dominant_color = palette[np.argmax(itemfreq(labels)[:, -1])]
    return dominant_color

In [7]:
perform_color_analysis('393911b437b1cb941e80a7c33950b2266ecf8128233db2d6e3b263399f83b5ef.jpg')

(11.47, 0.0, 4.539351851851852, 360, 480)

In [8]:
get_average_color('393911b437b1cb941e80a7c33950b2266ecf8128233db2d6e3b263399f83b5ef.jpg')

(0.40310584604212035,
 0.40755548747276676,
 0.41906147875816974,
 1535.2923322600188)