In [None]:
import cv2
import numpy as np
import pandas as pd
from scipy.stats import itemfreq
import os
from PIL import Image as IMG
from collections import defaultdict
import operator
from skimage import feature
import time
import gc
from colorthief import ColorThief

def get_dominant_color(img):
    """Return the dominant colour ColorThief reports for one image.

    Parameters
    ----------
    img : str
        File name of the image, relative to the module-level ``images_dir``.

    Returns
    -------
    Whatever ``ColorThief.get_color(quality=3)`` returns for the file
    (presumably an ``(r, g, b)`` tuple -- confirm against colorthief docs).
    """
    thief = ColorThief(images_dir + img)
    return thief.get_color(quality=3)

def get_dominant_color_plus(img):
    """Return the dominant colour of an image using k-means clustering.

    The image is read with OpenCV, shrunk to a quarter of its size in each
    dimension, and its pixels are clustered into five colours. The centroid
    of the cluster that owns the most pixels is returned.

    Parameters
    ----------
    img : str
        File name of the image, relative to the module-level ``images_dir``.

    Returns
    -------
    numpy.ndarray
        Three ``uint8`` channel values of the most populated cluster centre,
        in the channel order produced by ``cv2.imread``.

    Raises
    ------
    IOError
        If OpenCV cannot read the file.
    """
    path = images_dir + img
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising;
        # fail here with a readable message instead of deep inside resize().
        raise IOError('Cannot read image file: ' + path)

    # Down-sample first so k-means runs on 1/16th of the pixels.
    image = cv2.resize(image, (0, 0), fx=0.25, fy=0.25,
                       interpolation=cv2.INTER_AREA)
    pixels = np.float32(image).reshape((-1, 3))

    n_colors = 5
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, .1)
    flags = cv2.KMEANS_RANDOM_CENTERS
    _, labels, centroids = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)

    palette = np.uint8(centroids)

    # Count pixels per cluster label. bincount with minlength keeps position k
    # tied to label k even when a cluster ends up empty, so the argmax can be
    # used directly as a palette index. (It also replaces scipy's removed
    # itemfreq helper.)
    counts = np.bincount(labels.flatten(), minlength=n_colors)
    return palette[np.argmax(counts)]

def get_average_color(img):
    """Compute per-channel mean colour and a sharpness score for one image.

    Parameters
    ----------
    img : str
        File name of the image, relative to the module-level ``images_dir``.

    Returns
    -------
    tuple
        ``(c0, c1, c2, fm)`` where ``c0..c2`` are the channel means divided by
        255, in the channel order delivered by ``cv2.imread`` (the code treats
        the array as BGR when converting to grey), and ``fm`` is the variance
        of the Laplacian of the grey-scale image.
    """
    bgr = cv2.imread(images_dir + img)

    # Collapse the first axis, then the second, leaving one mean per channel.
    channel_means = np.average(np.average(bgr, axis=0), axis=0)

    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    laplacian_variance = cv2.Laplacian(gray, cv2.CV_64F).var()

    first, second, third = (channel_means[k] / 255 for k in range(3))
    return (first, second, third, laplacian_variance)

def color_analysis(img):
    """Measure how much of an image's most common colours are very light or very dark.

    Only the 25 most frequent pixel values are considered. A value counts as
    dark when each of its first three channels is <= 20, and as light when
    each of them is >= 240.

    Parameters
    ----------
    img : object exposing ``getdata()``
        ``getdata()`` must yield hashable, sliceable pixel values
        (e.g. RGB/RGBA tuples).

    Returns
    -------
    tuple of float
        ``(light_percent, dark_percent)``, each rounded to two decimals and
        expressed relative to the pixel count of the 25 most common values.
    """
    # Histogram of pixel values.
    histogram = defaultdict(int)
    for px in img.getdata():
        histogram[px] += 1

    # Most frequent values first; keep only the head of the ranking.
    ranked = sorted(histogram.items(), key=operator.itemgetter(1), reverse=True)
    top_colors = ranked[:25]

    total = sum(n for _, n in top_colors)
    dark_total = sum(n for color, n in top_colors
                     if all(c <= 20 for c in color[:3]))    # dull: too much darkness
    light_total = sum(n for color, n in top_colors
                      if all(c >= 240 for c in color[:3]))  # bright: too much whiteness

    light_percent = round((float(light_total) / total) * 100, 2)
    dark_percent = round((float(dark_total) / total) * 100, 2)
    return light_percent, dark_percent

def perform_color_analysis(img):
    """Compute dullness, whiteness, edge density and dimensions for one image.

    Parameters
    ----------
    img : str
        File name of the image, relative to the module-level ``images_dir``.

    Returns
    -------
    tuple
        ``(dark_percent, light_percent, edge_percent, size0, size1)`` where

        * ``dark_percent`` / ``light_percent`` are the averages of
          ``color_analysis`` over the top and bottom halves of the image, or
          ``None`` for both when that analysis fails;
        * ``edge_percent`` is the share of pixels (times 100) flagged by a
          Canny edge detector with ``sigma=3`` on the grey-scale image;
        * ``size0`` / ``size1`` are the two entries of PIL's ``im.size``.
    """
    path = images_dir + img

    # Use the image as a context manager so the underlying file is released
    # after every call; this function runs once per image over large folders.
    with IMG.open(path) as im:
        size = im.size

        # Analyse the two halves separately, as a single whole-image average
        # may give biased results.
        half_height = size[1] / 2
        top_half = im.crop((0, 0, size[0], half_height))
        bottom_half = im.crop((0, half_height, size[0], size[1]))

        try:
            light_percent1, dark_percent1 = color_analysis(top_half)
            light_percent2, dark_percent2 = color_analysis(bottom_half)
            light_percent = (light_percent1 + light_percent2) / 2
            dark_percent = (dark_percent1 + dark_percent2) / 2
        except Exception:
            # Best effort: a failed colour analysis yields missing values but
            # must not abort the remaining features. Catch Exception rather
            # than everything so Ctrl-C / SystemExit still propagate.
            light_percent = None
            dark_percent = None

        im_array = np.asarray(im.convert(mode='L'))

    edges_sigma1 = feature.canny(im_array, sigma=3)
    apw = float(np.sum(edges_sigma1)) / (size[0] * size[1])

    return (dark_percent, light_percent, apw * 100, size[0], size[1])

def getSize(filename):
    """Return the size in bytes of ``filename`` inside ``images_dir``."""
    return os.stat(images_dir + filename).st_size

def get_blurrness_score(image):
    """Return the variance of the Laplacian of an image's grey-scale version.

    Parameters
    ----------
    image : str
        File name of the image, relative to the module-level ``images_dir``.
    """
    bgr = cv2.imread(images_dir + image)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    return cv2.Laplacian(gray, cv2.CV_64F).var()


# Folder to process and the directory that holds its images.
folder='test_jpg' ###### folder name
images_dir='E:\\kaggle\\Avito Demand Prediction Challenge\\'+folder+'\\'

# The list of image file names is cached in a CSV in the working directory so
# that every run iterates over the same index; build it on first use.
index_csv='index'+folder+'.csv'
indexlist= os.listdir()
if index_csv not in indexlist:
    image_files = os.listdir(images_dir)
    features = pd.DataFrame(image_files, columns=['image'])
    features.to_csv(index_csv, index=False)
    print ('features do not exist, creat one!' )
else:
    features=pd.read_csv(index_csv,header=0)
    print ('features exist!')

# Cut the index into `splitnum` consecutive blocks: binsize[k] is the first
# row of block k, and the final entry is the total row count so the last
# block absorbs any remainder.
splitnum=5
length=len(features)//splitnum
binsize=[length*k for k in range(splitnum)]+[len(features)]

# Reference point for the elapsed-time progress messages.
start_time = time.time()

# Process the image index block by block. Each block's features are written
# to their own CSV and the per-block lists are freed before the next block.
# Only average colour / Laplacian variance, file size and the colour analysis
# are computed here; the dominant-colour helpers are not called.
for i in range(0,splitnum):
    count=0
    indexs=[]
    size=[]
    average_colors=[]
    color_analysiss=[]

    for n in range(binsize[i],binsize[i+1]):
        im=features.iloc[n]['image']
        indexs.append(im)

        # Compute first, append afterwards: every image must contribute
        # exactly one entry to each list, otherwise the columns go out of
        # step and the DataFrame below cannot be built.
        try:
            average_color=get_average_color(im)
            image_size=getSize(im)
        except Exception:
            average_color=(None,None,None,None)
            image_size=None
            print ('1. Cannot identify image file',im)
        average_colors.append(average_color)
        size.append(image_size)

        try:
            analysis_color=perform_color_analysis(im)
        except Exception:
            analysis_color=(None,None,None,None,None)
            print ('2. Cannot identify image file',im)
        color_analysiss.append(analysis_color)

        # Progress report every 1000 images of the current block.
        if (count%1000)==0:
            print ('Folder is',folder, ', block is',i, ', progress is',int(count/1000),'k')
            elapsed_time = time.time() - start_time
            print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
        count+=1

    filename=folder+'_im_%d.csv'%(i)
    result = {'indexs':indexs, 'average_colors': average_colors, 'color_analysiss': color_analysiss,'size':size}
    result = pd.DataFrame(data=result)
    col=['indexs','average_colors', 'color_analysiss','size']
    result=result[col]
    result.to_csv(filename, index=False)

    # Release this block's data before starting the next one.
    del result, indexs, average_colors, color_analysiss, size
    gc.collect()

features exist!
Folder is test_jpg , block is 0 , progress is 0 k
00:00:00
Folder is test_jpg , block is 0 , progress is 0 k
00:00:10
Folder is test_jpg , block is 0 , progress is 0 k
00:00:20
Folder is test_jpg , block is 0 , progress is 0 k
00:00:30
Folder is test_jpg , block is 0 , progress is 0 k
00:00:40
Folder is test_jpg , block is 0 , progress is 0 k
00:00:50
Folder is test_jpg , block is 0 , progress is 0 k
00:01:00
Folder is test_jpg , block is 0 , progress is 0 k
00:01:10
Folder is test_jpg , block is 0 , progress is 0 k
00:01:20
Folder is test_jpg , block is 0 , progress is 0 k
00:01:29


In [None]:
# Spot-check: run get_average_color on a single image file name.
get_average_color('00242d755aa03c8982829193d5045d006cff7a5ff2f7963ff0430d085f80918c.jpg')

(0, 0)