# main script

In [1]:
# import the necessary packages
from sklearn.cluster import KMeans
from collections import Counter
import pandas as pd
import numpy as np
import urllib
import cv2
import time


In [2]:
# function to replace char in a string
def clean_sw(string, x,y):
    try:
        x = str(string).replace(x,y)
    except:
        x = ""
    return x

In [3]:
# METHOD #1: OpenCV, NumPy, and urllib
def url_to_image(url):
    # download the image, convert it to a NumPy array, and then read
    # it into OpenCV format
    resp = urllib.request.urlopen(url)
    image = np.asarray(bytearray(resp.read()), dtype="uint8")
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)

    # return the image
    return image

In [4]:
# METHOD #2: get RGB
def get_dominant_color(image, k=10, image_processing_size = None):
    """
    takes an image as input
    returns the dominant color of the image as a list
    
    dominant color is found by running k means on the 
    pixels & returning the centroid of the largest cluster

    processing time is sped up by working with a smaller image; 
    this resizing can be done with the image_processing_size param 
    which takes a tuple of image dims as input

    >>> get_dominant_color(my_image, k=4, image_processing_size = (25, 25))
    [56.2423442, 34.0834233, 70.1234123]
    """
    #resize image if new dims provided
    if image_processing_size is not None:
        image = cv2.resize(image, image_processing_size, 
                            interpolation = cv2.INTER_AREA)
    
    #reshape the image to be a list of pixels
    image = image.reshape((image.shape[0] * image.shape[1], 3))

    #cluster and assign labels to the pixels 
    clt = KMeans(n_clusters = k)
    labels = clt.fit_predict(image)

    #count labels to find most popular
    label_counts = Counter(labels)

    #subset out most popular centroid
    dominant_color = clt.cluster_centers_[label_counts.most_common(1)[0][0]]

    return list(dominant_color)

In [5]:
# Load DataFrame from CSV
# df = pd.read_csv("data/df_sample_32530.csv", index_col=0)
df1 = pd.read_csv('data_2.csv', sep=';', index_col=0)
# df2 = pd.read_csv('data_2.csv', sep=';', index_col=0)
# df3 = pd.read_csv('data_3.csv', sep=';', index_col=0)

# Data Cleansing: replace ';' -> '|'
df1['product_name'] = np.vectorize(clean_sw)(df1['product_name'], ';',"|")
df1['cat'] = np.vectorize(clean_sw)(df1['cat'], ';',"|")

In [6]:
# Iterate every row to get color dominant (RGB)
count = 0
time_start = time.time()
for index, row in df1.iterrows():
    if row['r'] == 0 and row['g'] == 0 and row['b'] == 0:
        url = row['image_url']
        try:
            img = url_to_image(url)
            if img is not None:
                try:
                    img_resized = cv2.resize(img,(32,32))
                    r,g,b = get_dominant_color(img_resized, k=5, image_processing_size = None)
                    df1.loc[index,'r'] = r
                    df1.loc[index,'g'] = g
                    df1.loc[index,'b'] = b
                except:
                    df1.loc[index,'r'] = 0
                    df1.loc[index,'g'] = 0
                    df1.loc[index,'b'] = 0
            else:
                pass
        except:
            print("Error:", url)
    else:
        pass
    count += 1
    runtime = time.time() - time_start
    # Debugging for every 200 rows
    if (count == 1):
        print('started at ' + str(runtime))
    if (count % 200) == 0:
        print ('Data ke:' + str(count)+ ". runtime " + str(runtime))
#     if (count > 6600):
#         print(str(count), row['product_id'])
# img_size 32x32 is about 0.2 s - 0.3 s for every single load 
# img_size 100x100 is about 0.5 s - 0.6 s for every single load


started at 0.5749845504760742
Data ke:200. runtime 72.37220048904419
Data ke:400. runtime 136.45870089530945


  return_n_iter=True)


Data ke:600. runtime 207.46075010299683
Data ke:800. runtime 276.45070242881775
Data ke:1000. runtime 352.6776123046875
Data ke:1200. runtime 417.8917443752289
Data ke:1400. runtime 489.3996489048004
Data ke:1600. runtime 568.1075258255005
Data ke:1800. runtime 646.0466272830963
Data ke:2000. runtime 719.7183711528778


  return_n_iter=True)


Data ke:2200. runtime 785.606511592865
Data ke:2400. runtime 854.6189222335815
Data ke:2600. runtime 927.1413764953613
Data ke:2800. runtime 999.9766328334808
Data ke:3000. runtime 1075.9875102043152
Data ke:3200. runtime 1146.3213346004486
Data ke:3400. runtime 1224.5922544002533
Data ke:3600. runtime 1299.541249036789
Data ke:3800. runtime 1382.5418741703033
Data ke:4000. runtime 1468.8346824645996
Data ke:4200. runtime 1534.0570273399353
Data ke:4400. runtime 1603.2579820156097


  return_n_iter=True)


Data ke:4600. runtime 1697.281596660614
Data ke:4800. runtime 1767.3648626804352
Data ke:5000. runtime 1841.7494111061096
Data ke:5200. runtime 1920.069585800171
Data ke:5400. runtime 2004.506386756897
Data ke:5600. runtime 2081.8876638412476
Data ke:5800. runtime 2156.387681722641
Data ke:6000. runtime 2232.163448572159
Data ke:6200. runtime 2320.482969045639
Data ke:6400. runtime 2396.807921886444
Data ke:6600. runtime 2497.887033224106
Data ke:6800. runtime 2571.71791100502
Data ke:7000. runtime 2662.6931154727936
Data ke:7200. runtime 2741.5759382247925
Data ke:7400. runtime 2824.8430802822113
Data ke:7600. runtime 2914.8828134536743
Data ke:7800. runtime 2989.6788442134857
Data ke:8000. runtime 3073.7715492248535
Data ke:8200. runtime 3161.1583523750305
Error: https://ecs7.tokopedia.net/img/product-1/2014/8/24/189238/189238_111bdcba-2b3f-11e4-9f32-7ba04908a8c2.jpg
Data ke:8400. runtime 3250.347081184387
Data ke:8600. runtime 3320.5810132026672
Data ke:8800. runtime 3402.9351720809

  return_n_iter=True)


Data ke:11600. runtime 4252.405158281326
Data ke:11800. runtime 4307.4409284591675
Data ke:12000. runtime 4369.218811511993
Data ke:12200. runtime 4409.501935005188


In [8]:
# Convert float to int
df1['r'] = df1['r'].astype(int)
df1['g'] = df1['g'].astype(int)
df1['b'] = df1['b'].astype(int)

In [9]:
# Export DataFrame to CSV
df1.to_csv("df_rgb_32x32_2.csv", sep=';')

# for testing

In [116]:
link_url = "https://ecs7.tokopedia.net/img/product-1/2016/11/10/10901253/10901253_23463d23-d216-4717-8086-b6314241e232.jpg"
resize_to = (100,100)
img = url_to_image(link_url)
b = cv2.resize(img,resize_to)
r,g,b = get_dominant_color(b, k=5, image_processing_size = None)
# print(resize_to, r, g, b)

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [117]:
resize_to, r, g, b

((100, 100), 123.0, 133.0, 138.5)

# scratching

In [111]:
r,g,b # 5

(123.0, 133.0, 138.5)

In [102]:
r,g,b # 10

(113.03571428571428, 119.35714285714286, 136.35714285714286)

In [100]:
r,g,b # 64

(125.54145854145854, 133.42157842157843, 153.83816183816185)

In [98]:
r,g,b # 25

(127.52287581699345, 135.50980392156862, 154.08496732026146)

In [96]:
r,g,b # 50

(119.23547400611616, 127.64373088685018, 146.72324159021403)

In [93]:
r,g,b # 32

(91.66423357664235, 96.65693430656934, 108.10583941605839)

In [91]:
r,g,b #100

(122.22102425876018, 130.32306507508656, 150.09279938390506)

In [5]:
c = cv2.resize(b,(1000,1000))

In [7]:
df = pd.read_csv("data/df_sample_32530.csv", index_col=0)

In [60]:
count = 0
time_start = time.time()
for index, row in df.iterrows():
    url = row['image_url']
    img = url_to_image(url)
    if img is not None:
        img_resized = cv2.resize(img,(100,100))
        r,g,b = get_dominant_color(img_resized, k=5, image_processing_size = None)
        df.loc[index,'r'] = r
        df.loc[index,'g'] = g
        df.loc[index,'b'] = b
        count += 1
        runtime = time.time() - time_start
        if count == 1:
            print('started at ' + str(runtime))
        if count % 200 == 0:
            print ('Data ke:' + str(count)+ ". runtime " + str(runtime))
    else:
        df.loc[index,'r'] = 0
        df.loc[index,'g'] = 0
        df.loc[index,'b'] = 0

#     if count > 500:
#         break
#         
#     row['new'] = "link " + url
#     print(url)

started at 0.30394840240478516
Data ke:200. runtime 98.23565435409546
Data ke:400. runtime 200.19902658462524


  return_n_iter=True)


Data ke:600. runtime 306.3606150150299
Data ke:800. runtime 416.8569676876068
Data ke:1000. runtime 521.2771518230438
Data ke:1200. runtime 627.2788045406342
Data ke:1400. runtime 732.4171130657196
Data ke:1600. runtime 835.8000919818878
Data ke:1800. runtime 936.4156267642975
Data ke:2000. runtime 1039.6819944381714
Data ke:2200. runtime 1145.2895891666412
Data ke:2400. runtime 1251.887137413025
Data ke:2600. runtime 1351.9467375278473
Data ke:2800. runtime 1462.5375728607178
Data ke:3000. runtime 1568.747395515442
Data ke:3200. runtime 1674.3159413337708
Data ke:3400. runtime 1782.767166852951
Data ke:3600. runtime 1893.4202914237976
Data ke:3800. runtime 1998.7704083919525


  return_n_iter=True)


Data ke:4000. runtime 2108.1556515693665
Data ke:4200. runtime 2208.77726149559
Data ke:4400. runtime 2308.565583229065
Data ke:4600. runtime 2409.0203201770782
Data ke:4800. runtime 2509.840015888214
Data ke:5000. runtime 2611.14306139946
Data ke:5200. runtime 2718.7993206977844
Data ke:5400. runtime 2817.358578681946
Data ke:5600. runtime 2919.696995973587
Data ke:5800. runtime 3026.599869251251


HTTPError: HTTP Error 403: Forbidden

In [61]:
df_ = df.copy()

In [77]:
df.head()

Unnamed: 0,product_id,product_name,create_time,child_cat_id,cat,product_pic_id,image_url,len,r,g,b
35891,8342220,Tas Sekolah Anak Ransel ZR 005,2015-01-22 10:25:10.832840,1973,"[(78, Fashion Anak), (1973, Tas Anak), (0, )]",13366912,https://ecs7.tokopedia.net/img/product-1/2015/...,105,254,254,254
93741,34766098,sepatu kets nike airmax 90 women ( l ) termurah,2016-03-02 17:28:14.528214,90,0,60062805,https://ecs7.tokopedia.net/img/product-1/2016/...,106,64,88,95
213886,175101431,Natural Star Suntone 6.88 cts,2017-04-21 18:33:23.725939,1234,0,352749549,https://ecs7.tokopedia.net/img/product-1/2017/...,105,107,144,209
671714,306627274,TRAVEL BAG KULIT (WATERPROOF) Travel Bag Kulit...,2018-07-16 20:59:15.553273,1678,0,680560208,https://ecs7.tokopedia.net/img/product-1/2018/...,115,254,254,254
442833,426414967,Mesin Bor Bosch 6mm GBM 6 Terlaris,2019-03-14 15:03:02.008698,174,0,1030560424,https://ecs7.tokopedia.net/img/product-1/2019/...,118,179,190,199


In [82]:
df['product_name'] = np.vectorize(clean_sw)(df['product_name'], ';',"|")
df['cat'] = np.vectorize(clean_sw)(df['cat'], ';',"|")

In [80]:
df = pd.read_csv('data/df_rgb_5000an_32_32.csv',sep=';', index_col=0)

In [63]:
df.to_csv('data/df_rgb_5000an_100_100.csv', sep=';')

In [85]:
df = df.fillna(0)

In [87]:
df['r'] = df['r'].astype(int)
df['g'] = df['g'].astype(int)
df['b'] = df['b'].astype(int)

In [89]:
df.to_csv("data/df_rgb_32x32.csv", sep=';')

In [21]:
x = time.time()

In [56]:
time.time() - x

21.976031064987183

In [17]:
2189180 % 10

0

In [14]:
df_head

Unnamed: 0.1,Unnamed: 0,product_id,product_name,create_time,child_cat_id,cat,product_pic_id,image_url,new
0,0,252734311,Kemeja Tunik Anak cewek,2018-01-18 12:01:16.827911,151,"[(78, Fashion Anak), (82, Pakaian Anak Perempu...",1027145252,https://ecs7.tokopedia.net/img/product-1/2019/...,link https://ecs7.tokopedia.net/img/product-1/...
1,1,252734311,Kemeja Tunik Anak cewek,2018-01-18 12:01:16.827911,151,"[(78, Fashion Anak), (82, Pakaian Anak Perempu...",1027145251,https://ecs7.tokopedia.net/img/product-1/2019/...,link https://ecs7.tokopedia.net/img/product-1/...
2,2,252734311,Kemeja Tunik Anak cewek,2018-01-18 12:01:16.827911,151,"[(78, Fashion Anak), (82, Pakaian Anak Perempu...",537467002,https://ecs7.tokopedia.net/img/product-1/2018/...,link https://ecs7.tokopedia.net/img/product-1/...
3,3,252734311,Kemeja Tunik Anak cewek,2018-01-18 12:01:16.827911,151,"[(78, Fashion Anak), (82, Pakaian Anak Perempu...",537467001,https://ecs7.tokopedia.net/img/product-1/2018/...,link https://ecs7.tokopedia.net/img/product-1/...
4,4,252734311,Kemeja Tunik Anak cewek,2018-01-18 12:01:16.827911,151,"[(78, Fashion Anak), (82, Pakaian Anak Perempu...",537467000,https://ecs7.tokopedia.net/img/product-1/2018/...,link https://ecs7.tokopedia.net/img/product-1/...
