In [1]:
from clean_data import CleanImage, CleanTabular
import pandas as pd
import os
import numpy as np
from PIL import Image

In [2]:
image_path = os.path.join(os.path.dirname(os.getcwd()), 'image_data', 'fb_image')
image_cleaner = CleanImage(image_path)
cleaned_img_folder = image_cleaner.process_images(greyscale = True)

In [9]:
tabular_cleaner = CleanTabular(os.path.join(os.path.dirname(os.getcwd()), 'data/Images.csv'))
tabular_cleaner.slice_df(['id', 'product_id'])
image_df = tabular_cleaner.df
image_df['image_data'] = np.NaN
image_df['image_data']=image_df['image_data'].astype('object')

for i in range(len(image_df['id'])):
    img = Image.open(os.path.join(cleaned_img_folder, image_df['id'][i] + '.jpg'))
    numpydata = np.asarray(img)
    image_df.at[i, 'image_data'] = numpydata

image_df = image_df.drop(['id'], axis = 1)

In [4]:
def concat_array(df, series):
    array_list = []
    for i in range(len(df[series])):
        a = df[series][i][0]
        for j in range(len(df[series][i])-1):
            a = np.concatenate((a, df[series][i][j+1]), axis = None)
        array_list.append(a)

    
    return np.array(array_list)

def generate_pixel_name(array):
    pixel_name = []
    for i in range(len(array[0])):
        pixel_name.append("pixel"+str(i))
    return pixel_name

# numpy.reshape()

In [8]:
pixel_array = concat_array(image_df, 'image_data')
pixel_name = generate_pixel_name(pixel_array)
pixel_df = pd.DataFrame(pixel_array, columns=pixel_name)

In [6]:
image_df = pd.concat([image_df, pixel_df], axis=1)
image_df = image_df.drop(['image_data'], axis = 1)

In [7]:
image_df.head(5)

Unnamed: 0,product_id,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel615,pixel616,pixel617,pixel618,pixel619,pixel620,pixel621,pixel622,pixel623,pixel624
0,5f5f57d7-778f-4336-bb10-b43863418c8c,8,0,0,2,0,0,2,3,5,...,0,0,0,0,0,0,0,0,0,0
1,5f5f57d7-778f-4336-bb10-b43863418c8c,0,0,4,123,163,167,157,169,167,...,46,52,52,42,49,59,32,0,1,0
2,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,0,2,0,169,214,190,191,173,185,...,88,103,128,133,136,146,111,2,0,0
3,c2c8949f-3cde-4651-a234-4a4a1b2a9ad4,2,0,2,147,156,193,202,195,172,...,109,113,119,117,111,109,91,5,0,0
4,8292aa4e-7f1b-4655-bf0e-f1f2c9e3ffaf,0,0,1,2,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
tabular_cleaner = CleanTabular(os.path.join(os.path.dirname(os.getcwd()), 'data/Products.csv'))
tabular_cleaner.slice_df(['id', 'category'])
tabular_cleaner.clean_to_general_category('category')
tabular_cleaner.clean_to_category_type('category', ohe=False)
product_df = tabular_cleaner.df
product_df = product_df.rename(columns={'id':'product_id'})
product_df

Unnamed: 0,product_id,category
0,243809c0-9cfc-4486-ad12-3b7a16605ba9,6
1,1c58d3f9-8b93-47ea-9415-204fcc2a22e6,6
2,860673f1-57f6-47ba-8d2f-13f9e05b8f9a,6
3,59948726-29be-4b35-ade5-bb2fd7331856,6
4,16dbc860-696e-4cda-93f6-4dd4926573fb,6
...,...,...
7151,c4148656-78a9-4f3e-b393-134fdc5ef900,12
7152,564e3411-768d-4250-a624-b119d696f103,12
7153,2b0a652b-46a2-4297-b619-5efeeb222787,12
7154,719fd40a-870e-4144-b324-55dff2e66fb4,12


In [9]:
merged_df = image_df.merge(product_df, how='left', on='product_id')
merged_df = merged_df.drop(['product_id'], axis=1)

In [10]:
merged_df.head(5)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel616,pixel617,pixel618,pixel619,pixel620,pixel621,pixel622,pixel623,pixel624,category
0,8,0,0,2,0,0,2,3,5,0,...,0,0,0,0,0,0,0,0,0,6
1,0,0,4,123,163,167,157,169,167,169,...,52,52,42,49,59,32,0,1,0,6
2,0,2,0,169,214,190,191,173,185,180,...,103,128,133,136,146,111,2,0,0,6
3,2,0,2,147,156,193,202,195,172,151,...,113,119,117,111,109,91,5,0,0,6
4,0,0,1,2,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,6


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [12]:
merged_df.head(5)

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel616,pixel617,pixel618,pixel619,pixel620,pixel621,pixel622,pixel623,pixel624,category
0,8,0,0,2,0,0,2,3,5,0,...,0,0,0,0,0,0,0,0,0,6
1,0,0,4,123,163,167,157,169,167,169,...,52,52,42,49,59,32,0,1,0,6
2,0,2,0,169,214,190,191,173,185,180,...,103,128,133,136,146,111,2,0,0,6
3,2,0,2,147,156,193,202,195,172,151,...,113,119,117,111,109,91,5,0,0,6
4,0,0,1,2,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,6


In [13]:
y = merged_df['category']
x = merged_df.drop(['category'], axis = 1)


In [39]:
y

0         6
1         6
2         6
3         6
4         6
         ..
12599    12
12600    12
12601    12
12602    12
12603    12
Name: category, Length: 12604, dtype: int8

In [15]:
x

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel615,pixel616,pixel617,pixel618,pixel619,pixel620,pixel621,pixel622,pixel623,pixel624
0,8,0,0,2,0,0,2,3,5,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,4,123,163,167,157,169,167,169,...,46,52,52,42,49,59,32,0,1,0
2,0,2,0,169,214,190,191,173,185,180,...,88,103,128,133,136,146,111,2,0,0
3,2,0,2,147,156,193,202,195,172,151,...,109,113,119,117,111,109,91,5,0,0
4,0,0,1,2,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12599,0,13,0,96,155,97,108,90,78,80,...,86,83,79,77,73,69,62,0,2,0
12600,2,0,2,0,5,0,103,247,238,244,...,254,252,255,106,0,5,3,0,0,0
12601,0,0,1,8,225,255,251,255,255,255,...,254,254,255,241,186,192,5,0,2,0
12602,6,0,0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
x = pixel_array

In [42]:
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size=0.2, random_state=42)

In [43]:
from sklearn import svm
svc = svm.SVC(gamma=0.001 , C = 100.) 
svc.fit(x_train , y_train)
y_pred = svc.predict(x_test)

In [44]:
from sklearn import metrics
metrics.mean_squared_error(y_pred, y_test)

11.933756445854819