In [1]:
import ast
import math
import os

import keras
import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

In [3]:
# Load the dataset; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests
# the CSV was written with its index -- consider read_csv(..., index_col=0).
dataset = pd.read_csv('dataset_clean.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
dataset.head()

Unnamed: 0,category_1,category_2,category_3,title,product_rating,selling_price,seller_rating,description
0,4,48,47,"[15085, 1335, 1319, 4225, 11612, 1319, 9855, 6...",4.4,152,4.4,"[111, 77, 111, 343, 1335, 1319, 5068, 1032]"
1,4,48,47,"[7667, 11613, 234, 71, 251, 782, 5234, 1335, 1...",3.4,329,4.7,"[2, 7667, 5234, 4951, 1335, 1320, 35, 291, 60,..."
2,4,48,47,"[3359, 77, 1335, 1320, 334, 310, 11614, 221, 6...",4.1,369,4.1,"[3359, 77, 1335, 1197, 11614, 1320, 152, 74, 5..."
3,4,48,47,"[4446, 234, 71, 408, 365, 119, 1335, 1320, 4, ...",4.0,249,4.8,"[4446, 365, 119, 234, 71, 1335, 1320, 167, 4, ..."
4,4,48,47,"[4446, 3605, 1335, 1320, 71, 32, 4, 421, 1300,...",3.9,249,4.8,"[4446, 3605, 1335, 1320, 71, 32, 4, 421, 1300,..."


In [7]:
def rmsle_cust(y_true, y_pred):
    """Keras-backend RMSLE metric, averaged over the last axis.

    Each tensor is lower-bounded at ``K.epsilon()`` (no upper bound) before
    ``log(x + 1)`` is taken, so values below epsilon are treated as epsilon.
    """
    def _clipped_log1p(tensor):
        # Clip from below only, then take log(x + 1).
        return K.log(K.clip(tensor, K.epsilon(), None) + 1.)

    log_gap = _clipped_log1p(y_pred) - _clipped_log1p(y_true)
    return K.sqrt(K.mean(K.square(log_gap), axis=-1))
def rmsle(y, y_pred):
    """Root mean squared logarithmic error of two equal-length sequences.

    Args:
        y: true values (list, NumPy array, pandas Series, ...), each > -1.
        y_pred: predicted values, same length as ``y``, each > -1.

    Returns:
        ``sqrt(mean((log(1 + y_pred) - log(1 + y)) ** 2))`` as a float.

    Raises:
        AssertionError: if the two inputs differ in length.
        ZeroDivisionError: if the inputs are empty.
        ValueError: if any value is <= -1 (logarithm undefined).
    """
    assert len(y) == len(y_pred)
    # Pair the values by position. Indexing with y[i] is label-based on a
    # pandas Series, so it breaks (or silently misaligns) whenever the index
    # is not 0..n-1, e.g. after a train/test split.
    squared_gaps = [
        (math.log1p(pred) - math.log1p(true)) ** 2.0
        for true, pred in zip(y, y_pred)
    ]
    return (sum(squared_gaps) * (1.0 / len(y))) ** 0.5

In [6]:
# Fixed sequence lengths (in tokens) passed as `maxlen` to pad_sequences:
# shorter sequences are padded and longer ones truncated to these sizes.
max_title = 32
max_description = 64

In [None]:
def _as_token_ids(cell):
    """Return a list of token ids, decoding the CSV's stringified form if needed."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# read_csv loads list-valued columns as text such as "[15085, 1335, ...]".
# pad_sequences needs real integer sequences, so decode each cell first;
# passing the strings through raises an int32 conversion error.
title_ids = [_as_token_ids(cell) for cell in dataset.title]
description_ids = [_as_token_ids(cell) for cell in dataset.description]

# Pad / truncate every sequence to a fixed length (keras default: at the front).
x_title = pad_sequences(title_ids, maxlen=max_title)
x_description = pad_sequences(description_ids, maxlen=max_description)

# Categorical ids are fed to the model as plain integer arrays.
cat2 = dataset.category_2.to_numpy()
cat3 = dataset.category_3.to_numpy()

In [None]:
# Model inputs keyed by the `name=` of the matching Input layer in get_model.
feed_data = dict(
    name=x_title,
    item_desc=x_description,
    category_2=cat2,
    category_3=cat3,
)

In [None]:
# Largest id present in each input, plus one. max_name / max_desp are the
# vocabulary sizes given to the text Embedding layers in get_model.
max_name = x_title.max() + 1
max_desp = x_description.max() + 1
max_cat2 = cat2.max() + 1
max_cat3 = cat3.max() + 1

In [None]:
# Regression target: the selling_price column as a NumPy array (untransformed).
y = dataset.selling_price.to_numpy()

In [None]:
# Compress the price range with log(1 + price), then rescale it to [-1, 1] so
# that MSE / MAE are on a small, well-conditioned scale.
# The transform starts from the raw column rather than from `y`, so re-running
# this cell does not apply the log a second time.
log_price = np.log1p(dataset.selling_price.to_numpy())
target_scaler = MinMaxScaler(feature_range=(-1, 1))
y = target_scaler.fit_transform(log_price.reshape(-1, 1))
# To map predictions back to prices:
#   np.expm1(target_scaler.inverse_transform(pred))

### Model definition and training

In [None]:
def get_model(dr_r=0.1, learning_rate=0.01):
    """Build and compile the price-regression network.

    The title and description token sequences are embedded and each summarised
    by a GRU; the two category ids are embedded; all four feature vectors are
    concatenated and passed through a two-layer dense head with one linear
    output.

    Args:
        dr_r: dropout rate applied after each hidden dense layer.
        learning_rate: learning rate of the Adam optimizer.

    Returns:
        A compiled ``keras`` ``Model`` whose inputs are named "name",
        "item_desc", "category_2" and "category_3" (the keys of ``feed_data``).
    """
    # Inputs -- local names now match the `name=` of each Input layer.
    name = Input(shape=[x_title.shape[1]], name="name")
    item_desc = Input(shape=[x_description.shape[1]], name="item_desc")
    category_2 = Input(shape=[1], name="category_2")
    category_3 = Input(shape=[1], name="category_3")

    # Embedding layers. Category ids are arbitrary labels, so they get a
    # learned lookup table (sized by max_cat2 / max_cat3) instead of being
    # treated as a numeric magnitude.
    emb_name = Embedding(max_name, 32)(name)
    emb_item_desc = Embedding(max_desp, 64)(item_desc)
    emb_category_2 = Embedding(max_cat2, 8)(category_2)
    emb_category_3 = Embedding(max_cat3, 8)(category_3)

    # Recurrent layers: one fixed-size summary vector per text field.
    rnn_layer1 = GRU(16)(emb_item_desc)
    rnn_layer2 = GRU(8)(emb_name)

    # Main layer: concatenate the category *embeddings* (previously the raw
    # inputs were concatenated and the category layers were dead code).
    main_l = concatenate([
        Flatten()(emb_category_2),
        Flatten()(emb_category_3),
        rnn_layer1,
        rnn_layer2,
    ])
    # Without a non-linearity the stacked Dense layers collapse into a single
    # linear map, so the hidden layers use ReLU.
    main_l = Dropout(dr_r)(Dense(128, activation="relu")(main_l))
    main_l = Dropout(dr_r)(Dense(64, activation="relu")(main_l))

    # Output: a single unbounded regression value.
    output = Dense(1, activation="linear")(main_l)

    # Model
    model = Model([name, item_desc, category_2, category_3], output)
    opt = keras.optimizers.Adam(learning_rate=learning_rate)
    # NOTE(review): the target is min-max scaled to [-1, 1] before training and
    # rmsle_cust clips values at epsilon, so this metric is not an RMSLE in
    # price units -- confirm whether it should stay.
    model.compile(loss="mse", optimizer=opt, metrics=["mae", rmsle_cust])

    return model

    
# Build the compiled network and print its layer / parameter overview.
model = get_model()
model.summary()

In [None]:
# Train on the full feed_data dict for a fixed number of passes.
# NOTE(review): no validation data or callbacks are passed, so only training
# loss / metrics are reported -- consider validation_split and EarlyStopping.
epochs = 10
model.fit(feed_data,y,epochs = epochs)