In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout, Embedding, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
# Read the training table from disk into a DataFrame.
train_csv_path = 'train.csv'
train_data = pd.read_csv(train_csv_path)

In [7]:
# Count the missing entries in every column; the resulting Series is
# left as the last expression so it is displayed as the cell output.
missing_per_column = train_data.isna().sum()
missing_per_column

PRODUCT_ID               0
TITLE                   12
BULLET_POINTS       837364
DESCRIPTION        1157381
PRODUCT_TYPE_ID          0
PRODUCT_LENGTH           0
dtype: int64

In [8]:
# Replace missing text with empty strings so the string concatenation and
# tokenization cells below never see NaN.
#
# Only the free-text columns are filled. A blanket fillna('') would also
# write '' into a numeric column that happened to contain a gap, silently
# turning it into an object column that the model cannot consume.
text_columns = ['TITLE', 'BULLET_POINTS', 'DESCRIPTION']
train_data[text_columns] = train_data[text_columns].fillna('')

# The model is fed PRODUCT_TYPE_ID as a feature and PRODUCT_LENGTH as the
# target; fail here with a clear message rather than deep inside training.
numeric_columns = ['PRODUCT_TYPE_ID', 'PRODUCT_LENGTH']
assert train_data[numeric_columns].notna().all().all(), (
    'train_data has missing values in PRODUCT_TYPE_ID / PRODUCT_LENGTH'
)

In [9]:
# Build one vocabulary over all three text fields by joining them, row by
# row, into a single space-separated string and fitting the tokenizer on it.
corpus = train_data['TITLE'] + ' ' + train_data['DESCRIPTION'] + ' ' + train_data['BULLET_POINTS']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Number of token positions kept per text field; reused as the Input shape
# and as the pad_sequences length further down.
max_len = 50

# One extra slot on top of the learned words (Keras word indices start at 1,
# index 0 is what pad_sequences writes as padding).
vocab_size = 1 + len(tokenizer.word_index)

In [10]:
# Three identically shaped inputs, one per text field, each receiving a
# sequence of max_len token ids. Created in title / description / bullet order.
title_input, description_input, bullet_input = (
    Input(shape=(max_len,)) for _ in range(3)
)

In [11]:
# A single embedding table mapping each token id to a dense vector.
embedding_dim = 50
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)

In [12]:
# Run every text input through the same embedding layer, so the three
# fields share one set of token vectors.
title_embed, description_embed, bullet_embed = map(
    embedding, (title_input, description_input, bullet_input)
)

In [13]:
# Collapse each embedded sequence into one flat feature vector. Every field
# gets its own Flatten layer instance.
title_flatten, description_flatten, bullet_flatten = (
    Flatten()(embedded)
    for embedded in (title_embed, description_embed, bullet_embed)
)

In [14]:
# Input for the single scalar feature that accompanies the text fields.
# NOTE(review): the fit/predict cells feed PRODUCT_TYPE_ID here as a raw,
# unscaled number; it looks like a categorical id -- confirm this is intended.
numerical_shape = (1,)
numerical_input = Input(shape=numerical_shape)

In [15]:
# Join the three flattened text representations and the scalar feature
# into one feature vector per example.
feature_tensors = [
    title_flatten,
    description_flatten,
    bullet_flatten,
    numerical_input,
]
concatenated = Concatenate()(feature_tensors)

In [17]:
# Regression head: two ReLU hidden layers (128 then 64 units), each followed
# by dropout, ending in a single linear unit that emits the prediction.
hidden_activation = 'relu'
drop_rate = 0.2

fc1 = Dense(units=128, activation=hidden_activation)(concatenated)
dropout1 = Dropout(rate=drop_rate)(fc1)

fc2 = Dense(units=64, activation=hidden_activation)(dropout1)
dropout2 = Dropout(rate=drop_rate)(fc2)

output = Dense(units=1, activation='linear')(dropout2)

In [18]:
# Assemble the four-input, single-output model and configure training with
# the Adam optimizer and mean absolute percentage error as the loss.
model_inputs = [title_input, description_input, bullet_input, numerical_input]
model = Model(inputs=model_inputs, outputs=output)
model.compile(
    optimizer='adam',
    loss='mean_absolute_percentage_error',
)

In [19]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
validation_fraction = 0.2
split_seed = 42
train_set, val_set = train_test_split(
    train_data,
    random_state=split_seed,
    test_size=validation_fraction,
)

In [20]:
# Turn each training text column into token-id sequences with the fitted
# tokenizer, then pad (zeros appended at the end) or truncate every sequence
# to exactly max_len entries.
train_title, train_description, train_bullet = (
    pad_sequences(
        tokenizer.texts_to_sequences(train_set[column]),
        maxlen=max_len,
        padding='post',
    )
    for column in ('TITLE', 'DESCRIPTION', 'BULLET_POINTS')
)

In [21]:
# Same text preprocessing for the validation rows: token ids from the fitted
# tokenizer, post-padded or truncated to max_len entries per sequence.
val_title, val_description, val_bullet = (
    pad_sequences(
        tokenizer.texts_to_sequences(val_set[column]),
        maxlen=max_len,
        padding='post',
    )
    for column in ('TITLE', 'DESCRIPTION', 'BULLET_POINTS')
)

In [22]:
# Train for 5 epochs in batches of 128, monitoring the held-out validation
# split. Inputs are listed in the same order as the model's input layers:
# title, description, bullet points, then PRODUCT_TYPE_ID.
train_inputs = [train_title, train_description, train_bullet, train_set['PRODUCT_TYPE_ID']]
val_inputs = [val_title, val_description, val_bullet, val_set['PRODUCT_TYPE_ID']]

# Kept as the last expression so the returned History object is displayed.
model.fit(
    x=train_inputs,
    y=train_set['PRODUCT_LENGTH'],
    validation_data=(val_inputs, val_set['PRODUCT_LENGTH']),
    epochs=5,
    batch_size=128,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2597d107730>

In [23]:
# Read the test table from disk into a DataFrame.
test_csv_path = 'test.csv'
test_data = pd.read_csv(test_csv_path)

In [25]:
# Replace missing text with empty strings so tokenization of the test rows
# never sees NaN.
#
# Only the free-text columns are filled. A blanket fillna('') would also
# write '' into PRODUCT_TYPE_ID if it had a gap, turning that column into
# object dtype and breaking prediction with an obscure conversion error.
text_columns = ['TITLE', 'BULLET_POINTS', 'DESCRIPTION']
test_data[text_columns] = test_data[text_columns].fillna('')

# PRODUCT_TYPE_ID is fed to the model as a feature; surface any gap here
# with a clear message instead of at predict time.
assert test_data['PRODUCT_TYPE_ID'].notna().all(), (
    'test_data has missing values in PRODUCT_TYPE_ID'
)

In [26]:
# Encode the test text columns exactly like the training ones: token ids
# from the already-fitted tokenizer, post-padded or truncated to max_len.
test_title, test_description, test_bullet = (
    pad_sequences(
        tokenizer.texts_to_sequences(test_data[column]),
        maxlen=max_len,
        padding='post',
    )
    for column in ('TITLE', 'DESCRIPTION', 'BULLET_POINTS')
)

In [27]:
# Run the trained model over the test rows. Inputs follow the order of the
# model's input layers: title, description, bullet points, PRODUCT_TYPE_ID.
test_inputs = [test_title, test_description, test_bullet, test_data['PRODUCT_TYPE_ID']]
predictions = model.predict(test_inputs)



In [29]:
# Write the predictions into the submission template and save it.
submission = pd.read_csv('sample_submission.csv')

# The model ends in a single-unit Dense layer, so predict() yields one value
# per row wrapped in an extra trailing axis; flatten it to a plain 1-D vector
# before using it as a column.
flat_predictions = np.asarray(predictions).reshape(-1)

# Values are assigned by position, so the template must have exactly one row
# per prediction.
# NOTE(review): this also assumes sample_submission.csv lists rows in the
# same order as test.csv -- confirm against the data files.
if len(flat_predictions) != len(submission):
    raise ValueError(
        f'sample_submission.csv has {len(submission)} rows but '
        f'{len(flat_predictions)} predictions were produced'
    )

submission['PRODUCT_LENGTH'] = flat_predictions
submission.to_csv('submission_dlm1.csv', index=False)