In [None]:
!wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/datasetb2d9982.zip

import zipfile
import os

# Define the path to the ZIP file
zip_file_path = 'datasetb2d9982.zip'

# Extract the contents of the ZIP file to a directory
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall()

# Set the file paths for the extracted dataset
base_dir = 'dataset'
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'test')
sample_submission_dir = os.path.join(base_dir, 'sample_submission')








--2023-04-22 19:24:01--  https://s3-ap-southeast-1.amazonaws.com/he-public-data/datasetb2d9982.zip
Resolving s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)... 52.219.41.6
Connecting to s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)|52.219.41.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 895569552 (854M) [binary/octet-stream]
Saving to: ‘datasetb2d9982.zip’


2023-04-22 19:25:06 (13.3 MB/s) - ‘datasetb2d9982.zip’ saved [895569552/895569552]



In [None]:
import pandas as pd 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Load the training and testing data into Pandas dataframes.
# Missing values become empty strings so the text columns can be concatenated.
train_df = pd.read_csv("dataset/train.csv").fillna("")
test_df = pd.read_csv("dataset/test.csv").fillna("")

# Text fields merged into one document per product, in this order
text_columns = ["TITLE", "DESCRIPTION", "BULLET_POINTS"]


def combine_text_columns(df, columns=text_columns):
    """Join ``columns`` of ``df`` into one space-separated string per row.

    Each value is cast to ``str`` first, so a non-string entry in a text column
    cannot break the concatenation. The same conversion is applied to whatever
    frame is passed in, keeping train and test preprocessing identical.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``.
    columns : list of str
        Column names to join, in output order.

    Returns
    -------
    pandas.Series
        One combined string per row, indexed like ``df``.
    """
    parts = [df[col].astype(str) for col in columns]
    # The separator keeps the last word of one field from fusing with the first
    # word of the next, which would otherwise create a bogus token.
    return parts[0].str.cat(parts[1:], sep=" ")


# One combined document per row of each frame
training_sentences = combine_text_columns(train_df)
testing_sentences = combine_text_columns(test_df)

# Tokenize the text data: the vocabulary is fitted on the training text only,
# and words outside the 10000-word limit map to the "<OOV>" token.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(training_sentences)

# Convert sentences to sequences of integers using the word index
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Pad sequences to a fixed length.
# pad_sequences pads and truncates at the front by default, so documents longer
# than max_length keep their LAST max_length tokens.
max_length = 100
training_padded = pad_sequences(training_sequences, maxlen=max_length)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length)

# Get the word index with the OOV token
word_index = tokenizer.word_index


In [None]:
# Load the training labels: one integer label per row of train_df, in row order.
# NOTE(review): the model is fitted on PRODUCT_TYPE_ID, but the submission cell
# writes its predictions under the column name PRODUCT_LENGTH -- confirm which
# column is the intended regression target.
train_labels = train_df['PRODUCT_TYPE_ID'].astype(int)

# Align the training labels with the training data.
# training_sentences holds ONE combined string per row, so there is exactly one
# sequence per label. Indexing labels with `i // 3` would pair row i with the
# label of row i // 3.
training_labels_final = train_labels.to_numpy()
assert len(training_labels_final) == len(training_sequences), (
    "expected exactly one label per training sequence"
)

# The tokenizer only emits indices below its num_words limit, so the embedding
# table does not need a row for every word it has ever seen.
embedding_rows = min(tokenizer.num_words, len(word_index) + 1)

# Build the model: embedding -> bidirectional LSTM -> dense head -> one scalar.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=embedding_rows, output_dim=32, input_length=max_length),
    # Return only the final LSTM state so the network yields one prediction per
    # example rather than one per timestep.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(1024, activation='relu'),
    # Linear output: the target is an unbounded number trained with MAE, which
    # a sigmoid (range 0..1) could never reach.
    tf.keras.layers.Dense(1)
])


In [None]:
# Configure training: Adam optimizer minimizing mean absolute error, with MAE
# also reported as a metric.
model.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mae'],
)

In [None]:
# Train for 10 epochs in mini-batches of 200 rows.
# validation_split is the fraction of rows HELD OUT for validation, so 0.33
# trains on about two thirds of the data and validates on the remaining third;
# a value of 0.67 would leave only one third of the rows for training.
model.fit(
    training_padded,
    training_labels_final,
    epochs=10,
    batch_size=200,
    validation_split=0.33,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f79a13b9c70>

In [None]:
# Predict on the padded test sequences
y_pred = np.asarray(model.predict(testing_padded))

# Reduce the prediction array to one scalar per test row. Calling .tolist() on
# a 2-D/3-D array gives nested lists, which end up in the CSV as list literals
# instead of numbers. Flatten everything after the batch axis and keep the last
# value: for a (rows, 1) output that is the single prediction; if the model
# emits one value per timestep it is the final timestep's prediction.
y_new = y_pred.reshape(y_pred.shape[0], -1)[:, -1]
assert len(y_new) == len(test_df), "expected one prediction per test row"

# Create the submission DataFrame
submission_df = pd.DataFrame({'PRODUCT_ID': test_df['PRODUCT_ID'], 'PRODUCT_LENGTH': y_new})
# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv',index=False)

