In [1]:
# Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error

In [2]:
# Read the training and test tables from CSV files in the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Stack the train rows followed by the test rows into one frame so that the
# feature engineering below is applied identically to both.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of `train` and `test` are both kept, so labels repeat and any
# label-based lookup or alignment on `data` becomes ambiguous. Row order is
# unchanged (train first, then test).
data = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

In [4]:
# Replace every missing value, in every column, with an empty string
data = data.fillna("")

In [5]:
# Convert every cell of the frame to a string.
# DataFrame.applymap is deprecated since pandas 2.1, so each column is mapped
# with str instead. str() hands back an existing str unchanged, which makes an
# explicit "is it already a string?" check unnecessary.
data = data.apply(lambda column: column.map(str))

In [6]:
# Feature engineering: derive length/count features and one cleaned text field
# from the TITLE, DESCRIPTION, and BULLET_POINTS columns. All three columns are
# expected to hold strings at this point (the frame was stringified earlier).

# Character length of each text field.
data['title_len'] = data['TITLE'].str.len()
data['desc_len'] = data['DESCRIPTION'].str.len()
data['bp_len'] = data['BULLET_POINTS'].str.len()

# Number of comma-separated segments in BULLET_POINTS, i.e. commas + 1.
# NOTE(review): an empty string therefore counts as 1 segment, not 0.
data['num_bp'] = data['BULLET_POINTS'].str.count(',') + 1

# Join the three fields with single spaces, lower-case the result, and strip
# every character that is not a-z, A-Z, 0-9, or whitespace.
# The pattern is a raw string so that \s reaches the regex engine as written
# instead of being parsed as an (invalid) string escape sequence.
data['all_text'] = (
    (data['TITLE'] + ' ' + data['DESCRIPTION'] + ' ' + data['BULLET_POINTS'])
    .str.lower()
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
)

In [7]:
# Replace each distinct PRODUCT_TYPE_ID value with an integer code.
# The encoder is kept in `le` so the value-to-code mapping stays available.
le = LabelEncoder()
le.fit(data['PRODUCT_TYPE_ID'])
data['PRODUCT_TYPE_ID'] = le.transform(data['PRODUCT_TYPE_ID'])

In [None]:
# Bag-of-words counts over the cleaned text: unigrams and bigrams, English
# stop words removed, vocabulary capped at 5000 entries.
cv = CountVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
)
# NOTE(review): toarray() turns the sparse count matrix into a dense array with
# one row per row of `data`; for a large dataset this may not fit in memory.
text_feat = cv.fit_transform(data['all_text']).toarray()

In [None]:
# Build the full feature matrix: n-gram counts on the left, followed by the
# length/count columns and the encoded product type on the right.
feat_cols = ['title_len', 'desc_len', 'bp_len', 'num_bp', 'PRODUCT_TYPE_ID']
X = np.concatenate([text_feat, data[feat_cols].to_numpy()], axis=1)

In [None]:
# Undo the earlier train/test stacking: the first len(train) rows of X are the
# training rows, everything after them is the test rows.
X_train, X_test = np.split(X, [len(train)])
# The regression target is only taken from the original training frame.
y_train = train['PRODUCT_LENGTH']

In [None]:
# Hold out 20% of the training rows for validation; the seed is fixed so the
# split is the same on every run.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit a gradient-boosted regressor on the training split
xgb_model = xgb.XGBRegressor(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=100,
    random_state=42,
)
xgb_model.fit(X_tr, y_tr)

In [None]:
# Score the model on the held-out validation rows.
y_val_pred = xgb_model.predict(X_val)
val_mape = mean_absolute_percentage_error(y_val, y_val_pred)
# Score is 100 * (1 - MAPE), floored at 0 so a MAPE above 1 does not go negative.
score = max(0, 100 * (1 - val_mape))
print("Validation score: {:.2f}".format(score))

In [None]:
# Predict the target (PRODUCT_LENGTH) for the test rows with the fitted model
y_test_pred = xgb_model.predict(X_test)

In [None]:
# Pair each test PRODUCT_ID with its predicted PRODUCT_LENGTH and write the
# result to CSV, with PRODUCT_ID as the index (first) column.
sub = pd.DataFrame(
    {'PRODUCT_ID': test['PRODUCT_ID'], 'PRODUCT_LENGTH': y_test_pred}
).set_index('PRODUCT_ID')
sub.to_csv('submission_x.csv')