In [2]:
import numpy as np
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [3]:
df = pd.read_csv("train.csv", encoding = 'iso-8859-1')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1968134 entries, 0 to 1968133
Data columns (total 6 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PRODUCT_ID       int64  
 1   TITLE            object 
 2   BULLET_POINTS    object 
 3   DESCRIPTION      object 
 4   PRODUCT_TYPE_ID  int64  
 5   PRODUCT_LENGTH   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 90.1+ MB


In [4]:
df.isnull().sum()

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
PRODUCT_LENGTH     0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1968134 entries, 0 to 1968133
Data columns (total 6 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PRODUCT_ID       int64  
 1   TITLE            object 
 2   BULLET_POINTS    object 
 3   DESCRIPTION      object 
 4   PRODUCT_TYPE_ID  int64  
 5   PRODUCT_LENGTH   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 90.1+ MB


In [6]:
df= df.dropna()

In [7]:
df.isnull().sum()

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
PRODUCT_LENGTH     0
dtype: int64

In [5]:
test_df = pd.read_csv('test.csv')
test_df.info()
test_df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734736 entries, 0 to 734735
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   PRODUCT_ID       734736 non-null  int64 
 1   TITLE            734731 non-null  object
 2   BULLET_POINTS    458814 non-null  object
 3   DESCRIPTION      354735 non-null  object
 4   PRODUCT_TYPE_ID  734736 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 28.0+ MB


PRODUCT_ID              0
TITLE                   5
BULLET_POINTS      275922
DESCRIPTION        380001
PRODUCT_TYPE_ID         0
dtype: int64

In [6]:
test_df = test_df.fillna(test_df.mode().iloc[0])

In [7]:
test_df.isnull().sum()

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
dtype: int64

In [8]:
df['title_length'] = df['TITLE'].apply(len)
df['bullet_points_length'] = df['BULLET_POINTS'].apply(lambda x: len(x.split(',')))
test_df['title_length'] = test_df['TITLE'].apply(len)
test_df['bullet_points_length'] = test_df['BULLET_POINTS'].apply(lambda x: len(x.split(',')))

In [9]:
X_train, X_val, y_train, y_val = train_test_split(df[['title_length', 'bullet_points_length', 'PRODUCT_TYPE_ID']], df['PRODUCT_LENGTH'], test_size=0.2, random_state=42)


# Define the model
model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=8, subsample=0.8, colsample_bytree=0.8, gamma=0.01)

# Train the model
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)
test_predictions = model.predict(test_df[['title_length', 'bullet_points_length', 'PRODUCT_TYPE_ID']])



In [10]:
submission_df = pd.DataFrame({'PRODUCT_ID': test_df['PRODUCT_ID'], 'PRODUCT_LENGTH': test_predictions})
submission_df.to_csv('submission.csv', index=False)

In [11]:
y_val_pred = model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, y_val_pred)
print('Mean Absolute Percentage Error on validation set:', mape)

Mean Absolute Percentage Error on validation set: 1.6476842301240713


In [12]:
df_s = pd.read_csv('submission.csv')
df_s.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734736 entries, 0 to 734735
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   PRODUCT_ID      734736 non-null  int64  
 1   PRODUCT_LENGTH  734736 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 11.2 MB


In [None]:

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_percentage_error


# Prepare the feature matrix X and target variable y
X = df[['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'PRODUCT_TYPE_ID']]
y = df['PRODUCT_LENGTH']

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
xgb = XGBRegressor()

# Define the hyperparameters to search
params = {
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.1, 0.2, 0.3],
    'subsample': [0.5, 0.7, 1],
    'colsample_bytree': [0.5, 0.7, 1]
}

# Define the GridSearchCV object
grid_search = GridSearchCV(estimator=xgb, param_grid=params, cv=5, n_jobs=-1, verbose=2)

# Fit the GridSearchCV object on the training data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# Evaluate the model on the validation set using the best hyperparameters
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_val)
score = max(0, 100*(1-mean_absolute_percentage_error(y_val, y_pred)))
print("Validation score: ", score)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits


