# Full training algorithm

### Training Data preparation

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

# Load the training data; expects train.csv in the notebook's working directory.
df = pd.read_csv("train.csv")

def clean_col_names(col_name):
    """Return ``col_name`` lower-cased, with surrounding whitespace removed."""
    trimmed = col_name.strip()
    return trimmed.lower()

# Normalise every header so the cells below can use lower-case column names.
columns = [clean_col_names(header) for header in df.columns]
df.columns = columns

def fill_missing_value(data):
    """Replace missing text fields with fixed placeholder strings.

    Args:
        data: DataFrame with ``title``, ``bullet_points`` and ``description``
            columns. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: if one of the three columns is absent.
    """
    placeholders = {
        'title': 'Unknown',
        'bullet_points': 'Unavailable',
        'description': 'No description',
    }
    # Assign the filled column back rather than calling
    # data[col].fillna(..., inplace=True): the in-place form is chained
    # assignment, which pandas warns about and which leaves `data` unchanged
    # when copy-on-write is active.
    for column, placeholder in placeholders.items():
        data[column] = data[column].fillna(placeholder)

    return data

dfx = fill_missing_value(df)

# The target is modelled on a log1p scale; the test section maps predictions
# back with np.expm1.
dfx['product_length'] = np.log1p(dfx['product_length'])

# Work on a 20% sample of the rows. The fixed random_state keeps the sample,
# and every score computed from it, identical across kernel restarts.
df1 = dfx.sample(frac=0.2, random_state=42).reset_index(drop=True)

df1['bullet_points'] = df1['bullet_points'].astype(str)

# A set gives constant-time membership tests; remove_stop_words checks every
# token of every row against this collection.
stop = set(stopwords.words('english'))

def remove_punctuation(sentence: str) -> str:
    """Return ``sentence`` with every ``string.punctuation`` character deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return sentence.translate(deletion_table)

# Stop-word filter used by text_process.
def remove_stop_words(x):
    """Lower-case ``x``, split it on single spaces and drop tokens found in ``stop``.

    The surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in x.lower().split(' '):
        if token not in stop:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def text_process(text):
    """Clean a text string: strip punctuation, drop stop words, lower-case.

    Returns the cleaned text as a single string.
    """
    without_punctuation = remove_punctuation(text)
    without_stop_words = remove_stop_words(without_punctuation)
    return without_stop_words.lower()

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

# NOTE(review): porter.stem receives each whole bullet-point string rather than
# individual words; a stemmer is normally applied word by word — confirm this
# is intended. Left unchanged because the test-data preparation applies the
# same call and the two must stay in sync.
df1['bullet_points'] = df1['bullet_points'].apply(porter.stem)

df1['bullet_points'] = df1['bullet_points'].apply(text_process)

# Bag-of-words / TF-IDF matrices for the sampled text. The model pipelines
# below build and fit their own vectorizer and do not read these objects.
# NOTE(review): text_process returns a str, whereas a callable `analyzer` is
# expected to return a sequence of features, so the features here are most
# likely single characters rather than words — confirm.
bow_transformer = CountVectorizer(analyzer=text_process)
bow_bullet_points = bow_transformer.fit_transform(df1['bullet_points'])

tfidf_transformer = TfidfTransformer()
tfidf_reviews = tfidf_transformer.fit_transform(bow_bullet_points)

# Fixed random_state so every model is trained and scored on the same split.
X_train, X_test, y_train, y_test = train_test_split(
    df1['bullet_points'],
    df1['product_length'],
    test_size=0.25,
    random_state=42,
)

### Training

In [2]:
import pickle
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import BayesianRidge
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
%%time
#RandomForestRegressor

pipeline_RFR = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('regressor', SVR())
])

pipeline_RFR.fit(X_train, y_train)
y_pred = pipeline_RFR.predict(X_test)

with open('RFR_model.pkl', 'wb') as file:
    pickle.dump(pipeline_RFR, file)

score = max(0 , 100*(1-mean_absolute_percentage_error(y_test,y_pred)))
print(score)

In [5]:
##SVM
%%time

pipeline_SVR = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('regressor', RandomForestRegressor())
])

pipeline_SVR.fit(X_train, y_train)
y_pred = pipeline_SVR.predict(X_test)

score = max(0 , 100*(1-mean_absolute_percentage_error(y_test,y_pred)))
print(score)

with open('SVR_model.pkl', 'wb') as file:
    pickle.dump(pipeline_SVR, file)

file.close()

89.9823274311251
Wall time: 1min 56s


In [15]:
# ElasticNet

# NOTE(review): text_process returns a str, whereas a callable `analyzer` is
# expected to return a sequence of features, so the features are most likely
# single characters rather than words — confirm.
pipeline_EN = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    ('regressor', ElasticNet()),
])

pipeline_EN.fit(X_train, y_train)
y_pred = pipeline_EN.predict(X_test)

# y_test and y_pred are on the log1p scale applied to product_length during
# data preparation, so this score is measured in log space.
score = max(0, 100 * (1 - mean_absolute_percentage_error(y_test, y_pred)))
print(score)

# The with-block closes the file on exit; no explicit close() is needed.
with open('EN_model.pkl', 'wb') as file:
    pickle.dump(pipeline_EN, file)

88.8666373915257


In [19]:
# SGDRegressor

# NOTE(review): text_process returns a str, whereas a callable `analyzer` is
# expected to return a sequence of features, so the features are most likely
# single characters rather than words — confirm.
pipeline_SGD = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    # Seeded so repeated runs on the same split give the same fitted model.
    ('regressor', SGDRegressor(random_state=42)),
])

pipeline_SGD.fit(X_train, y_train)
y_pred = pipeline_SGD.predict(X_test)

# y_test and y_pred are on the log1p scale applied to product_length during
# data preparation, so this score is measured in log space.
score = max(0, 100 * (1 - mean_absolute_percentage_error(y_test, y_pred)))
print(score)

# The with-block closes the file on exit; no explicit close() is needed.
with open('SGD_model.pkl', 'wb') as file:
    pickle.dump(pipeline_SGD, file)

88.3852104660631


In [21]:
%%time

##XGB
pipeline_XGB = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('regressor', XGBRegressor())
])

pipeline_XGB.fit(X_train, y_train)
y_pred = pipeline_XGB.predict(X_test)

score = max(0 , 100*(1-mean_absolute_percentage_error(y_test,y_pred)))
print(score)

with open('XGB_model.pkl', 'wb') as file:
    pickle.dump(pipeline_XGB, file)

file.close()

89.45604634457723
Wall time: 12.2 s


# Full testing algorithm

In [22]:
# Reload the fitted pipeline that the RandomForestRegressor training cell wrote
# to RFR_model.pkl. pickle.load can execute arbitrary code from the file, so
# only load pickles produced by this notebook.
with open('RFR_model.pkl', 'rb') as file:
    pipeline = pickle.load(file)

### Test data preparation

In [23]:
# Load the test data; expects test.csv in the notebook's working directory.
df_test = pd.read_csv("test.csv")

def fill_missing_value(data):
    """Replace missing text fields with fixed placeholder strings.

    The title, bullet-points and description columns are matched by name,
    ignoring case and surrounding whitespace. This definition replaces the
    earlier ``fill_missing_value`` in the kernel namespace, so it accepts the
    upper-case test headers (``TITLE`` ...) as well as lower-cased ones. That
    keeps the training cell runnable after this cell has executed.

    Args:
        data: DataFrame containing the three text columns. It is modified in
            place.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: if any of the three columns cannot be found.
    """
    placeholders = {
        'title': 'Unknown',
        'bullet_points': 'Unavailable',
        'description': 'No description',
    }
    not_found = set(placeholders)

    for column in list(data.columns):
        key = str(column).strip().lower()
        if key in placeholders:
            # Assign the filled column back rather than calling
            # fillna(inplace=True) on data[column]: the in-place form is
            # chained assignment, which pandas warns about and which leaves
            # `data` unchanged when copy-on-write is active.
            data[column] = data[column].fillna(placeholders[key])
            not_found.discard(key)

    if not_found:
        raise KeyError(f"missing expected column(s): {sorted(not_found)}")

    return data

dfxt = fill_missing_value(df_test)

dfxt['BULLET_POINTS'] = dfxt['BULLET_POINTS'].astype(str)

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

# Apply the same text preparation as the training section (porter.stem call,
# then text_process) so the pipeline sees text in the form it was fitted on.
dfxt['BULLET_POINTS'] = dfxt['BULLET_POINTS'].apply(porter.stem)
dfxt['BULLET_POINTS'] = dfxt['BULLET_POINTS'].apply(text_process)

# No vectorizer is fitted on the test data: the unpickled pipeline already
# holds the CountVectorizer and TfidfTransformer fitted during training, and
# predict() passes the text through them.
log_pred = pipeline.predict(dfxt['BULLET_POINTS'])

# The models were trained on log1p(product_length); undo that transform.
predictions = np.expm1(log_pred)

submission = {"PRODUCT_ID": dfxt["PRODUCT_ID"],
              "PRODUCT_LENGTH": predictions}

submission_df = pd.DataFrame(submission)
# index=False limits the file to the two submission columns; the default
# would prepend the DataFrame's row index as an extra unnamed column.
submission_df.to_csv("submission.csv", index=False)