In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-2023/dataset/sample_submission.csv
/kaggle/input/amazon-2023/dataset/train.csv
/kaggle/input/amazon-2023/dataset/test.csv


In [2]:
import pandas as pd
import numpy as np
import scipy

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

In [3]:
import gc

In [4]:
NAME_MIN_DF = 10
MAX_FEAT_DESCP = 50000

In [5]:
print("Reading in Data")

df_train = pd.read_csv('/kaggle/input/amazon-2023/dataset/train.csv')

Reading in Data


In [6]:
df_train.head()

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [8]:
X = df_train
y = np.log1p(df_train["PRODUCT_LENGTH"])

In [9]:
print(X.memory_usage(deep = True))

Index                     128
PRODUCT_ID           17997584
TITLE               332001302
BULLET_POINTS      1061077327
DESCRIPTION         974629485
PRODUCT_TYPE_ID      17997584
PRODUCT_LENGTH       17997584
dtype: int64


In [10]:
X["DESCRIPTION"] = X["DESCRIPTION"].fillna("None")
X["TITLE"] = X["TITLE"].fillna("Unknown")
X["BULLET_POINTS"] = X["BULLET_POINTS"].fillna("Not Known")

In [11]:
print(X.memory_usage(deep = True))

Index                     128
PRODUCT_ID           17997584
TITLE               332001686
BULLET_POINTS      1089547703
DESCRIPTION        1008193534
PRODUCT_TYPE_ID      17997584
PRODUCT_LENGTH       17997584
dtype: int64


In [12]:
print("Encodings")
count = CountVectorizer(min_df=NAME_MIN_DF)
X_name = count.fit_transform(X["TITLE"])

Encodings


In [13]:
print("Bulletpoints Encoders")
unique_categories = pd.Series("/".join(X["BULLET_POINTS"].unique().astype("str")).split("/")).unique()
count_category = CountVectorizer()
X_bulletpoints = count_category.fit_transform(X["BULLET_POINTS"])

Bulletpoints Encoders


In [14]:
print("Descp encoders")
count_descp = TfidfVectorizer(max_features = MAX_FEAT_DESCP, 
                              ngram_range = (1,3),
                              stop_words = "english")
X_descp = count_descp.fit_transform(X["DESCRIPTION"])

Descp encoders


In [16]:
print("Dummy Encoders")
X_dummies = scipy.sparse.csr_matrix(pd.get_dummies(X[[
    "PRODUCT_ID", "PRODUCT_TYPE_ID"]], sparse = True).values)

X = scipy.sparse.hstack((X_dummies, 
                         X_descp,
                         X_bulletpoints,
                         X_name)).tocsr()

Dummy Encoders


In [17]:
print([X_dummies.shape, X_bulletpoints.shape, 
       X_name.shape, X_descp.shape])

[(2249698, 2), (2249698, 367567), (2249698, 79663), (2249698, 50000)]


In [34]:
X_train, X_val, y_train, y_val = train_test_split(X,y , shuffle=True, test_size=0.2)

In [35]:
model = Ridge(solver = "lsqr", fit_intercept=False)

In [36]:
print("Fitting Model")
model.fit(X_train, y_train)

Fitting Model


Ridge(fit_intercept=False, solver='lsqr')

In [37]:
preds = model.predict(X_val)

In [38]:
y_pred = np.expm1(preds)

In [39]:
y_pred

array([3.25832134e+01, 3.89566588e-02, 1.19549435e+01, ...,
       1.32569254e+03, 1.41961565e+02, 6.31901251e+03])

In [44]:
y_actual = np.expm1(y_val)

In [45]:
from sklearn.metrics import mean_absolute_percentage_error

In [46]:
score = mean_absolute_percentage_error(y_actual, y_pred)

In [47]:
print(score)

6.870978744848017


In [50]:
print(score/100)

0.06870978744848016


In [51]:
final_score = max(0,100*(1-(score/100)))

In [52]:
print(final_score)

93.12902125515198
