In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('/content/drive/MyDrive/Amazon_Hackathon/train.csv')

In [3]:
data.head(10)

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424
5,2152929,HINS Metal Bucket Shape Plant Pot for Indoor &...,"[Simple and elegant, great for displaying indo...",HINS Brings you the most Elegant Looking Pot w...,5725,950.0
6,413758,Ungifted: My Life and Journey,,,23,598.0
7,2026580,Delavala Self Adhesive Kitchen Backsplash Wall...,[HIGH QUALITY PVC MATERIAL: The kitchen alumin...,<p><strong>Aluminum Foil Stickers-good kitchen...,6030,984.251967
8,2050239,PUMA Cali Sport Clean Women's Sneakers White L...,[Style Name:-Cali Sport Clean Women's Sneakers...,,3302,393.7
9,2998633,Hexwell Essential oil for Home Fragrance Oil A...,[100% Pure And Natural Essential Oil Or Fragra...,"Transform your home, workplace or hotel room i...",8201,393.700787


In [5]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [6]:
data.shape

(2249698, 6)

In [7]:
data.isnull().sum()

PRODUCT_ID               0
TITLE                   12
BULLET_POINTS       837364
DESCRIPTION        1157381
PRODUCT_TYPE_ID          0
PRODUCT_LENGTH           0
dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249698 entries, 0 to 2249697
Data columns (total 6 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PRODUCT_ID       int64  
 1   TITLE            object 
 2   BULLET_POINTS    object 
 3   DESCRIPTION      object 
 4   PRODUCT_TYPE_ID  int64  
 5   PRODUCT_LENGTH   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 103.0+ MB


In [9]:
data['PRODUCT_LENGTH'] = np.log1p(data['PRODUCT_LENGTH'])

In [10]:
value_count_1 = data['PRODUCT_TYPE_ID'].value_counts()

In [11]:
print(value_count_1)

1        121199
12064     57554
0         50505
123       31527
6104      29690
          ...  
7239          1
12536         1
4114          1
3229          1
4505          1
Name: PRODUCT_TYPE_ID, Length: 12907, dtype: int64


In [12]:
val_count_1 = value_count_1[value_count_1==1]

In [13]:
print(val_count_1)

4941     1
876      1
9075     1
8993     1
8063     1
        ..
7239     1
12536    1
4114     1
3229     1
4505     1
Name: PRODUCT_TYPE_ID, Length: 1892, dtype: int64


In [14]:
val_count_1.sum()

1892

In [15]:
data[['TITLE', 'BULLET_POINTS', 'DESCRIPTION']] = data[['TITLE', 'BULLET_POINTS', 'DESCRIPTION']].fillna('')


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249698 entries, 0 to 2249697
Data columns (total 6 columns):
 #   Column           Dtype  
---  ------           -----  
 0   PRODUCT_ID       int64  
 1   TITLE            object 
 2   BULLET_POINTS    object 
 3   DESCRIPTION      object 
 4   PRODUCT_TYPE_ID  int64  
 5   PRODUCT_LENGTH   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 103.0+ MB


In [17]:
data['TITLE'].nunique()

2210763

In [18]:
small_data = data.sample(n=150000, random_state=42)

In [19]:
small_data.reset_index(drop=True, inplace=True)

In [20]:
small_data['TITLE'].value_counts()[:5]

Casotec Back Cover for Mobile (Silicone_Multicolor)    6
Unknown Title                                          6
Poems                                                  5
Casotec Back Cover for Mobile (Plastic_Multicolor)     5
Greatest Hits                                          4
Name: TITLE, dtype: int64

In [21]:
cvect = CountVectorizer()
X_train_name = cvect.fit_transform(small_data['TITLE'])

In [22]:
print(X_train_name.shape)

(150000, 147662)


In [23]:
tfidf_descp = TfidfVectorizer(max_features=50000, ngram_range=(1, 2), stop_words='english')

X_train_descp = tfidf_descp.fit_transform(small_data['DESCRIPTION'])
X_train_bullet = tfidf_descp.fit_transform(small_data['BULLET_POINTS'])

In [24]:
print(X_train_descp.shape)
print(X_train_bullet.shape)

(150000, 50000)
(150000, 50000)


In [25]:
from sklearn.preprocessing import LabelBinarizer

lb_title = LabelBinarizer(sparse_output=True)
X_train_title = lb_title.fit_transform(small_data['TITLE'])

lb_descp = LabelBinarizer(sparse_output=True)
X_train_description = lb_descp.fit_transform(small_data['DESCRIPTION'])

lb_bull_pont = LabelBinarizer(sparse_output=True)
X_train_bull_point = lb_bull_pont.fit_transform(small_data['BULLET_POINTS'])

lb_prod_typ_id = LabelBinarizer(sparse_output=True)
X_train_prod_typ = lb_prod_typ_id.fit_transform(small_data['PRODUCT_TYPE_ID'])

In [26]:
from scipy.sparse import hstack
sparse_matrix_list = (X_train_name, X_train_descp, X_train_title, X_train_description, X_train_bull_point, X_train_prod_typ)

X_train = hstack(sparse_matrix_list).tocsr()
print(type(X_train), X_train.shape)

<class 'scipy.sparse._csr.csr_matrix'> (150000, 500135)


In [27]:
def rmsle(y, y_pred):
    return np.sqrt(np.mean(np.power(np.log1p(y) - np.log1p(y_pred), 2)))

def evaluate_orig_price(y_test, preds):
    preds_exmpm = np.expm1(preds)
    y_test_exmpm = np.expm1(y_test)
    
    return rmsle(y_test_exmpm, preds_exmpm)

In [28]:
def model_train_predict(model, matrix_list):
    X = hstack(matrix_list).tocsr()
    X_train, X_test, y_train, y_test = train_test_split(X, small_data['PRODUCT_LENGTH'], test_size=0.2)

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    
    import gc
    del X, X_train, X_test, y_train
    gc.collect()
    
    return preds, y_test

In [29]:
linear_model = Ridge(solver='lsqr', fit_intercept=False)

sparse_matrix_list = (X_train_name, X_train_title, X_train_description, X_train_bull_point, X_train_prod_typ)
linear_preds, y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)
print('Description:', evaluate_orig_price(y_test, linear_preds))

sparse_matrix_list = (X_train_name, X_train_descp, X_train_description, X_train_bull_point, X_train_prod_typ)
linear_preds, y_test = model_train_predict(model=linear_model, matrix_list=sparse_matrix_list)
print('Description:', evaluate_orig_price(y_test, linear_preds))

Description: 1.543980236007832
Description: 1.2130436403361713


In [30]:
from lightgbm import LGBMRegressor

lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125)

sparse_matrix_list = (X_train_name, X_train_descp, X_train_title, X_train_description, X_train_bull_point, X_train_prod_typ)
lgbm_preds, y_test = model_train_predict(model=lgbm_model, matrix_list=sparse_matrix_list)
print('LightGBM rmsle :', evaluate_orig_price(y_test, lgbm_preds))

LightGBM rmsle : 0.794106100016557


In [31]:
test_data = pd.read_csv('/content/drive/MyDrive/Amazon_Hackathon/test.csv')

In [None]:
test_data.head()

In [32]:
test_data.isnull().sum()

PRODUCT_ID              0
TITLE                   5
BULLET_POINTS      275922
DESCRIPTION        380001
PRODUCT_TYPE_ID         0
dtype: int64

In [33]:
test_data[['TITLE', 'BULLET_POINTS', 'DESCRIPTION']] = test_data[['TITLE', 'BULLET_POINTS', 'DESCRIPTION']].fillna('')

In [34]:
test_data.isnull().sum()

PRODUCT_ID         0
TITLE              0
BULLET_POINTS      0
DESCRIPTION        0
PRODUCT_TYPE_ID    0
dtype: int64

In [35]:
X_test_name = cvect.transform(test_data['TITLE'])

In [36]:
X_test_name.shape

(734736, 147662)

In [37]:
X_test_descp = tfidf_descp.transform(test_data['DESCRIPTION'])

In [38]:
X_test_bullet = tfidf_descp.transform(test_data['BULLET_POINTS'])

In [39]:
X_test_descp.shape

(734736, 50000)

In [40]:
X_test_bullet.shape

(734736, 50000)

In [41]:
X_test_title = lb_title.transform(test_data['TITLE'])

In [42]:
X_test_description = lb_descp.transform(test_data['DESCRIPTION'])

In [43]:
X_test_bull_point = lb_bull_pont.transform(test_data['BULLET_POINTS'])

In [44]:
X_test_prod_typ = lb_prod_typ_id.transform(test_data['PRODUCT_TYPE_ID'])

In [45]:
sparse_matrix_list = (X_train_name, X_train_descp, X_train_title, X_train_description, X_train_bull_point, X_train_prod_typ)
X_train = hstack(sparse_matrix_list).tocsr()
X_train

<150000x500135 sparse matrix of type '<class 'numpy.float64'>'
	with 7450123 stored elements in Compressed Sparse Row format>

In [46]:
y_train = small_data['PRODUCT_LENGTH']
y_train

0         6.200765
1         6.216606
2         1.098612
3         8.055475
4         6.552508
            ...   
149995    6.765202
149996    6.311735
149997    7.170888
149998    6.232448
149999    7.090910
Name: PRODUCT_LENGTH, Length: 150000, dtype: float64

In [47]:
lgbm_model = LGBMRegressor(n_estimators=200, learning_rate=0.5, num_leaves=125)
lgbm_model.fit(X_train, y_train)

In [48]:
sparse_matrix_list = (X_test_name, X_test_descp, X_test_title, X_test_description, X_test_bull_point, X_test_prod_typ)
X_test = hstack(sparse_matrix_list).tocsr()

In [49]:
preds = lgbm_model.predict(X_test)
preds

array([6.48903695, 8.61192203, 6.4889331 , ..., 6.75520265, 6.70721856,
       6.31086698])

In [50]:
preds = np.expm1(preds)
preds

array([ 656.88947893, 5495.8035674 ,  656.82115657, ...,  857.5137111 ,
        817.29144471,  549.52203246])

In [51]:
submission = pd.read_csv('/content/drive/MyDrive/Amazon_Hackathon/sample_submission.csv')
submission

Unnamed: 0,PRODUCT_ID,PRODUCT_LENGTH
0,604373,701.093794
1,1729783,734.506163
2,1871949,741.360258
3,1107571,730.327767
4,624253,666.847946
...,...,...
734731,921419,733.838809
734732,2456362,746.810825
734733,841529,691.127128
734734,1190194,757.643591


In [52]:
submission.loc[:, 'PRODUCT_LENGTH'] = preds
submission

Unnamed: 0,PRODUCT_ID,PRODUCT_LENGTH
0,604373,656.889479
1,1729783,5495.803567
2,1871949,656.821157
3,1107571,87.715583
4,624253,601.709797
...,...,...
734731,921419,466.000320
734732,2456362,638.168324
734733,841529,857.513711
734734,1190194,817.291445


In [53]:
submission.to_csv('submission.csv', index=False)