In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import shutil
import datetime
import gc
from tqdm import tqdm

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')


import scipy
from scipy import hstack


from sklearn.metrics import mean_squared_error as mse
from math import sqrt
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import RandomizedSearchCV 
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
#from sklearn.feature_selection.univariate_selection import SelectKBest, f_regression

**Loading data**

In [2]:
df_train = pd.read_csv('train_final.csv')

df_test = pd.read_csv('test_final.csv')

In [3]:
print('Shape of train data: ', df_train.shape)

print('Shape of test data: ', df_test.shape)

Shape of train data:  (1481075, 59)
Shape of test data:  (693359, 58)


In [4]:
df_train.head()

Unnamed: 0,train_id,item_condition_id,brand_name,price,shipping,gencat_name,subcat1_name,subcat2_name,preprocessed_name,preprocessed_description,...,gencat_mean_price,subcat1_mean_price,subcat2_mean_price,condition_mean_price,brand_median_price,name_median_price,gencat_median_price,subcat1_median_price,subcat2_median_price,condition_median_price
0,0,3,missing,10.0,1,Men,Tops,Tshirts,mlb cincinnati reds t shirt size xl,no description yet,...,34.73494,19.014216,18.368301,26.557241,14.0,15.5,21.0,14.0,14.0,16.0
1,1,3,Razer,52.0,0,Electronics,ComputersTablets,ComponentsParts,razer blackwidow chroma keyboard,this keyboard great condition works like came ...,...,35.190558,87.970533,42.9139,26.557241,39.5,40.0,15.0,40.0,25.0,16.0
2,2,1,Target,10.0,1,Women,TopsBlouses,Blouse,ava viv blouse,adorable top hint lace key hole back the pale ...,...,28.902679,18.249287,15.671262,26.499502,12.0,14.0,19.0,14.0,12.0,18.0
3,3,1,missing,35.0,1,Home,HomeDcor,HomeDcorAccents,leather horse statues,new tags leather horses retail rm stand foot h...,...,24.551068,21.581724,22.203802,26.499502,14.0,17.0,18.0,16.0,16.0,18.0
4,4,1,missing,44.0,0,Women,Jewelry,Necklaces,24k gold plated rose,complete certificate authenticity,...,28.902679,27.516272,25.597873,26.499502,14.0,18.0,19.0,14.0,12.0,18.0


In [5]:
df_test.head()

Unnamed: 0,test_id,item_condition_id,brand_name,shipping,gencat_name,subcat1_name,subcat2_name,preprocessed_name,preprocessed_description,name_first,...,gencat_mean_price,subcat1_mean_price,subcat2_mean_price,condition_mean_price,brand_median_price,name_median_price,gencat_median_price,subcat1_median_price,subcat2_median_price,condition_median_price
0,0,1,missing,1,Women,Jewelry,Rings,breast cancer i fight like a girl ring,size 7,breast,...,28.902679,27.516272,32.96056,26.499502,14.0,10.0,19.0,14.0,15.0,18.0
1,1,1,missing,1,Other,Officesupplies,ShippingSupplies,25 pcs new 7 5 x12 kraft bubble mailers,25 pcs new 7 5 x12 kraft bubble mailers lined ...,25,...,20.819917,16.719531,11.27582,26.499502,14.0,8.0,14.0,11.0,9.0,18.0
2,2,1,Coach,1,VintageCollectibles,BagsandPurses,Handbag,coach bag,brand new coach bag bought rm coach outlet,coach,...,27.345891,61.810448,139.664714,26.499502,31.0,29.0,16.0,28.0,55.5,18.0
3,3,2,missing,0,Women,Sweaters,Cardigan,floral kimono,floral kimono never worn lightweight perfect h...,floral,...,28.902679,26.29972,26.934284,27.584015,14.0,13.0,19.0,20.0,17.0,17.0
4,4,3,missing,1,Other,Books,ReligionSpirituality,life after death,rediscovering life loss loved one tony cooke p...,life,...,20.819917,16.209066,13.358423,26.557241,14.0,16.0,14.0,11.0,11.0,16.0


###  Train, Test split for cross validation

In [6]:
y = np.log1p(df_train['price'])
X=df_train.drop(['price'], axis=1)

In [7]:
from sklearn.model_selection import train_test_split

x_train, x_cv , y_train, y_cv = train_test_split(X, y, test_size=0.2, random_state=42)

print('Train size: {}, CV size: {}, Test size: {}' .format(x_train.shape, x_cv.shape, df_test.shape))

Train size: (1184860, 58), CV size: (296215, 58), Test size: (693359, 58)


###  Categorical features

* One-hot encoding of brand_name, gencat_name, subcat1_name, subcat2_name.


In [8]:
import re

def clean_category(categorical_values):
    '''takes categorical column values as arguments and returns list of cleaned categories'''
    
    catogories = list(categorical_values)

    categorical_values_list = []
    for i in tqdm(catogories):
        i = re.sub('[^A-Za-z0-9]+', ' ', i)
        i = i.replace(' ','')
        i = i.replace('&','_')
        categorical_values_list.append(i.strip())
    
    return categorical_values_list 

In [9]:
#Cleaning brand name before using count vectorizer
# Using same preprocessing as used earlier for categories: 'clean_cat()' function

x_train['brand_name'] = clean_category(x_train['brand_name'].values)

x_cv['brand_name'] = clean_category(x_cv['brand_name'].values)

df_test['brand_name'] = clean_category(df_test['brand_name'].values)

100%|████████████████████████████████████████████████████████████████████| 1184860/1184860 [00:03<00:00, 391476.96it/s]
100%|██████████████████████████████████████████████████████████████████████| 296215/296215 [00:00<00:00, 381151.10it/s]
100%|██████████████████████████████████████████████████████████████████████| 693359/693359 [00:01<00:00, 397183.11it/s]


In [10]:
from sklearn.feature_extraction.text import CountVectorizer 

vectorizer = CountVectorizer(lowercase=False, binary=True)
vectorizer.fit(x_train['brand_name'].values)

train_brand_oneHot = vectorizer.transform(x_train['brand_name'].values)

cv_brand_oneHot = vectorizer.transform(x_cv['brand_name'].values)

test_brand_oneHot = vectorizer.transform(df_test['brand_name'].values)

print("Shape of matrices after one hot encoding")

print(train_brand_oneHot.shape, "\n", cv_brand_oneHot.shape  ,"\n", test_brand_oneHot.shape)

Shape of matrices after one hot encoding
(1184860, 4509) 
 (296215, 4509) 
 (693359, 4509)


In [11]:
vectorizer = CountVectorizer(lowercase=False, binary=True)
vectorizer.fit(x_train['gencat_name'].values)

train_gencat_oneHot = vectorizer.transform(x_train['gencat_name'].values)

cv_gencat_oneHot = vectorizer.transform(x_cv['gencat_name'].values)

test_gencat_oneHot = vectorizer.transform(df_test['gencat_name'].values)

print("Shape of matrices after one hot encoding")

print(train_gencat_oneHot.shape, "\n", cv_gencat_oneHot.shape, "\n", test_gencat_oneHot.shape)

Shape of matrices after one hot encoding
(1184860, 11) 
 (296215, 11) 
 (693359, 11)


In [12]:
vectorizer = CountVectorizer(lowercase=False, binary=True)
vectorizer.fit(x_train['subcat1_name'].values)

train_subcat1_oneHot = vectorizer.transform(x_train['subcat1_name'].values)

cv_subcat1_oneHot = vectorizer.transform(x_cv['subcat1_name'].values)

test_subcat1_oneHot = vectorizer.transform(df_test['subcat1_name'].values)

print("Shape of matrices after one hot encoding")
print(train_subcat1_oneHot.shape, "\n", cv_subcat1_oneHot.shape, "\n", test_subcat1_oneHot.shape)

Shape of matrices after one hot encoding
(1184860, 114) 
 (296215, 114) 
 (693359, 114)


In [13]:
vectorizer = CountVectorizer(lowercase=False, binary=True)
vectorizer.fit(x_train['subcat2_name'].values)

train_subcat2_oneHot = vectorizer.transform(x_train['subcat2_name'].values)

cv_subcat2_oneHot = vectorizer.transform(x_cv['subcat2_name'].values)

test_subcat2_oneHot = vectorizer.transform(df_test['subcat2_name'].values)

print("Shape of matrices after one hot encoding")
print(train_subcat2_oneHot.shape, "\n", cv_subcat2_oneHot.shape, "\n", test_subcat2_oneHot.shape)

Shape of matrices after one hot encoding
(1184860, 860) 
 (296215, 860) 
 (693359, 860)


### Tfidf vectorization on text features

* 1-3 grams of name<br>

* 1-3 grams of item_description

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=3, max_features=250000)
vectorizer.fit(x_train['preprocessed_name'].values)

train_name_tfidf = vectorizer.transform(x_train['preprocessed_name'].values)

cv_name_tfidf = vectorizer.transform(x_cv['preprocessed_name'].values)

test_name_tfidf = vectorizer.transform(df_test['preprocessed_name'].values.astype('U'))

print("Shape of matrices after vectorization")
print(train_name_tfidf.shape, "\n", cv_name_tfidf.shape, "\n", test_name_tfidf.shape)

Shape of matrices after vectorization
(1184860, 250000) 
 (296215, 250000) 
 (693359, 250000)


In [15]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=5, max_features=500000)
vectorizer.fit(x_train['preprocessed_description'].values)

train_description_tfidf = vectorizer.transform(x_train['preprocessed_description'].values)

cv_description_tfidf = vectorizer.transform(x_cv['preprocessed_description'].values)

test_description_tfidf = vectorizer.transform(df_test['preprocessed_description'].values.astype('U'))

print("Shape of matrices after vectorization")
print(train_description_tfidf.shape, "\n", cv_description_tfidf.shape, "\n", test_description_tfidf.shape)

Shape of matrices after vectorization
(1184860, 500000) 
 (296215, 500000) 
 (693359, 500000)


## Normalize numerical features

In [16]:
cols = set(x_train.columns.values) - {'train_id'}

skip_cols = {'preprocessed_name', 'item_condition_id', 'brand_name',
  'shipping', 'preprocessed_description', 'gencat_name',
  'subcat1_name', 'subcat2_name', 'name_first', 'price_in_name'}

cols_to_normalize = cols - skip_cols
print("Normalizing following columns: \n ", cols_to_normalize)

def normalize(df):
    result1 = df.copy()
    for feature_name in df.columns:
        if (feature_name in cols_to_normalize):
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            result1[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result1


Normalizing following columns: 
  {'desc_word_count', 'NameUpper', 'DescriptionPunctCount', 'gencat_mean_price', 'brand_name_count', 'DescriptionUpperRatio', 'mean_name', 'DescriptionUpper', 'weird_characters_name', 'name_desc_len_ratio', 'stopword_ratio_desc', 'NameDigitCount', 'subcat1_name_count', 'subcat1_mean_price', 'des_len', 'subcat2_median_price', 'NameUpperRatio', 'DescriptionLowerRatio', 'num_sum', 'prices_count', 'subcat2_name_count', 'mean_des', 'name_word_count', 'gencat_name_count', 'brand_median_price', 'gencat_median_price', 'condition_mean_price', 'name_first_count', 'name_len', 'subcat1_median_price', 'brand_mean_price', 'NamePunctCount', 'DescriptionLower', 'NameLowerRatio', 'NamePunctCountRatio', 'DescriptionDigitCount', 'name_letters_per_word', 'name_mean_price', 'name_median_price', 'condition_median_price', 'NameDigitCountRatio', 'DescriptionDigitCountRatio', 'weird_characters_desc', 'subcat2_mean_price', 'DescriptionPunctCountRatio', 'NameLower', 'desc_letters_

In [17]:
train_normalized = normalize(x_train)

cv_normalized = normalize(x_cv)

test_normalized = normalize(df_test)

### Remove non-features from dataframes

In [18]:
#Separating and storing all numerical features

X_tr = train_normalized[list(cols_to_normalize)]
X_val = cv_normalized[list(cols_to_normalize)]
X_te = test_normalized[list(cols_to_normalize)]


In [19]:
x_tr_temp=pd.get_dummies(train_normalized[['item_condition_id', 'shipping', 'price_in_name']], sparse=True)
x_cv_temp=pd.get_dummies(cv_normalized[['item_condition_id', 'shipping', 'price_in_name']], sparse=True)
x_test_temp=pd.get_dummies(test_normalized[['item_condition_id', 'shipping', 'price_in_name']], sparse=True)

In [20]:
from scipy.sparse import csr_matrix

# Storing categorical features to sparse matrix

X_tr_cat = csr_matrix(x_tr_temp.values , dtype=np.int8)

X_cv_cat = csr_matrix(x_cv_temp.values, dtype=np.int8)

X_te_cat = csr_matrix(x_test_temp.values , dtype=np.int8)

print(X_tr_cat.shape, X_cv_cat.shape, X_te_cat.shape)

(1184860, 3) (296215, 3) (693359, 3)


###  Consolidate all features to a sparse matrix

In [21]:
from scipy.sparse import hstack

# stack all categorical and text sparse matrices

train_sparse = hstack((train_brand_oneHot, train_gencat_oneHot, train_subcat1_oneHot, train_subcat2_oneHot, \
               train_name_tfidf, train_description_tfidf, X_tr_cat)).tocsr()

cv_sparse = hstack((cv_brand_oneHot, cv_gencat_oneHot, cv_subcat1_oneHot, cv_subcat2_oneHot, \
               cv_name_tfidf, cv_description_tfidf, X_cv_cat)).tocsr()

test_sparse = hstack((test_brand_oneHot, test_gencat_oneHot, test_subcat1_oneHot, test_subcat2_oneHot, \
               test_name_tfidf, test_description_tfidf, X_te_cat)).tocsr()

In [22]:
print(train_sparse.shape, cv_sparse.shape, test_sparse.shape)

(1184860, 755497) (296215, 755497) (693359, 755497)


In [23]:
# stack dense feature matrix with categorical and text vectors

X_train = hstack((X_tr.values, train_sparse)).tocsr()

X_cv = hstack((X_val.values, cv_sparse)).tocsr()

X_test = hstack((X_te.values, test_sparse)).tocsr()

In [24]:
print('X_Train size: {}, X_CV size: {}, X_Test size: {}' .format(X_train.shape, X_cv.shape, X_test.shape))

X_Train size: (1184860, 755544), X_CV size: (296215, 755544), X_Test size: (693359, 755544)


In [25]:
print('y_Train size: {}, y_CV size: {}' .format(y_train.shape, y_cv.shape))

y_Train size: (1184860,), y_CV size: (296215,)


## Modeling

### 4. LightGBM Regression

#### Training and testing using best parameters 

In [26]:
model = LGBMRegressor(learning_rate=0.3745401188473625, max_depth=14,
              min_child_weight=0.3668695797323276, n_estimators=1395,
              num_leaves=40, random_state=42, subsample=0.9, n_jobs=-1)

model.fit(X_train, y_train)

LGBMRegressor(learning_rate=0.3745401188473625, max_depth=14,
              min_child_weight=0.3668695797323276, n_estimators=1395,
              num_leaves=40, random_state=42, subsample=0.9)

In [27]:
lgb_preds_tr = model.predict(X_train)
lgb_preds_cv = model.predict(X_cv)


print('Train RMSLE:', sqrt(mse(y_train, lgb_preds_tr)))

print("Cross validation RMSLE: ", sqrt(mse(y_cv, lgb_preds_cv)))

Train RMSLE: 0.367618141036234
Cross validation RMSLE:  0.429925315705115


In [34]:
lgb_preds_te = model.predict(X_test)

In [38]:
#predicted price
lgb_preds_te[:10]

array([2.65666576, 2.94559861, 4.33771795, 3.05841033, 2.49436811,
       2.69120987, 2.90589221, 3.98698268, 4.17450297, 2.61740287])

In [39]:
#predicted price
lgb_preds_te[50:60]

array([2.08535089, 3.57037173, 2.6698779 , 3.38802678, 2.88470724,
       2.77345203, 2.69827774, 2.52460266, 3.04061196, 2.74620167])

In [40]:
#predicted price
lgb_preds_te[10:]

array([3.74392025, 2.83698055, 3.19565149, ..., 2.67891744, 2.75119776,
       2.89376643])