In [3]:
import pandas as pd
import numpy as np
##import gensim     ## use it while applying Glove, Word2Vec etc

##### Reading the pre-processed train & test datasets. Steps such as EDA and text pre-processing have already been applied to the datasets below.

In [4]:
## Load the train and test splits that were pre-processed in an earlier step.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('processed_train.csv', 'processed_test.csv')
)

##### Since we have split the category_name feature into sub-category columns, there is no need for the original 'category_name' column in our dataset, so we drop it. Similarly, the words of the 'item_description' and 'name' columns have been tokenised into separate columns.

In [5]:
## Drop the redundant columns from the train set: the leftover CSV index column
## and the raw name / category / description columns.
redundant_train_cols = ["Unnamed: 0", "name", "category_name", "item_description"]
train = train.drop(columns=redundant_train_cols)

In [6]:
## Drop the redundant columns from the test set: the leftover CSV index column
## and the un-split category column.
redundant_test_cols = ['Unnamed: 0', 'category_name']
test = test.drop(columns=redundant_test_cols)

In [7]:
## Preview the first five rows of the train set after dropping columns.
train.head(n=5)

Unnamed: 0,train_id,item_condition_id,brand_name,price,shipping,main_cat,sub_cat1,sub_cat2,log_price,description_wc,name_wc,tokenized_name,tokenized_description
0,204360,3,The Children's Place,10.0,0,Kids,Girls (4+),Tops & T-Shirts,2.397895,32,5,"['3', 'girl', 't', 'shirt']","['cute', 'shirt', 'littl', 'girl', 'size', 'br..."
1,1191320,3,Topps,17.0,1,Vintage & Collectibles,Trading Cards,Sports,2.890372,14,5,"['4', 'old', 'basebal', 'card']","['includ', 'hall', 'fame', 'player', 'furillo'..."
2,270664,2,Fox Racing,15.0,1,Men,Tops,T-shirts,2.772589,3,5,"['alpinestar', 'men', 'tee', 'shirt']","['descript', 'yet']"
3,823747,2,Not Known,75.0,1,Women,Jeans,"Slim, Skinny",4.330733,3,7,"['hold', 'distress', 'short', 'bundl']",['hold']
4,852490,3,Not Known,9.0,0,Women,Tops & Blouses,Knit Top,2.302585,6,6,"['junior', 'plaid', 'tie', 'front', 'top']","['cute', 'lace', 'plaid', 'cotton', 'top']"


In [8]:
## Preview the first five rows of the test set after dropping columns.
test.head(n=5)

Unnamed: 0,id,name,item_condition_id,brand_name,shipping,item_description,main_cat,sub_cat1,sub_cat2
0,1048243,Herschel Retreat Backpack,2,Urban Outfitters,0,In great condition! Only issue is magnet that ...,Men,Men's Accessories,"Backpacks, Bags & Briefcases"
1,86986,2 pairs of Lebron,3,Nike,1,Lebron soldier 10 size 10.5 9/10 condition Leb...,Men,Shoes,Athletic
2,698316,Mario Kart Double Dash with bonus disc,3,Nintendo,1,Good shape work no problem,Vintage & Collectibles,Electronics,Video Game
3,268868,Tokidoki unicorno kaiju and strawberry,2,tokidoki,1,Both new with foil and box.,Vintage & Collectibles,Collectibles,Figurine
4,1360398,"Like Ugg Cardy, grey knit top boot",2,,0,"Brand new, not even a foot near them! Size 8 b...",Women,Shoes,Boots


##### To apply the Linear Regression model, we have to separate the target variable from the original dataset and feed it separately. The test dataset shouldn't contain the target variable.

In [9]:
## Separate the regression target from the train features.
## y_train is the log-transformed price. The raw `price` column is removed from the
## features too: it is the same target before the log transform, so leaving it in
## x_train would leak the answer into any model that consumed that column.
target_cols = ['log_price', 'price']
x_train = train.drop(columns=target_cols)
y_train = train['log_price']

In [10]:
## Fill missing brand_name and item_description values in the test set with
## placeholder strings (per the original note, this mirrors the train dataset).
test_fill_values = {
    'brand_name': 'Not Known',
    'item_description': 'No Description Yet',
}
test = test.fillna(value=test_fill_values)

In [11]:
## The test set has no price column, so the feature frame is simply an
## independent (deep) copy of it.
x_test = test.copy(deep=True)

##### Let's apply one-hot encoding to the categorical columns of the train & test datasets
The features brand_name, item_condition_id, shipping, main_cat, sub_cat1 and sub_cat2 are one-hot encoded. The item_description feature is handled later with a bag-of-words text vectorizer.

In [12]:
## One-hot encoding of the categorical features
from sklearn.preprocessing import OneHotEncoder


def onehot_encoding(feature):
    """One-hot encode one column of the train and test feature frames.

    The encoder is fitted on ``x_train[feature]`` only and then applied to
    ``x_test[feature]``; both frames are read from the notebook's globals.
    Categories that occur only in the test column are ignored
    (``handle_unknown='ignore'``) instead of raising an error.

    Parameters
    ----------
    feature : str
        Name of the column to encode; it must be present in both
        ``x_train`` and ``x_test``.

    Returns
    -------
    tuple
        ``(train_encoded, test_encoded)``: the results of the encoder's
        ``fit_transform`` on the train column and ``transform`` on the
        test column.
    """
    def as_column(frame):
        # The encoder expects 2-D input, so reshape the column to (n_rows, 1).
        return frame[feature].values.reshape(-1, 1)

    one_hot = OneHotEncoder(handle_unknown='ignore')
    train_encoded = one_hot.fit_transform(as_column(x_train))
    test_encoded = one_hot.transform(as_column(x_test))
    return train_encoded, test_encoded


## One (train, test) pair of encoded matrices per categorical feature.
x_train_brand, x_test_brand = onehot_encoding('brand_name')
x_train_condition, x_test_condition = onehot_encoding('item_condition_id')
x_train_shipping, x_test_shipping = onehot_encoding('shipping')
x_train_main_cat, x_test_main_cat = onehot_encoding('main_cat')
x_train_subcat_1, x_test_subcat_1 = onehot_encoding('sub_cat1')
x_train_subcat_2, x_test_subcat_2 = onehot_encoding('sub_cat2')

### Applying CountVectorizer (bag-of-words) to the item_description feature

###### In train dataset, the item_description column has been tokenized into lists of words. For test dataset, we are passing the original item_description column

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

## Binary bag-of-words over unigrams and bigrams; terms that occur in fewer
## than 10 training documents are left out of the vocabulary.
bow = CountVectorizer(binary=True, ngram_range=(1, 2), min_df=10)

## The vocabulary is learned from the train column only and reused for test.
## NOTE(review): train is vectorized from `tokenized_description`, while test is
## vectorized from the raw `item_description`. The train preview shows stemmed
## tokens (e.g. 'littl', 'basebal'), so raw test words may not match the learned
## vocabulary -- confirm whether test needs the same text pre-processing first.
train_descriptions = x_train['tokenized_description']
test_descriptions = x_test['item_description']
x_train_bow_desc = bow.fit_transform(train_descriptions)
x_test_bow_desc = bow.transform(test_descriptions)

In [32]:
## Show the shape and number of stored elements of the train bag-of-words matrix.
x_train_bow_desc

<629308x160619 sparse matrix of type '<class 'numpy.int64'>'
	with 17310000 stored elements in Compressed Sparse Row format>

In [33]:
## Show the shape and number of stored elements of the test bag-of-words matrix.
x_test_bow_desc

<444761x160619 sparse matrix of type '<class 'numpy.int64'>'
	with 6501338 stored elements in Compressed Sparse Row format>

In [14]:
## Stack the per-feature sparse matrices side by side into one feature matrix
## per split. Both lists use the same block order so that train and test
## columns line up.
from scipy.sparse import hstack

train_blocks = [
    x_train_brand,
    x_train_condition,
    x_train_shipping,
    x_train_main_cat,
    x_train_subcat_1,
    x_train_subcat_2,
    x_train_bow_desc,
]
test_blocks = [
    x_test_brand,
    x_test_condition,
    x_test_shipping,
    x_test_main_cat,
    x_test_subcat_1,
    x_test_subcat_2,
    x_test_bow_desc,
]
x_train_final = hstack(train_blocks)
x_test_final = hstack(test_blocks)


### Applying Linear Regression Model

In [2]:
## log1p --> log(x+1)  for dealing with 0 values in logarithms
## Use this if target var price has not been applied with log before
##y_train_log = np.log1p(y_train)

In [15]:
## Linear model fitted by minimising squared error with SGD (Stochastic Gradient Descent).
## Randomised Search is used to tune the regularisation constant `alpha`; it samples
## candidate settings from the given list and scores each with 4-fold cross-validation
## on negative mean squared error.

from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDRegressor

## `loss` is left at its default, which is ordinary squared error. The explicit value
## 'squared_loss' was deprecated in scikit-learn 1.0 and removed in 1.2, where passing
## it raises an error; relying on the default works on both old and new versions.
LR = SGDRegressor(random_state=42)
## Candidate alphas: 1e-07, 1e-06, ..., 1, 10 (nine values).
alpha = [10**i for i in range(-7,2)]
params = {'alpha':alpha}
RS_LR = RandomizedSearchCV(LR, params, scoring='neg_mean_squared_error', cv=4, n_jobs=-1, random_state=42)
RS_LR.fit(x_train_final, y_train)



RandomizedSearchCV(cv=4, estimator=SGDRegressor(random_state=42), n_jobs=-1,
                   param_distributions={'alpha': [1e-07, 1e-06, 1e-05, 0.0001,
                                                  0.001, 0.01, 0.1, 1, 10]},
                   random_state=42, scoring='neg_mean_squared_error')

In [16]:
from sklearn.linear_model import SGDRegressor

## Fit the final model on the full training matrix and predict the test set.
## `loss` is left at its default (squared error): the explicit value 'squared_loss'
## was deprecated in scikit-learn 1.0 and removed in 1.2, where it raises an error.
## NOTE(review): alpha=1e-07 is hard-coded -- presumably the best value found by the
## randomised search; confirm against RS_LR.best_params_.
LR = SGDRegressor(alpha=1e-07, random_state=42)
LR.fit(x_train_final, y_train)
## The model is trained on y_train (log_price), so predictions are on that log scale.
y_pred_lr = LR.predict(x_test_final)


In [17]:
## Predicted values for the x_test dataset (numpy array).
## The model was fitted on log_price, so these are log-scale values, not raw prices.
y_pred_lr

array([3.24266009, 3.81330045, 2.82793578, ..., 2.96192282, 3.64058056,
       2.89248651])

##### Building CSV file from predicted price results
Submission.csv should contain two columns : id, price

In [18]:
## Wrap the prediction array in a one-column DataFrame (default column label 0)
## and preview its first five rows.
df = pd.DataFrame(data=y_pred_lr)
df.head(n=5)

Unnamed: 0,0
0,3.24266
1,3.8133
2,2.827936
3,2.993224
4,3.450577


In [21]:
## Keep the test identifiers as a Series so they can be paired with the predictions.
test_id = test.loc[:, 'id']

In [22]:
## Place the test ids and the predicted values side by side (aligned on row index).
result = pd.concat(objs=[test_id, df], axis='columns')

In [23]:
## Preview the first five rows of the id / prediction table.
result.head(n=5)

Unnamed: 0,id,0
0,1048243,3.24266
1,86986,3.8133
2,698316,2.827936
3,268868,2.993224
4,1360398,3.450577


In [26]:
## Create the CSV file for submission with exactly two columns:
## column 1 --> id
## column 2 --> price
submission = result.copy()
## The prediction column still carries the default label 0; name both columns
## the way the submission format expects.
submission.columns = ['id', 'price']
## The model predicts log_price, which in the train preview equals log(price + 1)
## (e.g. price 10.0 -> 2.397895), so invert it with expm1 to get back to price units.
submission['price'] = np.expm1(submission['price'])
## index=False keeps the DataFrame row index out of the file; otherwise it would be
## written as an extra, unnamed first column.
submission.to_csv('submission_kaggle.csv', index=False)