In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use(style='ggplot')
plt.rcParams['figure.figsize'] = (10, 6)

import seaborn as sns

In [None]:
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Data Import

Extract the 7z archives on the Kaggle servers.

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

In [None]:
!unzip /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

In [None]:
# Both competition files are tab-separated and are read from the working directory.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('train.tsv', 'test_stg2.tsv')
)

In [None]:
# Report (rows, columns) for each split.
for split_label, frame in (("Train", train), ("Test", test)):
    print(f"{split_label} data shape:", frame.shape)

## Data Exploration

In [None]:
train.head(10)

In [None]:
train.info()

The columns `category_name`, `brand_name` and `item_description` contain nulls.

## Target distribution

We are trying to predict the item's price, so we will investigate the distribution of this variable and check whether it is skewed.

In [None]:
train.price.describe().apply(lambda x: format(x, 'f'))

Some prices lie far above the third quartile; they may be genuinely high-priced products or outliers.

In [None]:
np.percentile(train.price, 99)

In [None]:
# Quantify the asymmetry of the price column, then plot its distribution
# with a kernel-density overlay.
price_skew = train['price'].skew()
print("Skew is:", price_skew)
sns.displot(train['price'], kde=True)
plt.show()

Apply `log1p` to the price variable.

In [None]:
sns.displot(np.log1p(train['price']))

In [None]:
np.log1p(train['price']).hist()

The price is skewed, so we will apply the log transform to it.

In [None]:
# Keep the untransformed target in `price_raw` the first time this cell runs and
# always derive the log-price from that copy. Re-running the cell then yields the
# same values instead of applying log1p a second time to already-logged prices.
if 'price_raw' not in train.columns:
    train['price_raw'] = train['price']
train['price'] = np.log1p(train['price_raw'])

Now we will explore the other variables and their values.

In [None]:
train['shipping'].value_counts()

In [None]:
train['item_description'].value_counts().head()

In [None]:
train['item_condition_id'].value_counts()

In [None]:
train['brand_name'].value_counts().head()

In [None]:
train['category_name'].value_counts().head(10)

We may need to split the categories so that we can match the common ones and get better insights from the data.

## Categories handling

In [None]:
def category_split(category_name, levels=3, fill='Missing'):
    """Split a '/'-separated category path into exactly ``levels`` parts.

    Parameters
    ----------
    category_name : str or any
        Category path such as ``'Men/Tops/T-shirts'``. Any non-string value
        (e.g. NaN or None for a missing category) is treated as missing.
    levels : int, default 3
        Number of parts to return.
    fill : str, default 'Missing'
        Placeholder used for missing input and for padding short paths.

    Returns
    -------
    list of str
        Always ``levels`` items: the leading path components, truncated if the
        path is deeper than ``levels`` and padded with ``fill`` if shallower.
        A fixed length keeps a three-way unpack of the results well-defined
        for every row.
    """
    # An explicit type check replaces a bare ``except``, which would also hide
    # unrelated errors (including KeyboardInterrupt).
    if not isinstance(category_name, str):
        return [fill] * levels

    parts = category_name.split('/')[:levels]
    return parts + [fill] * (levels - len(parts))

In [None]:
# Derive the three category-level columns on both frames. zip(*rows) transposes
# the per-row lists into one sequence per level.
for frame in (train, test):
    split_rows = frame['category_name'].apply(category_split)
    frame['main_cat'], frame['sub_cat'], frame['item_cat'] = zip(*split_rows)

# Missing Data 

Imputing values for the missing data does not seem like good practice for prediction, so we will fill every missing value with the literal string `Missing`.

In [None]:
# Replace nulls in the text/categorical training columns with the literal 'Missing'.
for column in ('category_name', 'brand_name', 'item_description'):
    train[column] = train[column].fillna(value='Missing')

In [None]:
# Same null handling as the training frame: fill with the literal 'Missing'.
for column in ('category_name', 'brand_name', 'item_description'):
    test[column] = test[column].fillna(value='Missing')

# Vectorization

### Vectorize name

Vectorize name column using BoW Model. 

In [None]:
# Bag-of-words encoder for the `name` column, using CountVectorizer defaults.
cnt_vec = CountVectorizer()

# Learn the vocabulary from the training names only, then encode the test names
# with that same fitted vocabulary so both matrices share their columns.
X_train_name = cnt_vec.fit_transform(train['name'])
X_test_name = cnt_vec.transform(test['name'])

In [None]:
# Shapes of the encoded name matrices: train first, then test.
for name_matrix in (X_train_name, X_test_name):
    print(name_matrix.shape)

### Vectorize Item Description 

Vectorize Item Description using TF-IDF Model.

In [None]:
# TF-IDF over unigrams through trigrams, dropping scikit-learn's built-in English
# stop-word list and capping the vocabulary at 50,000 features.
tfidf_descp = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), stop_words='english')

# Fit on the training descriptions only; the test descriptions are encoded with
# the fitted vectorizer.
X_train_descp = tfidf_descp.fit_transform(train['item_description'])
X_test_descp = tfidf_descp.transform(test['item_description'])

### One-hot encoding via LabelBinarizer

To keep the feature matrix sparse, we will apply one-hot encoding to the categorical variables.

In [None]:
from sklearn.preprocessing import LabelBinarizer

Apply `LabelBinarizer` to each categorical variable in the train and test datasets.

In [None]:
def _binarize_column(column):
    """Fit a sparse-output LabelBinarizer on train[column] and encode both splits.

    Returns a tuple ``(fitted_binarizer, encoded_train, encoded_test)``.
    """
    binarizer = LabelBinarizer(sparse_output=True)
    encoded_train = binarizer.fit_transform(train[column])
    encoded_test = binarizer.transform(test[column])
    return binarizer, encoded_train, encoded_test


lb_brand_name, X_train_brand, X_test_brand = _binarize_column('brand_name')

(lb_item_cond_id,
 X_train_item_condition_id,
 X_test_item_condition_id) = _binarize_column('item_condition_id')

lb_shipping, X_train_shipping, X_test_shipping = _binarize_column('shipping')

In [None]:
# Encode each category level with its own sparse-output LabelBinarizer, fitted on
# the training split and then applied unchanged to the test split.
_category_encodings = {}
for _level in ('main_cat', 'sub_cat', 'item_cat'):
    _binarizer = LabelBinarizer(sparse_output=True)
    _category_encodings[_level] = (
        _binarizer,
        _binarizer.fit_transform(train[_level]),
        _binarizer.transform(test[_level]),
    )

lb_main_cat, X_train_main_cat, X_test_main_cat = _category_encodings['main_cat']
lb_sub_cat, X_train_sub_cat, X_test_sub_cat = _category_encodings['sub_cat']
lb_item_cat, X_train_item_cat, X_test_item_cat = _category_encodings['item_cat']

Print the shape of each encoded matrix so we know the size of our data after encoding.

In [None]:
# Show the container type the binarizers produced, one line per split.
for encoded_group in (
    (X_train_brand, X_train_item_condition_id, X_train_shipping),
    (X_test_brand, X_test_item_condition_id, X_test_shipping),
):
    print(*map(type, encoded_group))

In [None]:
# Shape of every label-binarized training matrix.
_train_encoded = {
    'X_train_brand': X_train_brand,
    'X_train_item_cond_id': X_train_item_condition_id,
    'X_train_shipping': X_train_shipping,
    'X_train_main_cat': X_train_main_cat,
    'X_train_sub_cat': X_train_sub_cat,
    'X_train_item_cat': X_train_item_cat,
}
for _label, _matrix in _train_encoded.items():
    print(f'{_label} shape:', _matrix.shape)

In [None]:
# Shape of every label-binarized test matrix.
_test_encoded = {
    'X_test_brand': X_test_brand,
    'X_test_item_cond_id': X_test_item_condition_id,
    'X_test_shipping': X_test_shipping,
    'X_test_main_cat': X_test_main_cat,
    'X_test_sub_cat': X_test_sub_cat,
    'X_test_item_cat': X_test_item_cat,
}
for _label, _matrix in _test_encoded.items():
    print(f'{_label} shape:', _matrix.shape)

## Stacking X_Train

Stack the training feature matrices into a single X_train matrix so we can print its combined shape.

In [None]:
from scipy.sparse import hstack
import gc

In [None]:
# Every encoded training feature block, in the order the blocks are listed here.
sparse_matrix_list = (X_train_name, X_train_descp, X_train_brand, 
                      X_train_item_condition_id, X_train_shipping, 
                      X_train_main_cat, X_train_sub_cat, X_train_item_cat)

Convert this matrix to Compressed Sparse Row format.

In [None]:
# hstack joins the feature blocks side by side; .tocsr() converts the result to CSR.
X_train = hstack(sparse_matrix_list).tocsr()
print(type(X_train), X_train.shape)

Garbage Collecting

In [None]:
# Drop the name bound to the stacked matrix and run a garbage-collection pass;
# the matrix is rebuilt later where it is actually needed.
del X_train
gc.collect()

# Evaluation

In [None]:
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between ``y`` and ``y_pred``.

    Both arguments are passed through ``log1p`` before differencing.
    """
    log_gap = np.log1p(y) - np.log1p(y_pred)
    squared_gap = np.power(log_gap, 2)
    return np.sqrt(np.mean(squared_gap))

def evaluate_orig_price(y_test, preds):
    """Score log1p-space targets and predictions with RMSLE on the original scale.

    Both arguments are mapped back with ``expm1`` and then handed to ``rmsle``.
    """
    predicted_price = np.expm1(preds)
    actual_price = np.expm1(y_test)
    return rmsle(actual_price, predicted_price)

In [None]:
def model_train_predict(model, matrix_list, y=None, test_size=0.2, random_state=42):
    """Fit ``model`` on a hold-out split of the stacked features and predict the rest.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    matrix_list : sequence of sparse matrices
        Feature blocks with one row per training example; they are stacked
        horizontally and converted to CSR.
    y : array-like, optional
        Target aligned with the rows of the stacked matrix. Defaults to the
        notebook-level ``train['price']``.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int or None, default 42
        Seed for the split. A fixed seed makes the score reproducible and
        scores different feature sets on the same held-out rows, so their
        results are directly comparable. Pass ``None`` for an unseeded split.

    Returns
    -------
    (preds, y_test)
        Predictions for the held-out rows and the matching true targets.
    """
    if y is None:
        y = train['price']

    X = hstack(matrix_list).tocsr()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Release the large intermediates before returning; only the predictions
    # and the held-out targets are needed by the caller.
    del X, X_train, X_test, y_train
    gc.collect()

    return preds, y_test

# Modeling

## Ridge Regression Model

In [None]:
linear_model = Ridge(solver='lsqr', fit_intercept=False)

# Run 1: every feature block except the TF-IDF item description (X_train_descp).
sparse_matrix_list = (X_train_name, X_train_brand, 
                      X_train_item_condition_id, X_train_shipping, 
                      X_train_main_cat, X_train_sub_cat, X_train_item_cat)

linear_preds, y_test = model_train_predict(model=linear_model, 
                                           matrix_list=sparse_matrix_list)

# Label each score by whether the description features were included, so the
# two printed numbers can be told apart.
print('Without Item Description rmsle:', evaluate_orig_price(y_test, linear_preds))

# Run 2: the same blocks plus the TF-IDF item description.
sparse_matrix_list = (X_train_name, X_train_descp, X_train_brand, 
                      X_train_item_condition_id, X_train_shipping, 
                      X_train_main_cat, X_train_sub_cat, X_train_item_cat)


linear_preds, y_test = model_train_predict(model=linear_model, 
                                           matrix_list=sparse_matrix_list)
print('With Item Description rmsle:', evaluate_orig_price(y_test, linear_preds))

# Prediction

In [None]:
# Full training feature set (description included), stacked side by side into
# one CSR matrix for the final fit.
sparse_matrix_list = (X_train_name, X_train_descp, X_train_brand, 
                      X_train_item_condition_id, X_train_shipping, 
                      X_train_main_cat, X_train_sub_cat, X_train_item_cat)

X_train = hstack(sparse_matrix_list).tocsr()
X_train

In [None]:
# Test feature blocks listed in the same order as the training blocks, so the
# stacked columns line up with the columns the model is fitted on.
sparse_matrix_list = (X_test_name, X_test_descp, X_test_brand, 
                      X_test_item_condition_id, X_test_shipping, 
                      X_test_main_cat, X_test_sub_cat, X_test_item_cat)
X_test = hstack(sparse_matrix_list).tocsr()

In [None]:
y_train = train['price']
y_train

In [None]:
linear_model.fit(X_train, y_train)

In [None]:
preds = linear_model.predict(X_test)
preds

In [None]:
# Map the log1p-space predictions back to prices. Predicting from the fitted
# model here, rather than transforming `preds` in place, keeps the cell
# idempotent: running it twice no longer applies expm1 twice.
preds = np.expm1(linear_model.predict(X_test))
preds

In [None]:
submission = pd.read_csv('sample_submission_stg2.csv')
submission

In [None]:
submission.loc[:, 'price'] = preds
submission

In [None]:
submission.to_csv('submission.csv', index=False)