In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

In [None]:
!unzip /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

# Data preparation

In [None]:
import math
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Training split, read from the tab-separated file in the working directory.
train_path = 'train.tsv'
train_df = pd.read_csv(train_path, sep='\t')
train_df.shape

In [None]:
# Stage-2 test split, read from the tab-separated file in the working directory.
test_path = 'test_stg2.tsv'
test_df = pd.read_csv(test_path, sep='\t')
test_df.shape

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Summarise the training frame's columns (dtypes and non-null counts).
train_df.info()

# Target's distribution

In [None]:
# Histogram of the price column. `sns.distplot` is deprecated (and removed in
# recent seaborn releases); `histplot` is its replacement. bins=50 mirrors the
# cap distplot applied to its automatic bin count.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(train_df['price'], bins=50, kde=False, ax=ax)
ax.set(title='Price distribution', xlabel='price', ylabel='count');

In [None]:
# Histogram of log1p(price). `sns.distplot` is deprecated (and removed in
# recent seaborn releases); `histplot` is its replacement. bins=50 mirrors the
# cap distplot applied to its automatic bin count.
fig, ax = plt.subplots()
sns.histplot(np.log1p(train_df['price']), bins=50, kde=False, ax=ax)
ax.set(title='log1p(price) distribution', xlabel='log1p(price)', ylabel='count');

## Apply log1p to price

In [None]:
# Replace `price` with log1p(price) for modelling.
# The untransformed values are kept in `price_raw` so that re-running this cell
# always transforms the original prices instead of applying log1p twice.
if 'price_raw' not in train_df.columns:
    train_df['price_raw'] = train_df['price']
train_df['price'] = np.log1p(train_df['price_raw'])

# Overview of other features

In [None]:
# Number of training rows for each `shipping` value.
train_df['shipping'].value_counts()

In [None]:
# Number of training rows for each `item_condition_id` value.
train_df['item_condition_id'].value_counts()

In [None]:
# Most frequent description strings; repeated values point to placeholder text.
train_df['item_description'].value_counts()

## 'No description yet'

In [None]:
# Count rows whose description is exactly the placeholder text.
is_placeholder = train_df['item_description'] == 'No description yet'
is_placeholder.sum()

# category_name

In [None]:
def split_cat(category_name):
    """Split a '/'-separated category path into exactly three levels.

    Parameters
    ----------
    category_name : str or any
        Category path such as 'Men/Tops/T-shirts'. Non-string values
        (e.g. NaN for a missing category) are treated as missing.

    Returns
    -------
    list of str
        Always three elements. Missing input yields three 'Other_Null'
        entries; paths deeper than three levels keep their first three
        levels; shallower paths are padded with 'Other_Null'.
    """
    placeholder = 'Other_Null'
    # Explicit type check instead of a bare `except:` that hid every error.
    if not isinstance(category_name, str):
        return [placeholder] * 3
    # Fixed length keeps a three-way unpack of the results safe for every row.
    levels = category_name.split('/')[:3]
    return levels + [placeholder] * (3 - len(levels))

In [None]:
# Split category_name into per-level columns for both frames.
# zip(*...) transposes the per-row lists into one tuple per level.
train_levels = zip(*train_df['category_name'].apply(split_cat))
train_df['cat_1'], train_df['cat_2'], train_df['cat_3'] = train_levels

test_levels = zip(*test_df['category_name'].apply(split_cat))
test_df['cat_1'], test_df['cat_2'], test_df['cat_3'] = test_levels

In [None]:
# Number of distinct values at each category level.
for level in ('cat_1', 'cat_2', 'cat_3'):
    print(train_df[level].nunique())

In [None]:
# Replace missing values in the text/categorical columns with a shared
# placeholder, applying the same treatment to train and test.
for frame in (train_df, test_df):
    for column in ('brand_name', 'category_name', 'item_description'):
        frame[column] = frame[column].fillna(value='Other_Null')

In [None]:
# Remaining missing values per column in the training frame.
train_df.isnull().sum()

In [None]:
# Remaining missing values per column in the test frame.
test_df.isnull().sum()

# brand_name

In [None]:
# Number of distinct brand names (dropna=False so a NaN would still be counted
# as one value, matching len(unique())).
train_df['brand_name'].nunique(dropna=False)

In [None]:
# Five most frequent brand names.
train_df['brand_name'].value_counts().head(5)

# name

In [None]:
# Number of distinct listing names in the training frame.
train_df['name'].nunique()

In [None]:
# Ten most frequent listing names.
train_df['name'].value_counts().head(10)

# item_description

In [None]:
# Average description length in characters.
train_df['item_description'].str.len().mean()

In [None]:
# Show the first two descriptions.
train_df['item_description'].head(2)

# Vectorize name, item_description

In [None]:
# Bag-of-words token counts for the listing name.
cnt_vec = CountVectorizer()

# The vocabulary is learned from the training names only; the test names are
# encoded with that same vocabulary so both matrices share the same columns.
X_train_name = cnt_vec.fit_transform(train_df['name'])
X_test_name = cnt_vec.transform(test_df['name'])

In [None]:
# Shapes of the name count matrices (train, then test).
for name_matrix in (X_train_name, X_test_name):
    print(name_matrix.shape)

In [None]:
# TF-IDF features for the description: 1- to 3-grams, English stop words
# removed, vocabulary capped at 50,000 terms.
tfidf_descp = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), stop_words='english')

# Fit on the training descriptions only; the test descriptions reuse the
# fitted vocabulary and IDF weights.
X_train_descp = tfidf_descp.fit_transform(train_df['item_description'])
X_test_descp = tfidf_descp.transform(test_df['item_description'])

In [None]:
# Shapes of the description TF-IDF matrices (train, then test).
for descp_matrix in (X_train_descp, X_test_descp):
    print(descp_matrix.shape)

# One-hot encoding via LabelBinarizer (for CSR sparse matrix)

In [None]:
from sklearn.preprocessing import LabelBinarizer


def _one_hot(column):
    """Fit a sparse LabelBinarizer on train_df[column] and encode both frames.

    Returns a (fitted binarizer, train matrix, test matrix) tuple. The
    binarizer is fitted on the training column only and reused for test.
    """
    binarizer = LabelBinarizer(sparse_output=True)
    train_encoded = binarizer.fit_transform(train_df[column])
    test_encoded = binarizer.transform(test_df[column])
    return binarizer, train_encoded, test_encoded


# One encoder per categorical column, in the same order as before.
lb_brand_name, X_train_brand, X_test_brand = _one_hot('brand_name')
lb_item_cond_id, X_train_item_condition_id, X_test_item_condition_id = _one_hot('item_condition_id')
lb_shipping, X_train_shipping, X_test_shipping = _one_hot('shipping')
lb_cat_1, X_train_cat_1, X_test_cat_1 = _one_hot('cat_1')
lb_cat_2, X_train_cat_2, X_test_cat_2 = _one_hot('cat_2')
lb_cat_3, X_train_cat_3, X_test_cat_3 = _one_hot('cat_3')

In [None]:
# Confirm the encoders returned sparse matrices (one line for train, one for test).
for group in (
    (X_train_brand, X_train_item_condition_id, X_train_shipping),
    (X_test_brand, X_test_item_condition_id, X_test_shipping),
):
    print(*(type(matrix) for matrix in group))

# Report the shape of every encoded block, train first and then test.
for label, matrix in (
    ('X_train_brand shape:', X_train_brand),
    ('X_train_item_cond_id shape:', X_train_item_condition_id),
    ('X_train_shipping shape:', X_train_shipping),
    ('X_train_cat_1 shape:', X_train_cat_1),
    ('X_train_cat_2 shape:', X_train_cat_2),
    ('X_train_cat_3 shape:', X_train_cat_3),
    ('X_test_brand shape:', X_test_brand),
    ('X_test_item_cond_id shape:', X_test_item_condition_id),
    ('X_test_shipping shape:', X_test_shipping),
    ('X_test_cat_1 shape:', X_test_cat_1),
    ('X_test_cat_2 shape:', X_test_cat_2),
    ('X_test_cat_3 shape:', X_test_cat_3),
):
    print(label, matrix.shape)

## Preview X_train

In [None]:
from scipy.sparse import hstack
import gc

# Training feature blocks, in the column order used for the design matrix.
sparse_matrix_list = (
    X_train_name,
    X_train_descp,
    X_train_brand,
    X_train_item_condition_id,
    X_train_shipping,
    X_train_cat_1,
    X_train_cat_2,
    X_train_cat_3,
)

# Stack once just to inspect the resulting type and shape.
X_train = hstack(sparse_matrix_list).tocsr()
print(type(X_train), X_train.shape)

# Drop the preview matrix and reclaim its memory before modelling.
del X_train
gc.collect()

# Modeling

In [None]:
# Assemble the training design matrix and hold out 20% of it for validation.
# NOTE: X_test / y_test are a validation split carved out of train_df; they are
# not features of the competition test set (test_df).
X = hstack(sparse_matrix_list).tocsr()
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, train_df['price'], test_size=0.2, random_state=42
)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Baseline linear model trained with stochastic gradient descent.
# random_state fixes the shuffling SGD performs so the score is reproducible.
linear_model = SGDRegressor(random_state=42)
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)
# mean_squared_error expects (y_true, y_pred); the value is the hold-out RMSE.
print(math.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# # Stochastic Gradient Descent Regression
# # Using GridSearchCV to find out the best hyperparameters

# parameters = {'eta0': [0.03, 0.01, 0.003, 0.001, 0.0003],
#              'penalty': ['l1', 'l2', 'elasticnet'],
#              'alpha': [1, 0.3, 0.1, 0.03, 0.01, 0.003, 0.001, 0.0003]}
# sgd_reg = SGDRegressor(random_state=42)

# clf = GridSearchCV(sgd_reg, parameters, verbose=1)
# clf.fit(X_train, y_train)

In [None]:
# Report RMSE on the fitted split and on the held-out split, in that order.
# y_pred ends up holding the held-out predictions, as before.
for label, features, target in (
    ("SGD Regression Train RMSE: ", X_train, y_train),
    ("SGD Regression Test RMSE: ", X_test, y_test),
):
    y_pred = linear_model.predict(features)
    print(label, math.sqrt(mean_squared_error(target, y_pred)))