In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install py7zr

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# Seed NumPy's global random number generator so the notebook's output is stable across runs
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Label font sizes: 14 for axes labels, 12 for the x and y tick labels.
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

In [None]:
import py7zr
# Unpack the compressed training set into the current working directory.
train_archive_path = '/kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z'
with py7zr.SevenZipFile(train_archive_path, mode='r') as archive:
    archive.extractall()


In [None]:
import py7zr
# Unpack the compressed test set into the current working directory.
test_archive_path = '/kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z'
with py7zr.SevenZipFile(test_archive_path, mode='r') as archive:
    archive.extractall()

In [None]:
# Load the extracted tab-separated train/test files and report their shapes.
print("Loading data...")
train = pd.read_csv("../working/train.tsv", sep="\t")
test = pd.read_csv("../working/test.tsv", sep="\t")
print(train.shape, test.shape, sep="\n")

In [None]:
# Preview only the first rows instead of rendering the whole frame;
# its shape is already printed by the loading cell.
train.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame's numeric columns.
train.describe()

In [None]:
# Summary (count, unique, top, freq) restricted to the object-dtype columns.
train.describe(include = ['O'])

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Draw a 50-bin histogram for each numeric column of the training frame.
train.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Histogram (KDE curve disabled) of the log1p-transformed price.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot once the minimum seaborn version of this environment is confirmed.
sns.distplot(np.log1p(train['price']), kde=False)

In [None]:
# Log-transform the target. The untouched prices are kept in 'price_raw' so that
# re-running this cell recomputes the same values instead of applying log1p twice.
if 'price_raw' not in train.columns:
    train['price_raw'] = train['price']
train['price'] = np.log1p(train['price_raw'])

In [None]:
# Number of training rows per distinct value of the shipping column.
train['shipping'].value_counts()

In [None]:
# Number of training rows per distinct item_condition_id.
train['item_condition_id'].value_counts()

In [None]:
def split_cat(category_name):
    """Split a '/'-separated category path into exactly three levels.

    Parameters
    ----------
    category_name : str or missing value
        Category path such as ``'Men/Tops/T-shirts'``. Anything that is not
        a string (for example the float NaN pandas uses for missing text) is
        treated as missing.

    Returns
    -------
    list of str
        ``[top, sub, item]``. Missing input yields
        ``['Others', 'Others', 'Others']``. Paths with fewer than three
        levels are padded with ``'Others'`` and levels beyond the third are
        dropped, so every row unpacks into the same three columns.
    """
    placeholder = 'Others'
    # Explicit type check instead of a bare ``except`` so that unrelated
    # errors (including KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(category_name, str):
        return [placeholder] * 3
    levels = category_name.split('/')[:3]
    # Pad short paths so callers can always unpack three values.
    return levels + [placeholder] * (3 - len(levels))

In [None]:
# Expand category_name into the three level columns for both data sets.
for frame in (train, test):
    frame['cat_top'], frame['cat_sub'], frame['item'] = zip(*frame['category_name'].apply(split_cat))

In [None]:
# Number of distinct values at each category level of the training set.
for level_column in ('cat_top', 'cat_sub', 'item'):
    print(train[level_column].nunique())


In [None]:
# Replace missing values in the text columns with the 'Other_Null' marker, train first, then test.
for frame in (train, test):
    for text_column in ('brand_name', 'category_name', 'item_description'):
        frame[text_column] = frame[text_column].fillna(value='Other_Null')

In [None]:
# Count of missing values per column in the training frame.
train.isnull().sum()

In [None]:
# Count of missing values per column in the test frame.
test.isnull().sum()

In [None]:
# Five most frequent values of brand_name in the training set.
train['brand_name'].value_counts().head()

In [None]:
# Five most frequent values of name in the training set.
train['name'].value_counts().head()

In [None]:
# Mean character length of item_description in the training set.
train['item_description'].str.len().mean()

**Vectorization**

In [None]:
# Bag-of-words token counts for the item name; the vocabulary is learned from train only.
cnt_vec = CountVectorizer()

train_name = cnt_vec.fit_transform(train['name'])
# Re-use the fitted vocabulary so the test columns line up with the train columns.
test_name = cnt_vec.transform(test['name'])

In [None]:
# TF-IDF over 1- to 3-grams of the description, capped at 50,000 features, English stop words removed.
tfidf_descp = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), stop_words='english')

# Fit on train only, then apply the same fitted vectorizer to test.
train_descp = tfidf_descp.fit_transform(train['item_description'])
test_descp = tfidf_descp.transform(test['item_description'])

In [None]:
# (rows, columns) of the TF-IDF description matrix built from the training set.
train_descp.shape

**LabelBinarizer**

In [None]:
from sklearn.preprocessing import LabelBinarizer

def _fit_label_binarizer(column):
    """Fit a sparse LabelBinarizer on train[column]; return (encoder, train matrix, test matrix)."""
    encoder = LabelBinarizer(sparse_output=True)
    train_encoded = encoder.fit_transform(train[column])
    test_encoded = encoder.transform(test[column])
    return encoder, train_encoded, test_encoded


# One encoder per categorical column, always fitted on train and re-used for test.
lb_brand_name, X_train_brand, X_test_brand = _fit_label_binarizer('brand_name')
lb_item_cond_id, X_train_item_condition_id, X_test_item_condition_id = _fit_label_binarizer('item_condition_id')
lb_shipping, X_train_shipping, X_test_shipping = _fit_label_binarizer('shipping')

# The dae / jung / so suffixes hold the top / sub / item category levels respectively.
lb_cat_dae, X_train_cat_dae, X_test_cat_dae = _fit_label_binarizer('cat_top')
lb_cat_jung, X_train_cat_jung, X_test_cat_jung = _fit_label_binarizer('cat_sub')
lb_cat_so, X_train_cat_so, X_test_cat_so = _fit_label_binarizer('item')

In [None]:
from scipy.sparse import hstack
import gc

# Stack every training feature block column-wise to check the combined shape.
sparse_matrix_list = (
    train_name,
    train_descp,
    X_train_brand,
    X_train_item_condition_id,
    X_train_shipping,
    X_train_cat_dae,
    X_train_cat_jung,
    X_train_cat_so,
)

X_train = hstack(sparse_matrix_list).tocsr()
print(type(X_train), X_train.shape)

# The stacked matrix was only needed for the printout above; drop it again.
del X_train
gc.collect()

In [None]:
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between targets ``y`` and predictions ``y_pred``."""
    log_diff = np.log1p(y) - np.log1p(y_pred)
    return np.sqrt(np.mean(log_diff ** 2))

def evaluate_orig_price(y_test, preds):
    """Score predictions with ``rmsle`` after mapping both inputs through ``np.expm1``.

    ``y_test`` and ``preds`` are expected on the log1p scale used for the
    target; ``np.expm1`` undoes that transform before the error is computed.
    """
    return rmsle(np.expm1(y_test), np.expm1(preds))

In [None]:
def model_train_predict(model, matrix_list, test_size=0.2, random_state=42):
    """Fit ``model`` on a train split of the stacked features and predict the hold-out.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    matrix_list : sequence of sparse matrices
        Feature blocks that are stacked column-wise. Their rows must line up
        with the global ``train`` frame, whose ``price`` column is the target.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int or None, default 42
        Seed for the split. A fixed seed means every call is scored on the
        same hold-out rows, so results for different models or feature sets
        are comparable and do not depend on how often cells were re-run.
        Pass ``None`` to draw the split from the global NumPy RNG instead.

    Returns
    -------
    preds : array
        Predictions of the fitted model for the hold-out rows.
    y_test : pandas.Series
        The matching hold-out slice of ``train['price']``.
    """
    X = hstack(matrix_list).tocsr()
    X_train, X_test, y_train, y_test = train_test_split(
        X, train['price'], test_size=test_size, random_state=random_state
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Drop the large intermediates before returning so their memory can be reclaimed.
    del X, X_train, X_test, y_train
    gc.collect()

    return preds, y_test

**Ridge**

In [None]:
linear_model = Ridge(fit_intercept=False, solver='lsqr')

# First run: every feature block except the description TF-IDF matrix.
sparse_matrix_list = (
    train_name,
    X_train_brand,
    X_train_item_condition_id,
    X_train_shipping,
    X_train_cat_dae,
    X_train_cat_jung,
    X_train_cat_so,
)
linear_preds, y_test = model_train_predict(matrix_list=sparse_matrix_list, model=linear_model)
print('Ridge rmsle without descreption:', evaluate_orig_price(y_test, linear_preds))

# Second run: the same blocks plus the description TF-IDF matrix.
# linear_model is refitted here, so it ends up trained on the full feature set.
sparse_matrix_list = (
    train_name,
    train_descp,
    X_train_brand,
    X_train_item_condition_id,
    X_train_shipping,
    X_train_cat_dae,
    X_train_cat_jung,
    X_train_cat_so,
)
linear_preds, y_test = model_train_predict(matrix_list=sparse_matrix_list, model=linear_model)
print('rmsle with Item Description  :', evaluate_orig_price(y_test, linear_preds))

**SGD**

In [None]:
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(penalty='l2',  alpha=1, max_iter=1000, early_stopping=False, learning_rate='invscaling', eta0=0.01)

# Build the description-free feature set explicitly. The tuple left over from the
# Ridge cell still contains train_descp, so reusing it would silently score the
# model WITH the description features while the label claims the opposite.
sparse_matrix_list = (train_name, X_train_brand, X_train_item_condition_id, X_train_shipping, X_train_cat_dae, X_train_cat_jung, X_train_cat_so)
sgd_preds, y_test = model_train_predict(model=sgd_reg, matrix_list=sparse_matrix_list)
# The label names the model actually evaluated here (SGD, not Ridge).
print('SGD rmsle without descreption:', evaluate_orig_price(y_test, sgd_preds))


In [None]:
# SGD regressor evaluated on all feature blocks, including the description TF-IDF matrix.
sparse_matrix_list = (
    train_name,
    train_descp,
    X_train_brand,
    X_train_item_condition_id,
    X_train_shipping,
    X_train_cat_dae,
    X_train_cat_jung,
    X_train_cat_so,
)
sgd_preds, y_test = model_train_predict(matrix_list=sparse_matrix_list, model=sgd_reg)
print('rmsle with Item Description  :', evaluate_orig_price(y_test, sgd_preds))

**Elastic Net**

In [None]:
from sklearn.linear_model import ElasticNet
# Elastic Net evaluated on whatever feature tuple sparse_matrix_list currently holds.
elastic_net = ElasticNet(l1_ratio=0.5, alpha=0.1, random_state=42)
elastic_preds, y_test = model_train_predict(matrix_list=sparse_matrix_list, model=elastic_net)
print('rmsle with Item Description  :', evaluate_orig_price(y_test, elastic_preds))

In [None]:
# Assemble the test-set feature matrix; the blocks are listed in the same order
# as in the training tuple that includes the description features.
sparse_matrix_list = (
    test_name,
    test_descp,
    X_test_brand,
    X_test_item_condition_id,
    X_test_shipping,
    X_test_cat_dae,
    X_test_cat_jung,
    X_test_cat_so,
)
X_test = hstack(sparse_matrix_list).tocsr()

In [None]:
# Predict with the Ridge model, then undo the log1p target transform with expm1.
preds = np.expm1(linear_model.predict(X_test))
preds

In [None]:
import py7zr
# Unpack the sample submission file into the current working directory.
submission_archive_path = '/kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z'
with py7zr.SevenZipFile(submission_archive_path, mode='r') as archive:
    archive.extractall()

In [None]:
# Load the extracted sample submission; it is used as the template for the output file.
submission = pd.read_csv('../working/sample_submission.csv')
submission

In [None]:
# Overwrite the template's price column with the model predictions.
# NOTE(review): assumes the submission rows are in the same order as the rows of test — confirm.
submission.loc[:, 'price'] = preds
submission

In [None]:
# Write the submission file to the current working directory, without the index column.
submission.to_csv('submission.csv', index=False)