In [None]:
! pip install py7zr
import py7zr
with py7zr.SevenZipFile('/kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z', mode='r') as z:
    z.extractall()
!unzip /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip
import numpy as np
import pandas as pd
import sklearn
from sklearn.utils import shuffle
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the extracted training table and shuffle its rows with a fixed seed so
# the row order is reproducible between runs.
train_data = shuffle(pd.read_table('../working/train.tsv'), random_state=0)
print(train_data.shape)
# Debug switch: uncomment to work on a 100-row sample.
# train_data = train_data[:100]
# print(train_data.shape)
train_data.head()

Tập test data thì không có cột **price** vì đây là giá trị cần dự đoán để submit.

In [None]:
# Read the stage-2 test table (tab separated, same reader default as
# pd.read_table).
test_data = pd.read_csv('../working/test_stg2.tsv', sep='\t')
print(test_data.shape)
# Debug switch: uncomment to work on a 100-row sample.
# test_data = test_data[:100]
test_data.head()

In [None]:
# Share of rows per item_condition_id value, for the train and test frames.
for header, frame in (("Train dataset:\n", train_data), ("\nTest dataset:\n", test_data)):
    shares = frame['item_condition_id'].value_counts() / frame.shape[0]
    print(header + str(shares))

In [None]:
# Share of all rows (missing brands included in the denominator) taken by the
# five most frequent brand names, for the train and test frames.
for header, frame in (("Train dataset:\n", train_data), ("\nTest dataset:\n", test_data)):
    top_shares = frame['brand_name'].value_counts().head() / frame.shape[0]
    print(header + str(top_shares))

In [None]:
# Share of rows per shipping flag value, for the train and test frames.
for header, frame in (("Train dataset:\n", train_data), ("\nTest dataset:\n", test_data)):
    shares = frame['shipping'].value_counts() / frame.shape[0]
    print(header + str(shares))

In [None]:
import seaborn

# Distribution of the raw price target.
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# histplot/displot once the installed version is confirmed.
seaborn.distplot(train_data.price)

In [None]:
# Distribution of log(1 + price), the transformed target.
log_prices = np.log1p(train_data['price'])
seaborn.distplot(log_prices)

In [None]:
# Store log(1 + price) as the regression target column, then show the first
# row as a sanity check.
train_data['log_price'] = np.log1p(train_data['price'])
train_data.iloc[0]

In [None]:
def split_cat(category_name):
    """
    Split a '/'-separated category path into exactly three levels.

    Parameters
    ----------
    category_name : str or any
        Category path such as 'Top/Sub/Item'. Anything that is not a string
        (for example a missing value) is treated as unknown.

    Returns
    -------
    list of str
        Always three elements: [top, sub, item]. Missing levels are filled
        with 'Others'; levels beyond the third are dropped.
    """
    # Only strings can be split; missing values take the all-'Others' fallback
    # instead of relying on a bare except.
    if not isinstance(category_name, str):
        return ['Others', 'Others', 'Others']

    # Callers unpack the result into three columns, so the length is fixed
    # here: keep at most three levels and pad short paths.
    levels = category_name.split('/')[:3]
    levels += ['Others'] * (3 - len(levels))
    return levels
# Derive the three category-level columns on both frames; zip(*...) transposes
# the per-row lists into one sequence per level.
for frame in (train_data, test_data):
    frame['cat_top'], frame['cat_sub'], frame['cat_item'] = zip(*frame['category_name'].apply(split_cat))

In [None]:
# Count of missing values in each training column.
train_data.isna().sum()

In [None]:
# Count of missing values in each test column.
test_data.isna().sum()

In [None]:
# Replace missing values in the text-like columns with the placeholder string
# 'Null' on both frames.
for frame in (train_data, test_data):
    for column in ('category_name', 'brand_name', 'item_description'):
        frame[column] = frame[column].fillna(value='Null')

In [None]:
import re
def clean_text(text):
    """
    Normalise a raw text string before it is tokenised.

    Processing order:
    - delete HTML-like tags (non-greedy "<...>" matches)
    - delete backslashes, double quotes and single quotes
    - trim surrounding whitespace and lowercase
    - turn every remaining punctuation character, tab and newline
      into a single space

    Returns the cleaned string.
    """
    # Each pattern is removed outright, in this order: tags first, then the
    # characters [\], ["] and ['].
    for pattern in (r'<.*?>', r'\\', r'\"', r'\''):
        text = re.sub(pattern, '', text)

    lowered = text.strip().lower()

    # Punctuation is mapped to spaces rather than deleted, so the words on
    # either side of it stay separate tokens.
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    punctuation_to_space = str.maketrans(dict.fromkeys(filters, " "))
    return lowered.translate(punctuation_to_space)

# Example: the tags, the quote and the full stop are removed, leaving
# lowercase tokens. The backslash is written as "\\" because "\h" is not a
# valid escape sequence; the string value itself is unchanged.
clean_text("<html>This is is not a\" sentence.<\\html>").split()

In [None]:
# Number of distinct values in each column of the training frame
# (pandas excludes NaN from the count by default).
train_data.nunique()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser for the item name. It is only used by the
# commented-out fit/transform lines below, which document how the cached
# matrices are presumably produced.
vectorizer = CountVectorizer(stop_words="english",
                            preprocessor=clean_text)

# train_name = vectorizer.fit_transform(train_data['name'])
# test_name = vectorizer.transform(test_data['name'])
# pickle.dump(train_name, open("train_name.pickle", "wb"))
# pickle.dump(test_name, open("test_name.pickle", "wb"))

# Load the cached matrices. Context managers close the files right after
# reading. NOTE(security): pickle can execute arbitrary code on load; only use
# files from a trusted dataset.
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'train_name.pickle'), 'rb') as cache_file:
    train_name = pickle.load(cache_file)
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'test_name.pickle'), 'rb') as cache_file:
    test_name = pickle.load(cache_file)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser over unigrams and bigrams. It is only used by the
# commented-out fit/transform lines below, which document how the cached
# matrices are presumably produced.
tfidf = TfidfVectorizer(stop_words="english",
                             preprocessor=clean_text,
                             ngram_range=(1, 2))

# train_des = tfidf.fit_transform(train_data['item_description'])
# test_des = tfidf.transform(test_data['item_description'])
# pickle.dump(train_des, open("train_des.pickle", "wb"))
# pickle.dump(test_des, open("test_des.pickle", "wb"))

# Load the cached matrices. Context managers close the files right after
# reading. NOTE(security): pickle can execute arbitrary code on load; only use
# files from a trusted dataset.
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'train_des.pickle'), 'rb') as cache_file:
    train_des = pickle.load(cache_file)
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'test_des.pickle'), 'rb') as cache_file:
    test_des = pickle.load(cache_file)

In [None]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encoder for brand_name (sparse output). It is only used by the
# commented-out lines below, which document how the cached matrices are
# presumably produced.
lb_brand = LabelBinarizer(sparse_output=True)

# train_brand = lb_brand.fit_transform(train_data['brand_name'])
# test_brand = lb_brand.transform(test_data['brand_name'])
# pickle.dump(train_brand, open("train_brand.pickle", "wb"))
# pickle.dump(test_brand, open("test_brand.pickle", "wb"))

# Load the cached matrices. Context managers close the files right after
# reading. NOTE(security): pickle can execute arbitrary code on load; only use
# files from a trusted dataset.
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'train_brand.pickle'), 'rb') as cache_file:
    train_brand = pickle.load(cache_file)
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'test_brand.pickle'), 'rb') as cache_file:
    test_brand = pickle.load(cache_file)

In [None]:
# One-hot encoder for item_condition_id (sparse output). It is only used by
# the commented-out lines below, which document how the cached matrices are
# presumably produced.
lb_condition_id = LabelBinarizer(sparse_output=True)

# train_condition_id = lb_condition_id.fit_transform(train_data['item_condition_id'])
# test_condition_id = lb_condition_id.transform(test_data['item_condition_id'])
# pickle.dump(train_condition_id, open("train_condition_id.pickle", "wb"))
# pickle.dump(test_condition_id, open("test_condition_id.pickle", "wb"))

# Load the cached matrices. Context managers close the files right after
# reading. NOTE(security): pickle can execute arbitrary code on load; only use
# files from a trusted dataset.
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'train_condition_id.pickle'), 'rb') as cache_file:
    train_condition_id = pickle.load(cache_file)
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'test_condition_id.pickle'), 'rb') as cache_file:
    test_condition_id = pickle.load(cache_file)

In [None]:
# Encoder for the shipping flag (sparse output). It is only used by the
# commented-out lines below, which document how the cached matrices are
# presumably produced.
lb_shipping = LabelBinarizer(sparse_output=True)

# train_shipping = lb_shipping.fit_transform(train_data['shipping'])
# test_shipping = lb_shipping.transform(test_data['shipping'])
# pickle.dump(train_shipping, open("train_shipping.pickle", "wb"))
# pickle.dump(test_shipping, open("test_shipping.pickle", "wb"))

# Load the cached matrices. Context managers close the files right after
# reading. NOTE(security): pickle can execute arbitrary code on load; only use
# files from a trusted dataset.
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'train_shipping.pickle'), 'rb') as cache_file:
    train_shipping = pickle.load(cache_file)
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'test_shipping.pickle'), 'rb') as cache_file:
    test_shipping = pickle.load(cache_file)

In [None]:
# One-hot encoder for the top-level category (sparse output). It is only used
# by the commented-out lines below, which document how the cached matrices are
# presumably produced.
lb_cat_top = LabelBinarizer(sparse_output=True)

# train_cat_top = lb_cat_top.fit_transform(train_data['cat_top'])
# test_cat_top = lb_cat_top.transform(test_data['cat_top'])
# pickle.dump(train_cat_top, open("train_cat_top.pickle", "wb"))
# pickle.dump(test_cat_top, open("test_cat_top.pickle", "wb"))

# Load the cached matrices. Context managers close the files right after
# reading. NOTE(security): pickle can execute arbitrary code on load; only use
# files from a trusted dataset.
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'train_cat_top.pickle'), 'rb') as cache_file:
    train_cat_top = pickle.load(cache_file)
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'test_cat_top.pickle'), 'rb') as cache_file:
    test_cat_top = pickle.load(cache_file)

In [None]:
# One-hot encoder for the second-level category (sparse output). It is only
# used by the commented-out lines below, which document how the cached
# matrices are presumably produced.
lb_cat_sub = LabelBinarizer(sparse_output=True)

# train_cat_sub = lb_cat_sub.fit_transform(train_data['cat_sub'])
# test_cat_sub = lb_cat_sub.transform(test_data['cat_sub'])
# pickle.dump(train_cat_sub, open("train_cat_sub.pickle", "wb"))
# pickle.dump(test_cat_sub, open("test_cat_sub.pickle", "wb"))

# Load the cached matrices. Context managers close the files right after
# reading. NOTE(security): pickle can execute arbitrary code on load; only use
# files from a trusted dataset.
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'train_cat_sub.pickle'), 'rb') as cache_file:
    train_cat_sub = pickle.load(cache_file)
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'test_cat_sub.pickle'), 'rb') as cache_file:
    test_cat_sub = pickle.load(cache_file)

In [None]:
# One-hot encoder for the third-level category (sparse output). It is only
# used by the commented-out lines below, which document how the cached
# matrices are presumably produced.
lb_cat_item = LabelBinarizer(sparse_output=True)

# train_cat_item = lb_cat_item.fit_transform(train_data['cat_item'])
# test_cat_item = lb_cat_item.transform(test_data['cat_item'])
# pickle.dump(train_cat_item, open("train_cat_item.pickle", "wb"))
# pickle.dump(test_cat_item, open("test_cat_item.pickle", "wb"))

# Load the cached matrices. Context managers close the files right after
# reading. NOTE(security): pickle can execute arbitrary code on load; only use
# files from a trusted dataset.
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'train_cat_item.pickle'), 'rb') as cache_file:
    train_cat_item = pickle.load(cache_file)
with open(os.path.join('/kaggle/input/trainedsparsematrix', 'test_cat_item.pickle'), 'rb') as cache_file:
    test_cat_item = pickle.load(cache_file)

In [None]:
def evaluate(preds, y_test):
    """
    Root mean squared error between predictions and reference values.

    Computes sqrt(mean((preds - y_test) ** 2)) with numpy; the inputs must
    support element-wise subtraction.
    """
    squared_errors = np.power(preds - y_test, 2)
    mean_squared_error = np.mean(squared_errors)
    return np.sqrt(mean_squared_error)

In [None]:
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split

# Stack the per-field sparse blocks column-wise. The block order has to be
# the same for train and test so that columns line up.
train_features = (
    train_name, train_des, train_brand, train_condition_id,
    train_shipping, train_cat_top, train_cat_sub, train_cat_item,
)
X = hstack(train_features).tocsr()

# Hold out 20% of the training rows for validation (fixed seed); the target
# is the log_price column.
X_train, X_test, y_train, y_test = train_test_split(
    X, train_data['log_price'], test_size=0.2, random_state=0
)

# Same stacking for the submission rows.
test_features = hstack((
    test_name, test_des, test_brand, test_condition_id,
    test_shipping, test_cat_top, test_cat_sub, test_cat_item,
)).tocsr()

In [None]:
from sklearn.linear_model import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb

# Lowest validation error seen so far. Starting at +infinity (instead of an
# arbitrary 100) guarantees the first evaluated model becomes the best one.
rmsle_result = float('inf')
# File name of the model that achieved rmsle_result; empty until a model has
# been evaluated.
best_model = ''

In [None]:
def train(model, file_name):
    """
    Fit `model` on the notebook-level X_train / y_train and pickle it.

    Parameters
    ----------
    model : estimator
        Object exposing fit(X, y); it is fitted in place.
    file_name : str
        Path the fitted model is written to.
    """
    # str(model) looks like "Name(params)"; keep only the part before '('.
    print('Start training ' + str(model).split('(')[0] + '...')
    model.fit(X_train, y_train)
    # The context manager flushes and closes the file even if dump() raises.
    with open(file_name, 'wb') as model_file:
        pickle.dump(model, model_file)
def loss(model, X_test, y_test):
    """
    Print and return the RMSE of `model` on the given validation data.

    The model's predictions for X_test are scored against y_test with
    evaluate(); the score is printed after a fixed label and returned.
    """
    # The label is printed first without a newline so the score ends up on
    # the same output line.
    print('Root Mean Squared Error: ', end = '')
    predictions = model.predict(X_test)
    score = evaluate(predictions, y_test)
    print(score)
    return score
def load(file_name, model_dir='/kaggle/input/trained-model'):
    """
    Load a pickled model from `model_dir`.

    Parameters
    ----------
    file_name : str
        Name of the pickle file inside `model_dir`.
    model_dir : str, optional
        Directory holding the pickled models; defaults to the dataset path
        this notebook has always used.

    Returns
    -------
    object
        The unpickled model.
    """
    model_path = os.path.join(model_dir, file_name)
    # NOTE(security): pickle can execute arbitrary code on load; only use
    # files from a trusted source.
    # The context manager closes the file as soon as the model is read.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)

In [None]:
# model = LinearRegression()
# file_name = 'linear_regression.sav'

## TRAINING
# train(model, file_name)

# model = load(file_name)

## EVALUATE
# result = loss(model, X_test, y_test)

# if(result < rmsle_result):
#     rmsle_result = result
#     best_model = file_name

In [None]:
# Candidate: ridge regression.
file_name = 'ridge.sav'
model = Ridge()

# Fitting is switched off; uncomment to retrain and re-pickle the model.
# train(model, file_name)

# Swap the fresh estimator for the stored, already fitted one.
model = load(file_name)

# Score on the hold-out split.
result = loss(model, X_test, y_test)

# Remember this model if it beats the best score so far.
if result < rmsle_result:
    rmsle_result, best_model = result, file_name

In [None]:
# Candidate: linear model fitted with stochastic gradient descent.
file_name = 'sgd_regressor.sav'
model = SGDRegressor()

# Fitting is switched off; uncomment to retrain and re-pickle the model.
# train(model, file_name)

# Swap the fresh estimator for the stored, already fitted one.
model = load(file_name)

# Score on the hold-out split.
result = loss(model, X_test, y_test)

# Remember this model if it beats the best score so far.
if result < rmsle_result:
    rmsle_result, best_model = result, file_name

In [None]:
import lightgbm as lgb
# Candidate: LightGBM gradient-boosted trees.
file_name = 'lgbm.sav'
model = lgb.LGBMRegressor()

# Fitting is switched off; uncomment to retrain and re-pickle the model.
# train(model, file_name)

# Swap the fresh estimator for the stored, already fitted one.
model = load(file_name)

# Score on the hold-out split.
result = loss(model, X_test, y_test)

# Remember this model if it beats the best score so far.
if result < rmsle_result:
    rmsle_result, best_model = result, file_name

In [None]:
# Start from the sample submission so row ids and column layout match.
submission = pd.read_csv('../working/sample_submission_stg2.csv')

# Reload the best-scoring model. The context manager closes the file once the
# model is read. NOTE(security): pickle can execute arbitrary code on load;
# only use files from a trusted source.
with open(os.path.join('/kaggle/input/trained-model', best_model), 'rb') as model_file:
    model = pickle.load(model_file)
print('Use ' + best_model + ' model')
preds = model.predict(test_features)
# The regression target in this notebook is log1p(price); expm1 is its exact
# inverse and avoids the precision loss of exp(x) - 1 for small x.
preds = np.expm1(preds)
submission.loc[:, 'price'] = preds
submission

In [None]:
# Write the predictions to submission.csv in the working directory, without
# the DataFrame index column.
submission.to_csv('submission.csv', index=False)