In [None]:
import numpy as np 
import pandas as pd 

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import torch
from torch.autograd import Variable
from torch import optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch.nn.functional as F
import time
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
import math
from nltk.corpus import stopwords

# 1. Unzip, load dữ liệu train/test
* Dữ liệu train nằm trong file train.tsv.7z
* Dữ liệu test nằm trong file test_stg2.tsv.zip

In [None]:
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!unzip -o /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip
# !p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z

In [None]:
# Load the extracted tab-separated train and test files into DataFrames.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ("train.tsv", "test_stg2.tsv")
)
# Alternative: the stage-1 test set.
# test = pd.read_csv("test.tsv", sep='\t')

# 2. Phân tích dữ liệu

In [None]:
# Training frame: (rows, columns), then per-column dtype / non-null counts
# with the "deep" memory usage report.
print(train.shape)
train.info(memory_usage="deep")

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Test frame: (rows, columns), then per-column dtype / non-null counts
# with the "deep" memory usage report.
print(test.shape)
test.info(memory_usage="deep")

In [None]:
# Summary statistics of the test frame.
test.describe()

In [None]:
# Price analysis: summary statistics of the training price column.
train.price.describe()

In [None]:
# Item condition analysis: frequency of each condition id and number of missing values.
condition_col = train['item_condition_id']
print(condition_col.value_counts())
print('item_condition_id is null:', condition_col.isnull().sum())

In [None]:
# Shipping analysis: frequency of each shipping flag and number of missing values.
shipping_col = train['shipping']
print(shipping_col.value_counts())
print('shipping is null:', shipping_col.isnull().sum())

In [None]:
# Brand name analysis: frequency table, then present vs. missing counts.
brand_col = train['brand_name']
print(brand_col.value_counts())
print('brand_name isn\'t null:', brand_col.count())
print('brand_name is null:', brand_col.isnull().sum())

In [None]:
# Category name analysis: frequency table, then present vs. missing counts.
category_col = train['category_name']
print(category_col.value_counts())
print('category_name isn\'t null:', category_col.count())
print('category_name is null:', category_col.isnull().sum())

In [None]:
# Split an item category path into 3 fields: general_cat, subcat_1, subcat_2
# e.g. (Women/Athletic Apparel/Pants) => (Women), (Athletic Apparel), (Pants)

def split_cat(text):
    """Split a '/'-separated category path into exactly three levels.

    Args:
        text: the raw category string, or a missing value (NaN / None).

    Returns:
        A 3-tuple (general_cat, subcat_1, subcat_2). Non-string input yields
        ("None", "None", "None"); paths with fewer than three levels are padded
        with "None" and levels beyond the third are dropped.
    """
    placeholder = "None"
    # Explicit type check instead of a bare `except`, so real errors are not hidden.
    if not isinstance(text, str):
        return (placeholder,) * 3
    levels = text.split("/")[:3]
    levels += [placeholder] * (3 - len(levels))
    return tuple(levels)
    
# Add the three category-level columns to both frames, then show how often
# each level value occurs in the training data.
for frame in (train, test):
    frame['general_cat'], frame['subcat_1'], frame['subcat_2'] = zip(*frame['category_name'].apply(split_cat))

category_levels = ('general_cat', 'subcat_1', 'subcat_2')
for position, level in enumerate(category_levels):
    # A blank line separates the tables; the last one ends with a single newline.
    terminator = "\n" if position == len(category_levels) - 1 else "\n\n"
    print(train[level].value_counts(), end=terminator)

In [None]:
# Bar charts of the category levels (sub-categories limited to their 15 most frequent values).
# Series.value_counts replaces the deprecated top-level pd.value_counts.
plt.figure(figsize=(10,10))
plt.subplot(3,3,1)
count_classes_general_cat = train.general_cat.value_counts(sort=True)
count_classes_general_cat.plot(kind = 'bar')
plt.title("General Category histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
# subcategory 1
plt.subplot(3,3,3)
count_classes_subcat_1 = train.subcat_1.value_counts(sort=True)[:15]
count_classes_subcat_1.plot(kind = 'bar')
plt.title("Sub Category 1 histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
# subcategory 2
plt.subplot(3,3,9)
count_classes_subcat_2 = train.subcat_2.value_counts(sort=True)[:15]
count_classes_subcat_2.plot(kind = 'bar')
plt.title("Sub Category 2 histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")

In [None]:
# Description analysis: frequency table, then present vs. missing counts.
description_col = train['item_description']
print(description_col.value_counts())
print('item_description isn\'t null:', description_col.count())
print('item_description is null:', description_col.isnull().sum())

# 3. Xử lý dữ liệu:
- Xử lý dữ liệu trống
- Mã hóa 'brand_name', 'general_cat', 'subcat_1', 'subcat_2', 'category_name'
- Chuẩn hóa, lowercase, loại bỏ ký tự không hợp lệ, phân đoạn chuỗi ký tự của 'name' và 'item_description', mã hóa chúng và ghi vào trường mới đuôi _seq

In [None]:
# Handle missing values
def handle_missing(dataset):
    """Replace missing text fields with the literal string "None".

    The brand_name, item_description and category_name columns of `dataset`
    are filled and assigned back, so the frame passed in is modified.

    Args:
        dataset: DataFrame containing the three text columns.

    Returns:
        The same DataFrame object, for call chaining.
    """
    for column in ("brand_name", "item_description", "category_name"):
        # Assign the result back instead of `dataset.col.fillna(..., inplace=True)`:
        # the chained in-place form is deprecated and no longer updates the
        # frame under pandas copy-on-write.
        dataset[column] = dataset[column].fillna("None")
    return dataset

# Fill the missing text fields in both frames, then check the shapes and that
# no nulls remain in the training frame.
train, test = handle_missing(train), handle_missing(test)
for frame in (train, test):
    print(frame.shape)
train.isnull().sum()

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Label-encode a categorical text column (e.g. brand_name, category_name)
def encode_text(column):
    """Add an integer-coded `<column>_index` column to the global train and test frames.

    One LabelEncoder is fitted on the values of both frames so the two
    share a single label-to-integer mapping.
    """
    index_column = column + '_index'
    all_labels = np.hstack([train[column], test[column]])
    encoder = LabelEncoder().fit(all_labels)
    train[index_column] = encoder.transform(train[column])
    test[index_column] = encoder.transform(test[column])
    
# Integer-encode every categorical text column, then preview the result.
for text_column in ('brand_name', 'general_cat', 'subcat_1', 'subcat_2', 'category_name'):
    encode_text(text_column)
train.head()

In [None]:
class Category:
    """Vocabulary that assigns an integer index to every distinct word it sees.

    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers, so the
    first real word receives index 2.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}  # word -> integer index
        self.word2count = {}  # word -> number of occurrences
        self.index2word = {0: "SOS", 1: "EOS"}  # integer index -> word
        self.n_words = 2  # next free index (the two markers are counted)

    def addSentence(self, sentence):
        """Register every single-space-separated token of `sentence`."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, giving it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
import re

# Separate sentence punctuation and blank out everything that is not a letter or . ! ?
def normalizeString(s):
    """Return `s` with . ! ? preceded by a space and other non-letter runs collapsed.

    Each '.', '!' or '?' gets a space inserted before it; every run of
    characters outside a-z, A-Z and those three marks then becomes one space.
    No lowercasing or trimming happens here.
    """
    spaced = re.sub(r"([.!?])", r" \1", s)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced)

def normalizeLine(sentence):
    """Split `sentence` on tab characters and normalise each field with normalizeString."""
    return list(map(normalizeString, sentence.split('\t')))


def prepareData(lang1,data):
    """Build a Category vocabulary named `lang1` from an iterable of raw strings.

    Only the text before the first tab of each entry is normalised and
    added. The vocabulary name and its final word count are printed.
    """
    vocabulary = Category(lang1)
    print("Counting words:")
    print(vocabulary.name, end=" ")
    for raw_line in data:
        first_field = raw_line.split('\t')[0]
        vocabulary.addSentence(normalizeString(first_field))

    print(vocabulary.n_words)
    return vocabulary

def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of `sentence` to its index in `lang.word2index`.

    A token missing from the vocabulary raises KeyError.
    """
    lookup = lang.word2index
    return list(map(lookup.__getitem__, sentence.split(' ')))

def variableFromSentence(lang, sentence):
    """Return the list of vocabulary indices for `sentence` (thin wrapper over indexesFromSentence)."""
    return indexesFromSentence(lang, sentence)


def token_fit(column):
    """Tokenise a text column of the global train/test frames into index lists.

    A vocabulary is built from the lowercased values of both frames, then
    each frame gets a `<column>_seq` column holding the index list of every row.
    """
    lowered_text = np.hstack([train[column].str.lower(), test[column].str.lower()])
    vocabulary = prepareData(column, lowered_text)
    for frame in (train, test):
        frame[column + '_seq'] = [
            variableFromSentence(vocabulary, normalizeLine(text.lower())[0])
            for text in frame[column]
        ]
    

In [None]:
# Build the index sequences for both free-text columns, then preview the result.
for text_column in ('name', 'item_description'):
    token_fit(text_column)
train.head()

In [None]:
# Regression target: log(price + 1) scaled into [-1, 1].
# The test data has no 'price', so its target is a constant 0 placeholder.
test["target"] = 0
target_scaler = MinMaxScaler(feature_range=(-1, 1))
log_price = np.log(train.price+1)
train["target"] = log_price
train["target"] = target_scaler.fit_transform(train.target.values.reshape(-1,1))
train.head(10)

In [None]:
# Histogram of the scaled training target.
pd.DataFrame(train.target).hist()

# 4. Chuẩn bị mô hình

In [None]:
# Split train/validation data
# Hold out 1% of the training rows for validation (train_size=0.99), with a fixed seed.
dtrain, dvalid = train_test_split(train, random_state=123, train_size=0.99)

In [None]:
# SEQUENCE VARIABLES ANALYSIS
def _max_token_index(*sequence_columns):
    """Return the largest token index found in any list of the given Series, as np.int64."""
    largest = max(
        (max(sequence, default=0) for column in sequence_columns for sequence in column),
        default=0,
    )
    # np.int64 keeps `.item()` available for the model constructor.
    return np.int64(largest)

# Longest token sequence in either frame, for information only.
max_name_seq = np.max([np.max(train.name_seq.apply(len)), np.max(test.name_seq.apply(len))])
max_item_description_seq = np.max([np.max(train.item_description_seq.apply(len)),
                                   np.max(test.item_description_seq.apply(len))])
print("max name seq "+str(max_name_seq))
print("max item desc seq "+str(max_item_description_seq))

# EMBEDDINGS MAX VALUE
# Fixed sequence lengths that the tensors are padded / truncated to.
MAX_NAME_SEQ = 10
MAX_ITEM_DESC_SEQ = 75
# Scan every token of every sequence. `Series.max()` on a column of lists
# returns the lexicographically largest list, whose biggest element is not
# necessarily the biggest token index overall.
MAX_TEXT_NAME = _max_token_index(train.name_seq, test.name_seq)+2
MAX_TEXT_ITEM = _max_token_index(train.item_description_seq, test.item_description_seq)+2
# Embedding table sizes for the label-encoded columns: largest index + 1.
MAX_GEN_CATEGORY = np.max([train.general_cat_index.max(), test.general_cat_index.max()])+1
MAX_SUB_CAT1_CATEGORY = np.max([train.subcat_1_index.max(), test.subcat_1_index.max()])+1
MAX_SUB_CAT2_CATEGORY = np.max([train.subcat_2_index.max(), test.subcat_2_index.max()])+1
MAX_BRAND = np.max([train.brand_name_index.max(), test.brand_name_index.max()])+1
MAX_CONDITION = np.max([train.item_condition_id.max(), test.item_condition_id.max()])+1
MAX_CATEGORY_NAME = np.max([train.category_name_index.max(), test.category_name_index.max()])+1

In [None]:
def pad(tensor, length):
    """Return `tensor` with its first dimension forced to `length`.

    Longer tensors are cut to their first `length` entries; shorter ones are
    extended with zeros of the same dtype and trailing shape.
    """
    current = tensor.size(0)
    if length <= current:
        # The first chunk of a split by `length` is the leading `length` entries.
        return torch.split(tensor, length, dim=0)[0]
    filler = tensor.new_zeros((length - current, *tensor.size()[1:]))
    return torch.cat([tensor, filler])

In [None]:
# Convert the values of a sample dict to Tensors
class ToTensor(object):
    """Transform turning a MercariDataset sample dict into a dict of tensors.

    'name' and 'item_desc' are flattened to 1-D long tensors and padded or
    truncated to MAX_NAME_SEQ / MAX_ITEM_DESC_SEQ entries with `pad`. Every
    other field is converted as-is.
    """

    # Fields converted without padding, in output order.
    _PLAIN_FIELDS = ('brand_name', 'cat_name', 'general_category', 'subcat1_category',
                     'subcat2_category', 'item_condition', 'shipping', 'target')

    def __call__(self, sample):
        def as_tensor(value):
            # torch.from_numpy directly (the original `torch.torch.from_numpy`
            # for 'shipping' relied on an accidental attribute).
            return torch.from_numpy(np.asarray(value))

        tensors = {
            'name': pad(as_tensor(sample['name']).long().view(-1), MAX_NAME_SEQ),
            'item_desc': pad(as_tensor(sample['item_desc']).long().view(-1), MAX_ITEM_DESC_SEQ),
        }
        for field in self._PLAIN_FIELDS:
            tensors[field] = as_tensor(sample[field])
        return tensors

#  Dataset wrapper used by the DataLoaders
class MercariDataset(Dataset):
    """Serve the rows of a prepared Mercari DataFrame as sample dicts.

    Every sample value is a one-element list holding the row's entry of the
    matching column. The optional `transform` is applied to the dict.
    """

    # (sample key, DataFrame column) pairs, in output order.
    _FIELD_TO_COLUMN = (
        ('name', 'name_seq'),
        ('item_desc', 'item_description_seq'),
        ('brand_name', 'brand_name_index'),
        ('cat_name', 'category_name_index'),
        ('general_category', 'general_cat_index'),
        ('subcat1_category', 'subcat_1_index'),
        ('subcat2_category', 'subcat_2_index'),
        ('item_condition', 'item_condition_id'),
        ('shipping', 'shipping'),
        ('target', 'target'),
    )

    def __init__(self, data_pd, transform=None):
        self.mercari_frame = data_pd
        self.transform = transform

    def __len__(self):
        return len(self.mercari_frame)

    def __getitem__(self, idx):
        frame = self.mercari_frame
        sample = {
            field: [frame[column].iloc[idx]]
            for field, column in self._FIELD_TO_COLUMN
        }
        if not self.transform:
            return sample
        return self.transform(sample)

In [None]:
# Wrap the frames in Datasets and batch them with DataLoaders
BATCH_SIZE = 50
to_tensor = transforms.Compose([ToTensor()])

### Test data
mercari_test = MercariDataset(test, transform=to_tensor)
test_sizes = len(mercari_test)
test_dataloaders = torch.utils.data.DataLoader(mercari_test, batch_size=BATCH_SIZE, shuffle=False)

### Train data
mercari_datasets = {'train': MercariDataset(dtrain, transform=to_tensor),
                    'val': MercariDataset(dvalid, transform=to_tensor)}
dataset_sizes = {x: len(mercari_datasets[x]) for x in ['train', 'val']}
# Only the training batches are shuffled. Validation keeps the row order of
# `dvalid`, so predictions made from this loader stay aligned with its rows.
mercari_dataloaders = {x: torch.utils.data.DataLoader(mercari_datasets[x], batch_size=BATCH_SIZE,
                                                      shuffle=(x == 'train'))
                       for x in ['train', 'val']}


# len(DataLoader) is the real number of batches, including the final partial one.
print("number of data in mercari train: ", dataset_sizes['train'])
print("number of batch in mercari train: ", len(mercari_dataloaders['train']))
print()

print("number of data in mercari validate: ", dataset_sizes['val'])
print("number of batch in mercari validate: ", len(mercari_dataloaders['val']))
print()


print("number of data in mercari test: ", test_sizes)
print("number of batch in mercari test: ", len(test_dataloaders))
print()


In [None]:
import sys

# Definition of the Pytorch Model
class RegressionNeural(nn.Module):
    """Price regressor over embedded text sequences and categorical features.

    The name and description token sequences are embedded and reduced by one
    Conv1d each. Brand, full category name and item condition are embedded.
    These are concatenated with the shipping flag and passed through three
    fully connected layers to a single output.

    Args:
        max_sizes: dict of table sizes with keys 'max_text_name',
            'max_text_item', 'max_brand', 'max_gen_category',
            'max_subcat1_category', 'max_subcat2_category', 'max_condition'
            and 'max_cat_name'. The optional keys 'max_name_seq' and
            'max_item_desc_seq' give the padded sequence lengths
            (defaults 10 and 75).
        vocab_margin: extra rows added to both text embedding tables on top
            of the sizes in `max_sizes` (default 100000, as before).
    """

    def __init__(self, max_sizes, vocab_margin=100000):
        super(RegressionNeural, self).__init__()
        # int() accepts both numpy integers and plain Python ints.
        self.name_embedding = nn.Embedding(int(max_sizes['max_text_name'])+vocab_margin, 50)
        self.item_embedding = nn.Embedding(int(max_sizes['max_text_item'])+vocab_margin, 50)
        self.brand_embedding = nn.Embedding(int(max_sizes['max_brand']), 10)
        self.gencat_embedding = nn.Embedding(int(max_sizes['max_gen_category']), 10)
        self.subcat1_embedding = nn.Embedding(int(max_sizes['max_subcat1_category']), 10)
        self.subcat2_embedding = nn.Embedding(int(max_sizes['max_subcat2_category']), 10)
        self.condition_embedding = nn.Embedding(int(max_sizes['max_condition']), 5)
        self.catname_embedding = nn.Embedding(int(max_sizes['max_cat_name']), 10)

        # NOTE: only conv1_name / conv1_item_desc are used by forward(). The
        # conv2_* / conv3_* layers, the gencat / subcat embeddings and self.relu
        # are kept so the registered parameters stay the same.
        self.conv1_name = nn.Conv1d(50, 1, 2, stride=1)
        self.conv2_name = nn.Conv1d(16, 8, 2, stride=1)
        self.conv3_name = nn.Conv1d(8, 4, 2, stride=1)

        self.conv1_item_desc = nn.Conv1d(50, 1, 5, stride=5)
        self.conv2_item_desc = nn.Conv1d(64, 16, 5, stride=1)
        self.conv3_item_desc = nn.Conv1d(16, 4, 5, stride=1)

        self.dropout = nn.Dropout(p=0.2)

        # Width of the concatenated feature vector, derived from the sequence
        # lengths instead of a hard-coded 50:
        #   brand (10) + category name (10) + condition (5)
        #   + name conv output + description conv output + shipping (1).
        # Conv1d output length is (L - kernel) // stride + 1.
        name_len = int(max_sizes.get('max_name_seq', 10))
        item_len = int(max_sizes.get('max_item_desc_seq', 75))
        name_conv_len = (name_len - 2) // 1 + 1
        item_conv_len = (item_len - 5) // 5 + 1
        self.input_fc1_count = 10 + 10 + 5 + name_conv_len + item_conv_len + 1
        self.fc1 = nn.Linear(self.input_fc1_count, 64)
        self.fc2 = nn.Linear(64,32)
        self.fc3 = nn.Linear(32,1)

        self.relu = nn.ReLU()

    def forward(self, x, batchsize):
        """Return the model output for a batch.

        Args:
            x: dict of tensors with keys 'name', 'item_desc', 'brand_name',
                'cat_name', 'item_condition' (integer indices) and
                'shipping' (float).
            batchsize: number of samples in the batch, used to flatten features.

        Returns:
            Tensor with one value per sample (second dimension of size 1).
        """
        # Embeddings come out as (batch, seq, channels); Conv1d wants (batch, channels, seq).
        embed_name = self.name_embedding(x['name'])
        embed_name = F.relu(self.conv1_name(embed_name.transpose(1,2)))
        embed_item = self.item_embedding(x['item_desc'])
        embed_item = F.relu(self.conv1_item_desc(embed_item.transpose(1,2)))
        embed_brand = self.brand_embedding(x['brand_name'])
        embed_condition = self.condition_embedding(x['item_condition'])
        embed_catname = self.catname_embedding(x['cat_name'])
        # The general / sub-category embeddings are not part of the feature
        # vector, so their lookups are no longer computed here.

        out = torch.cat((embed_brand.view(batchsize,-1), embed_catname.view(batchsize,-1),
                         embed_condition.view(batchsize,-1), embed_name.view(batchsize,-1),
                         embed_item.view(batchsize,-1), x['shipping']), 1)

        out = self.fc1(out)
        out = F.relu(self.dropout(out))
        out = self.fc2(out)
        out = self.dropout(out)
        out = self.fc3(out)
        return out

# Collect the table sizes and sequence lengths the model needs, then build it.
max_sizes = dict(
    max_text_name=MAX_TEXT_NAME,
    max_text_item=MAX_TEXT_ITEM,
    max_name_seq=MAX_NAME_SEQ,
    max_item_desc_seq=MAX_ITEM_DESC_SEQ,
    max_brand=MAX_BRAND,
    max_cat_name=MAX_CATEGORY_NAME,
    max_gen_category=MAX_GEN_CATEGORY,
    max_subcat1_category=MAX_SUB_CAT1_CATEGORY,
    max_subcat2_category=MAX_SUB_CAT2_CATEGORY,
    max_condition=MAX_CONDITION,
)

deep_learn_model = RegressionNeural(max_sizes)

In [None]:
# Display the sizes handed to the model.
max_sizes

In [None]:
def train_model(model, criterion, optimizer, num_epochs=1, print_every = 1000, device="cpu"):
    """Train `model` and report the train / validation loss of every epoch.

    Batches come from the notebook-level `mercari_dataloaders` dict (its
    'train' and 'val' entries). Weights are only updated in the 'train' phase.

    Args:
        model: network called as `model(inputs, batch_size)`.
        criterion: loss called as `criterion(outputs, prices)`.
        optimizer: optimizer holding the model parameters.
        num_epochs: number of passes over the training data.
        print_every: print the running average loss every this many batches.
        device: device the model and every batch are moved to.

    Returns:
        The same `model` object after training.
    """
    feature_keys = ('name', 'item_desc', 'brand_name', 'cat_name', 'general_category',
                    'subcat1_category', 'subcat2_category', 'item_condition', 'shipping')
    start = time.time()
    model.to(device)  # once up front rather than on every batch

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            model.train(is_training)  # training mode for 'train', evaluate mode for 'val'

            dataloader = mercari_dataloaders[phase]
            num_batches = len(dataloader)
            num_samples = len(dataloader.dataset)
            running_loss = 0.0
            print_loss_total = 0.0  # reset per phase so windows never mix phases
            print("number of batch in", phase, num_batches)
            print("comming....")

            for i_batch, sample_batched in enumerate(dataloader):
                # Move every model input to the device; shipping is fed as a float feature.
                inputs = {key: sample_batched[key].to(device) for key in feature_keys}
                inputs['shipping'] = inputs['shipping'].float()
                prices = sample_batched['target'].float().to(device)
                batch_size = len(sample_batched['shipping'])

                optimizer.zero_grad()
                # No autograd graph is built during validation.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs, batch_size)
                    loss = criterion(outputs, prices)
                    if is_training:
                        loss.backward()
                        optimizer.step()

                batch_loss = loss.item()
                # Weight by batch size so the short last batch is averaged correctly
                # (assumes `criterion` returns a per-batch mean, as nn.MSELoss does by default).
                running_loss += batch_loss * batch_size
                print_loss_total += batch_loss

                if (i_batch+1) % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0.0
                    print('(%d %d%%) Loss_avg: %.4f' % (i_batch, i_batch / num_batches*100, print_loss_avg), end=', ')
                    time_ongoing = time.time() - start
                    print('Training in {:.0f}m {:.0f}s'.format(time_ongoing // 60, time_ongoing % 60))
            epoch_loss = running_loss / max(num_samples, 1)
            print('{} Loss: {:.4f}'.format(phase, epoch_loss), end="\n\n")

        print()

    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    return model

In [None]:
# Mean squared error on the scaled log-price target, optimised with SGD + momentum.
criterion = nn.MSELoss()
optimizer_ft = optim.SGD(deep_learn_model.parameters(), lr=0.001, momentum=0.9)


# Use the GPU when there is one and fall back to the CPU otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_model(deep_learn_model,criterion,optimizer_ft,num_epochs=20, device=device)

In [None]:
# Predict prices for every batch of a dataloader
def validate(model, dataloader, print_every = 500, device = 'cpu'):
    """Run `model` over `dataloader` and return the predicted prices.

    The outputs are mapped back to prices by inverting the notebook-level
    `target_scaler` and then the log(price + 1) transform. Predictions are
    made in evaluation mode without gradient tracking, and the model's
    previous training / evaluation mode is restored afterwards.

    Args:
        model: network called as `model(inputs, batch_size)`.
        dataloader: iterable of sample batches. Predictions follow its order.
        print_every: print a progress marker every this many batches.
        device: device the model and every batch are moved to.

    Returns:
        1-D float64 numpy array with one price per sample. A batch whose
        prediction fails is reported and filled with zeros (best effort).
    """
    feature_keys = ('name', 'item_desc', 'brand_name', 'cat_name', 'general_category',
                    'subcat1_category', 'subcat2_category', 'item_condition', 'shipping')
    num_batches = len(dataloader)
    print('num batches: ',num_batches)

    was_training = model.training
    model.to(device)
    model.eval()  # dropout must be off while predicting

    # The empty float64 seed keeps the result float64 and valid for an empty loader.
    batch_predictions = [np.array([])]
    try:
        with torch.no_grad():
            for i_batch, sample_batched in enumerate(dataloader):
                inputs = {key: sample_batched[key].to(device) for key in feature_keys}
                inputs['shipping'] = inputs['shipping'].float()
                batch_size = len(sample_batched['shipping'])

                try:
                    outputs = model(inputs,batch_size)

                    val_preds = target_scaler.inverse_transform(outputs.cpu().numpy())
                    val_preds = np.exp(val_preds)-1
                    y_pred = val_preds[:,0]
                except Exception as err:
                    # Best effort: keep going, but show what went wrong.
                    print(i_batch, "err, make 0 price", repr(err))
                    y_pred = np.zeros(batch_size)

                batch_predictions.append(y_pred)
                if (i_batch+1) % print_every == 0:
                    print('(%d %d%%)' % (i_batch, i_batch / num_batches*100), end=",")
    finally:
        model.train(was_training)

    # One concatenation instead of np.append per batch.
    return np.concatenate(batch_predictions)

In [None]:
# Predict prices for the test set (its loader is not shuffled).
y_pred_test = validate(deep_learn_model,test_dataloaders,device=device)

In [None]:
# Pair every test id with its predicted price and write the submission file.
submission_columns = {'test_id': test['test_id'], 'price': y_pred_test}
submit = pd.DataFrame(submission_columns)
submit.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame.
submit

In [None]:
# !pip install torchviz
# from torchviz import make_dot

# for i_batch, sample_batched in enumerate(mercari_dataloaders['train']): 
#     inputs = {'name':Variable(sample_batched['name']), 'item_desc':Variable(sample_batched['item_desc']), \
#         'brand_name':Variable(sample_batched['brand_name']), \
#         'cat_name':Variable(sample_batched['cat_name']), \
#         'general_category':Variable(sample_batched['general_category']), \
#         'subcat1_category':Variable(sample_batched['subcat1_category']), \
#         'subcat2_category':Variable(sample_batched['subcat2_category']), \
#         'item_condition':Variable(sample_batched['item_condition']), \
#         'shipping':Variable(sample_batched['shipping'].float())}
#     prices = Variable(sample_batched['target'].float())   
#     batch_size = len(sample_batched['shipping'])
#     if i_batch ==0:
#         a = inputs
#         break
    
    
# batch = a
# deep_learn_model.to('cpu')
# deep_learn_model.eval()
# yhat = deep_learn_model(batch, 50)
# make_dot(yhat, params=dict(list(deep_learn_model.named_parameters()))).render("rnn_torchviz")

In [None]:
from sklearn.metrics import mean_squared_log_error
def RMSLE(y_true:np.ndarray, y_pred:np.ndarray) -> np.float64:
    """Root mean squared logarithmic error between true and predicted values."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [None]:
# Score the hold-out split. Predictions must come from an unshuffled loader,
# otherwise they are compared against the prices of different rows of `dvalid`.
val_eval_loader = DataLoader(mercari_datasets['val'], batch_size=50, shuffle=False)
y_pred_val = validate(deep_learn_model,val_eval_loader,device=device)
y_true_val = dvalid['price']

# mean_squared_log_error rejects negative values, so predictions are clipped at 0.
print('RMSLE in validate data: ', RMSLE(y_true_val, np.clip(y_pred_val, 0, None)))

In [None]:
# Score the training split. Predictions must come from an unshuffled loader,
# otherwise they are compared against the prices of different rows of `dtrain`.
train_eval_loader = DataLoader(mercari_datasets['train'], batch_size=50, shuffle=False)
y_pred_train = validate(deep_learn_model,train_eval_loader,device=device)
y_true_train = dtrain['price']

# mean_squared_log_error rejects negative values, so predictions are clipped at 0.
print('RMSLE in train data: ', RMSLE(y_true_train, np.clip(y_pred_train, 0, None)))