# Bài tập cuối kỳ môn học máy
* Họ và tên: Dương Quốc Thành
* MSSV: 18021176

# Import các thư viện cần thiết

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
import scipy
from scipy.sparse import csr_matrix, hstack
import gc
import os
from wordcloud import WordCloud
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
        

# Giải nén dữ liệu

In [None]:
# Install the p7zip tool used to unpack the .7z competition archives.
!apt-get install p7zip
# Decompress (-d) each .7z archive, forcing overwrite (-f) and keeping the original archive (-k).
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

# The stage-2 files are shipped as .zip archives, so they are unpacked with unzip instead.
!unzip /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

# Quan sát dữ liệu

In [None]:
# Load the tab-separated training file and the stage-2 test file, then report their shapes.
train, test = (pd.read_csv(path, sep='\t') for path in ('train.tsv', 'test_stg2.tsv'))

for label, frame in (("Train data shape:", train), ("Test data shape:", test)):
    print(label, frame.shape)

In [None]:
# Show the first 20 rows of the training set
train.head(20)

In [None]:
# Inspect the training set: per-column summary from info(), then describe() statistics
train.info()
train.describe()


In [None]:
# Show the first 20 rows of the test set
test.head(20)

In [None]:
# Inspect the test set: per-column summary from info(), then describe() statistics
test.info()
test.describe()

# Phân tích dữ liệu

* Đếm số giá trị khác nhau (unique) của từng cột

In [None]:
# Number of distinct values per column, for each data set.
train_unique_counts = train.nunique()
test_unique_counts = test.nunique()

print("Train data:", train_unique_counts)
print("---------------------")
print("Test data:", test_unique_counts)

* Đếm các giá trị NULL trong 2 tập dữ liệu train và test

In [None]:
# Number of missing values per column, for each data set.
train_null_counts = train.isnull().sum()
test_null_counts = test.isnull().sum()

print("NUll in Train:", train_null_counts)
print("---------------------------------")
print("NUll in Test:", test_null_counts)

* Thay thế các vị trí NULL bằng missing

In [None]:
# Replace NULLs in the text columns with the literal "missing" in both frames.
#
# The result is assigned back to the frame instead of calling
# `frame.column.fillna(..., inplace=True)`: that form operates on an
# intermediate Series, which raises a FutureWarning in pandas 2.x and leaves
# the frame unchanged once Copy-on-Write is enabled.
text_columns = ['category_name', 'brand_name', 'item_description']

for frame in (train, test):
    frame[text_columns] = frame[text_columns].fillna("missing")

* Quan sát dữ liệu sau khi thay thế NULL

In [None]:
# Preview the training set after the NULL replacement
print("Train after: ")
train.head(20)

In [None]:
# Preview the test set after the NULL replacement
print("Test after: ")
test.head(20)

* Chia nhỏ category_name thành 3 cấp danh mục con (subcat_1, subcat_2, subcat_3)

In [None]:
def split_category(category_name):
    """Split a ``level1/level2/level3`` category path into its three levels.

    Args:
        category_name: Category path whose levels are separated by ``/``.

    Returns:
        A tuple ``(sub1, sub2, sub3)``. If the value is not a string, or does
        not contain exactly three levels (for example a placeholder without
        any ``/``), the tuple ``("none", "none", "none")`` is returned.
    """
    try:
        sub1, sub2, sub3 = category_name.split('/')
    except (AttributeError, TypeError, ValueError):
        # AttributeError / TypeError: the value is not a str (e.g. NaN, None).
        # ValueError: the path does not unpack into exactly three levels.
        # A bare `except:` would also swallow KeyboardInterrupt / SystemExit.
        return "none", "none", "none"
    return sub1, sub2, sub3

# Derive the three sub-category columns from category_name, first for the
# training frame and then for the test frame.
for frame in (train, test):
    levels = frame['category_name'].apply(split_category)
    frame['subcat_1'], frame['subcat_2'], frame['subcat_3'] = zip(*levels)

# Report how many distinct values each level has in the training data.
print("There are %d unique subcat_1." % train['subcat_1'].nunique())
print("There are %d unique subcat_2." % train['subcat_2'].nunique())
print("There are %d unique subcat_3." % train['subcat_3'].nunique())

* Quan sát dữ liệu sau khi split category_name

In [None]:
# Preview the training set, which now includes the subcat_1..3 columns
train.head(20)

1. Phân tích cột brand_name

In [None]:
# Frequency of each brand name; print the first 20 entries of the count table.
brands = train['brand_name'].value_counts()
top_brands = brands.iloc[:20]
print(top_brands)

1. Phân tích cột price

So sánh phân phối của Price và log(Price)

In [None]:
# Side-by-side histograms: raw price (clipped to 0-200) and log1p(price).
fig, (ax_price, ax_log_price) = plt.subplots(1, 2, figsize=(15, 5))

ax_price.hist(train['price'], bins=50, range=[0, 200], edgecolor='blue')
ax_price.set_title('Phân phối của Price')
ax_price.set_xlabel('Price')
ax_price.set_ylabel('Số lượng')

ax_log_price.hist(np.log1p(train['price']), bins=50, edgecolor='blue')
ax_log_price.set_title('Phân phối của log(Price)')
ax_log_price.set_xlabel('log(Price)')
ax_log_price.set_ylabel('Số lượng')

plt.show()

Vì log(Price + 1) có phân phối cân đối hơn (gần phân phối chuẩn), còn Price bị lệch phải (đuôi dài về phía giá cao), nên ta sẽ sử dụng log(Price + 1) làm biến mục tiêu.

In [None]:
# Store log1p(price) as a new column (used by the violin plot further down), then preview the frame
train['log_price'] = np.log1p(train['price'])
train.head(20)


In [None]:
# Keep the test ids for the submission file and take log1p(price) as the
# regression target; the raw price column is then removed from the features.
test_id = test['test_id']
target = np.log1p(train['price'])
train = train.drop(columns=['price'])

for shaped in (train, target):
    print(shaped.shape)

1. Phân tích cột item_condition_id

In [None]:
# Bar chart of the number of training rows per item_condition_id value
ax = sns.countplot(x = 'item_condition_id',data=train, palette ='Blues_r')
ax.set_title("Tổng số lượng sản phẩm", fontsize = 14)

1. Phân tích cột shipping

In [None]:
# Number of training rows for each value of the shipping flag
train['shipping'].value_counts()

In [None]:
# Two stacked plots sharing the x axis (shipping value):
#   top    - row count per shipping value
#   bottom - distribution of log_price per shipping value
fig, axes = plt.subplots(2,1,figsize=(5,6), sharex=True)
axes[0].set_title('shipping')
sns.countplot(x='shipping', data=train, palette="ch:.25", color="c", ax=axes[0])
sns.violinplot(x='shipping', y='log_price', palette="ch:.25", data=train, ax=axes[1])
fig.tight_layout()


1. Phân tích cột item_description

In [None]:
# Build one large text from every item description and render it as a word cloud.
all_descriptions = " ".join(train.item_description.astype(str))
cloud_builder = WordCloud(width=2400, height=1200)
wordcloud = cloud_builder.generate(all_descriptions)

plt.figure(figsize=(13, 10))
plt.imshow(wordcloud)
plt.show()

# Vector hóa dữ liệu

1. Vector hóa name

In [None]:
# Bag-of-words counts for the item name.
n_vector = CountVectorizer()

# The vocabulary is learned from the training names only; the test names are
# transformed with that same vocabulary so both matrices share their columns.
Train_Vname = n_vector.fit_transform(train['name'])
Test_Vname = n_vector.transform(test['name'])

print(Train_Vname.shape)
print(Test_Vname.shape)

1. Vector hóa 'brand_name', 'category_name', 'subcat_1', 'subcat_2', 'subcat_3' 

In [None]:
# One-hot encode brand_name as a sparse matrix; fitted on train, applied to test.
# NOTE: `lb` is re-fitted for other columns in later cells, so each transform
# must run right after the matching fit_transform.
lb = LabelBinarizer(sparse_output=True)
Train_Vbrand = lb.fit_transform(train['brand_name'])
Test_Vbrand = lb.transform(test['brand_name'])

print(Train_Vbrand.shape)
print(Test_Vbrand.shape)

In [None]:
# One-hot encode the full category_name (re-fits the shared `lb` binarizer on this column)
Train_Vcategory = lb.fit_transform(train['category_name'])
Test_Vcategory = lb.transform(test['category_name'])

print(Train_Vcategory.shape)
print(Test_Vcategory.shape)

In [None]:
# One-hot encode each sub-category level. `lb` is re-fitted per column, so the
# test transform for a level has to happen before the next fit_transform.
Train_subcat1 = lb.fit_transform(train['subcat_1'])
Test_subcat1 = lb.transform(test['subcat_1'])

Train_subcat2 = lb.fit_transform(train['subcat_2'])
Test_subcat2 = lb.transform(test['subcat_2'])

Train_subcat3 = lb.fit_transform(train['subcat_3'])
Test_subcat3 = lb.transform(test['subcat_3'])

1. Vector hóa item_description

In [None]:
# TF-IDF features for the item description.
t_vector = TfidfVectorizer()

# Vocabulary and IDF weights come from the training descriptions only; the
# test descriptions are transformed with the fitted vectorizer.
Train_Vdescription = t_vector.fit_transform(train['item_description'])
Test_Vdescription = t_vector.transform(test['item_description'])

print(Train_Vdescription.shape)
print(Test_Vdescription.shape)

1. Vector hóa item_condition_id và shipping

In [None]:
# One-hot encode item_condition_id.
#
# The set of levels is taken from the training data and applied to BOTH
# frames through a shared categorical dtype. Calling get_dummies on each frame
# independently builds the columns from whatever values that frame happens to
# contain, so a level missing from one frame would give matrices with
# different (or silently misaligned) columns. With a shared dtype every level
# always gets its own column, and a test value unseen in training becomes an
# all-zero row.
condition_dtype = pd.api.types.CategoricalDtype(
    categories=np.sort(train['item_condition_id'].unique()))

Train_Vcondition_id = scipy.sparse.csr_matrix(
    pd.get_dummies(train['item_condition_id'].astype(condition_dtype), dtype=np.uint8).values)
Test_Vcondition_id = scipy.sparse.csr_matrix(
    pd.get_dummies(test['item_condition_id'].astype(condition_dtype), dtype=np.uint8).values)

print(Train_Vcondition_id.shape)
print(Test_Vcondition_id.shape)

In [None]:
# One-hot encode the shipping flag.
#
# The levels come from the training data and are shared by both frames via a
# categorical dtype, so the train and test matrices always have the same
# columns in the same order. Independent get_dummies calls derive the columns
# from each frame's own values and could otherwise disagree.
shipping_dtype = pd.api.types.CategoricalDtype(
    categories=np.sort(train['shipping'].unique()))

Train_Vshipping = scipy.sparse.csr_matrix(
    pd.get_dummies(train['shipping'].astype(shipping_dtype), dtype=np.uint8).values)
Test_Vshipping = scipy.sparse.csr_matrix(
    pd.get_dummies(test['shipping'].astype(shipping_dtype), dtype=np.uint8).values)

print(Train_Vshipping.shape)
print(Test_Vshipping.shape)

* Đưa dữ liệu về dạng matrix

In [None]:
# Concatenate every training feature block column-wise into one CSR matrix.
# The block order must match the order used for the test matrix.
train_blocks = [
    Train_Vname,
    Train_Vbrand,
    Train_Vdescription,
    Train_Vcategory,
    Train_Vcondition_id,
    Train_Vshipping,
    Train_subcat1,
    Train_subcat2,
    Train_subcat3,
]
M_train = scipy.sparse.hstack(train_blocks).tocsr()

print(M_train.shape)

In [None]:
# Concatenate every test feature block column-wise into one CSR matrix.
# The block order must match the order used for the training matrix.
test_blocks = [
    Test_Vname,
    Test_Vbrand,
    Test_Vdescription,
    Test_Vcategory,
    Test_Vcondition_id,
    Test_Vshipping,
    Test_subcat1,
    Test_subcat2,
    Test_subcat3,
]
M_test = scipy.sparse.hstack(test_blocks).tocsr()

print(M_test.shape)

In [None]:
# Force a garbage-collection pass now that the feature matrices are assembled
gc.collect()

# Model

* Hàm tính rmsle

In [None]:
def rmsle(y, y_pred):
    """Return the root mean squared logarithmic error between two value sets.

    Both inputs are passed through ``log1p`` here, so they must be given on
    the original (non-log) scale.

    Args:
        y: True values.
        y_pred: Predicted values.

    Returns:
        Square root of the mean squared difference of ``log1p(y)`` and
        ``log1p(y_pred)``.
    """
    log_difference = np.log1p(y) - np.log1p(y_pred)
    mean_squared = np.mean(log_difference ** 2)
    return np.sqrt(mean_squared)

* Model sử dụng Ridge Regression

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with the library's default settings.
ridge_model = Ridge()

# `target` is log1p(price), so the model learns and predicts in log space.
ridge_model.fit(M_train, target)

# NOTE(review): predictions are made on the same rows the model was fitted on,
# so the score printed below is a training error, not a validation score.
prediction = ridge_model.predict(M_train)

gc.collect()

# rmsle() applies log1p itself, hence both arrays are mapped back to price space with expm1 first.
print("rmsle: "+str(rmsle(np.expm1(target), np.expm1(prediction))))


* Model sử dụng LGBM

In [None]:
import lightgbm as lgb

# Hyper-parameters passed straight to LGBMRegressor.
params = {'learning_rate': 0.3,
          'max_depth': 8,
          'num_leaves': 100,
          'verbosity': -1,
          'metric': 'RMSE',
          'min_child_samples': 836}

# `target` is log1p(price), so the model learns and predicts in log space.
cls = lgb.LGBMRegressor(**params)
cls.fit(M_train, target)

# NOTE(review): predictions are made on the rows used for fitting, so the
# score printed below is a training error, not a validation score.
prediction_lgb = cls.predict(M_train)

gc.collect()

# rmsle() applies log1p to its arguments, so both arrays must be mapped back
# to price space first (as the Ridge cell does). Passing the log-space values
# directly would take the logarithm twice and report a number that is neither
# RMSLE nor comparable with the Ridge score.
print("rmsle_lgb: "+str(rmsle(np.expm1(target), np.expm1(prediction_lgb))))

# Submit

In [None]:
# Predict on the test matrix with both models; expm1 inverts the log1p applied to the target,
# giving predictions on the price scale.
prediction_lgbm = np.expm1(cls.predict(M_test))
prediction_ridge = np.expm1(ridge_model.predict(M_test))

gc.collect()

In [None]:
# Assemble the submission frame: one row per test_id with its predicted price.
submit = pd.DataFrame(test_id, columns=['test_id'])
# Only the Ridge predictions are submitted; prediction_lgbm is not used here.
submit['price'] = prediction_ridge
# submit['price'] = prediction

submit.head(20)

In [None]:
# Drop the large intermediate objects that are no longer needed, then run a garbage-collection pass
del test_id, prediction_lgbm, prediction_ridge, M_train, M_test
gc.collect()

In [None]:
# Write the submission file without the DataFrame index column
submit.to_csv('./submission.csv', index=False)