In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

In [None]:
import os
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# vstack - adds rows, hstack - adds columns
# csr_matrix - used to handle sparse matrix
from scipy.sparse import vstack, hstack, csr_matrix

In [None]:
# CountVectorizer - Simply, counts word frequencies 
# TFIDF - More importance/weights on "rare" words. Less importance/weights on "frequent" words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# LabelBinarizer - One-hot encodes labels: ["G","B","R"] -> [[0,1,0],[1,0,0],[0,0,1]] (one 0/1 column per class, classes in sorted order)
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Ridge - Reduces multicollinearity in regression. Applies L2 Regularization
from sklearn.linear_model import Ridge

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use(style='ggplot')
plt.rcParams['figure.figsize'] = (10, 6)

In [None]:
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Install the p7zip command-line tool, then run it on the three .7z competition archives.
# NOTE(review): -d/-f/-k presumably mean decompress / force overwrite / keep the
# source archive — confirm against the p7zip man page.
!apt-get install p7zip
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/train.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/test.tsv.7z
!p7zip -d -f -k /kaggle/input/mercari-price-suggestion-challenge/sample_submission.csv.7z

In [None]:
!unzip /kaggle/input/mercari-price-suggestion-challenge/sample_submission_stg2.csv.zip
!unzip /kaggle/input/mercari-price-suggestion-challenge/test_stg2.tsv.zip

In [None]:
# Create training set (tab-separated file extracted into the working directory)
train = pd.read_csv('train.tsv', sep = '\t')
# Create testing set (the stage-2 test file)
test = pd.read_csv('test_stg2.tsv', sep='\t')

In [None]:
# Report the (rows, columns) shape of each frame.
print ("Train data shape:", train.shape)
print ("Test data shape:", test.shape)

In [None]:
# Preview the first ten training rows.
train.head(10)

In [None]:
# Create log price variable (Transformation)
# log1p(price) = log(1 + price); the result is kept in `y`, `train['price']` is not modified here.
y = np.log1p(train['price'])

In [None]:
# Column overview: dtypes and non-null counts (reveals which columns contain missing values).
train.info()

*category_name, brand_name, item_description có chứa giá trị null*

In [None]:
# Stack the train and test rows into one frame so text statistics can be
# computed over every listing. ignore_index=True gives the result a fresh
# 0..n-1 index; without it the train and test row labels overlap, which makes
# label-based selection and alignment on `combined` ambiguous.
combined = pd.concat([train, test], ignore_index=True)

# Create the submission set (Only contains the test ID)
submission = test[['test_id']]

# Number of training rows, i.e. the position where the test rows start in `combined`
train_size = len(train)

In [None]:
# (rows, columns) of the stacked train + test frame.
combined.shape

*Tổng số dòng dữ liệu sau khi hợp nhất train và test là 4943260.*

In [None]:
# Draw a 10% random subset of the combined data and renumber its rows.
# A fixed random_state makes the subset identical on every run, so anything
# derived from it is reproducible after "Restart & Run All".
combined_ML = combined.sample(frac=0.1, random_state=42).reset_index(drop=True)

In [None]:
# (rows, columns) of the 10% sample.
combined_ML.shape

In [None]:
# Summary statistics of price, each rendered as a fixed-point string so that
# large values are not shown in scientific notation.
train.price.describe().apply(lambda x: format(x, 'f'))

* Giá trung bình trong tập dữ liệu là 26 đô la
* Giá trị trung vị trong tập dữ liệu là 17 Đô la
* Giá tối đa trong tập dữ liệu là 2000 Đô la
* Do phân phối giá bị lệch (skewed), giá trị trung vị là thước đo đáng tin cậy hơn giá trung bình để đánh giá.

In [None]:
# 99th percentile of the training prices.
np.percentile(train.price, 99)

**Loại bỏ chấm câu - punctuation**

In [None]:
from string import punctuation

In [None]:
# Display the punctuation characters imported from the string module.
punctuation

In [None]:
import string
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call, because the function is applied
# row by row to the description column.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(sentence: str) -> str:
    """Return ``sentence`` with all ``string.punctuation`` characters removed."""
    return sentence.translate(_PUNCTUATION_TABLE)

In [None]:
# Pair every punctuation character with the empty string, giving a list of
# (symbol, replacement) tuples that describe "delete this symbol".
punctuation_symbols = [(symbol, '') for symbol in punctuation]

punctuation_symbols

**Loại bỏ chữ số**

In [None]:
def remove_digits(x):
    """Return ``x`` without the characters for which ``str.isdigit()`` is true."""
    return ''.join(ch for ch in x if not ch.isdigit())

**Loại bỏ từ dừng - stopword**

In [None]:
from nltk.corpus import stopwords

stop = stopwords.words('english')
# Set copy of the stop-word list: membership tests become constant-time
# instead of a linear scan of the list for every token of every description.
_stop_set = frozenset(stop)


def remove_stop_words(x):
    """Lower-case ``x`` and drop English stop words.

    Tokens are produced by splitting on single spaces; the surviving tokens
    are joined back together with single spaces.
    """
    return ' '.join(token for token in x.lower().split(' ') if token not in _stop_set)

**Chuyển về chữ thường**

In [None]:
def to_lower(x):
    """Return ``x`` converted to lower case via its ``lower()`` method."""
    return x.lower()

In [None]:
# Number of non-null values in each column.
train.count()

In [None]:
# Data type of each column.
train.dtypes

In [None]:
# Summary statistics of the training prices.
train.price.describe()

In [None]:
# Bucket prices into four labelled ranges with edges at 0, 10, 17, 29 and 2001.
# pd.cut uses right-closed intervals, so by default the first bucket is (0, 10]
# and an item priced exactly 0 would get NaN and silently drop out of the
# grouped summary. include_lowest=True closes the first bucket on the left.
bins = [0, 10, 17, 29, 2001]
labels = ['q1','q2','q3','q4']
train['price_bin'] = pd.cut(train['price'], bins=bins, labels=labels, include_lowest=True)
train.groupby('price_bin')['price'].describe()

In [None]:
# Histogram of listing prices, restricted to the 0-250 range.
fig, ax = plt.subplots(figsize=(12, 7))
ax.hist(train['price'], bins=50, range=[0,250], label='price')
ax.set_title('Price Distribution', fontsize=15)
ax.set_xlabel('Price', fontsize=15)
ax.set_ylabel('Samples', fontsize=15)
ax.tick_params(axis='both', labelsize=15)
ax.legend(fontsize=15)
plt.show()

In [None]:
# Split prices by the value of the `shipping` flag.
shipping = train[train['shipping']==1]['price']
no_shipping = train[train['shipping']==0]['price']

plt.figure(figsize=(12,7))
# density=True scales each histogram to unit area. The two groups differ in
# size, so only normalized histograms are comparable, and the y-axis label
# already promises normalized values.
plt.hist(shipping, bins=50, range=[0,250], alpha=0.7, density=True, label='Price With Shipping')
plt.hist(no_shipping, bins=50, range=[0,250], alpha=0.7, density=True, label='Price With No Shipping')
plt.title('Price Distribution With/Without Shipping', fontsize=15)
plt.xlabel('Price')
plt.ylabel('Normalized Samples')
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.legend(fontsize=15)
plt.show()

In [None]:
# Number of distinct brand names in the training set
train['brand_name'].nunique()

In [None]:
# Top 20 Brand Distribution
# rename_axis + reset_index(name=...) name both columns explicitly. The old
# rename of 'index' -> 'brand_name' and 'brand_name' -> 'count' relied on how
# value_counts() labels its result, which changed in pandas 2.0 (there it
# left the frame without a 'brand_name' column and the plot failed).
b20 = (
    train['brand_name']
    .value_counts()
    .head(20)
    .rename_axis('brand_name')
    .reset_index(name='count')
)
ax = sns.barplot(x="brand_name", y="count", data=b20)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90)
ax.set_title('Top 20 Brand Distribution', fontsize=15)
plt.show()

In [None]:
# Display Top 20 Expensive Brands By Mean Price
# numeric_only=True restricts the per-brand mean to numeric columns; averaging
# the text/categorical columns as well raises a TypeError on pandas >= 2.0.
top20_brand = train.groupby('brand_name').mean(numeric_only=True)
df_expPrice = (
    top20_brand['price']
    .sort_values(ascending=False)
    .head(20)
    .reset_index()
)

ax = sns.barplot(x="brand_name", y="price", data=df_expPrice)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90, fontsize=15)
ax.set_title('Top 20 Expensive Brand', fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.show()

# Display frequency ratio of brand names (share of listings per brand)
train['brand_name'].value_counts(normalize=True)

* Chúng em thấy có thể cần phải tách các danh mục để có thể so khớp các danh mục chung và có được thông tin chi tiết tốt hơn từ dữ liệu.
* Tách category_name thành category_main, category_sub1, category_sub2

In [None]:
def transform_category_name(category_name):
    """Split a 'main/sub1/sub2' category path into its three levels.

    Returns a ``(main, sub1, sub2)`` tuple. A value that is not a string
    (for example a missing category stored as NaN) or that does not consist
    of exactly three '/'-separated parts yields ``(nan, nan, nan)``.
    """
    try:
        main, sub1, sub2 = category_name.split('/')
    except (AttributeError, ValueError):
        # AttributeError: the value has no .split (not a string).
        # ValueError: the path does not have exactly three levels.
        # Any other exception is a real error and is allowed to propagate
        # instead of being hidden by a bare ``except``.
        return np.nan, np.nan, np.nan
    return main, sub1, sub2

# Expand every category path into its three levels and attach them to `train`.
category_levels = train['category_name'].apply(transform_category_name)
train['category_main'], train['category_sub1'], train['category_sub2'] = zip(*category_levels)

# Category levels together with price, for the category-level analysis.
cat_train = train[['category_main','category_sub1','category_sub2', 'price']]

cat_train.head()

In [None]:
# Price summary statistics per main category.
# Electronics have the highest std
train.groupby('category_main')['price'].describe()

In [None]:
# Share of listings that falls into each main category.
train['category_main'].value_counts(normalize=True)

In [None]:
# Count of listings per main category, ordered from most to least frequent.
plt.figure(figsize=(17,10))
# NOTE(review): orient='v' is passed together with y=...; confirm this combination is intended.
sns.countplot(y = train['category_main'], order = train['category_main'].value_counts().index, orient = 'v')
plt.title('Top 10 Categories Main', fontsize = 25)
plt.ylabel('Main Category', fontsize = 20)
plt.xlabel('Number of Items in Main Category', fontsize = 20)
plt.show()

* Women và Beauty chiếm phần lớn trong phân phối
* Women và Beauty chiếm 56% phân phối

In [None]:
# Price spread per main category, limited to items priced below 100.
fig, axes = plt.subplots(figsize=(12, 7))
main = cat_train[cat_train["price"]<100]
# Use a color palette
ax = sns.boxplot(data=main, x="category_main", y="price", palette="Blues", ax=axes)
ax.set_xticklabels(ax.get_xticklabels(),rotation=90, fontsize=12)

plt.show()

In [None]:
# Flag listings whose brand name is missing (True = no brand given).
train['no_brand'] = train['brand_name'].isna()

In [None]:
# Listings per main category, split by whether a brand name is present.
f, ax = plt.subplots(figsize=(15, 4))
sns.countplot(data=train, y='category_main', hue='no_brand', ax=ax)
ax.set_title('Category Distribution With/Without Brand')
plt.show()

* Ta thấy sản phẩm của "Men" và "Women" không có nhãn hiệu phân phối nhiều hơn có nhãn hiệu
* "Electronics" và "Sport & Outdoors" có nhãn hiệu phân phối nhiều hơn không nhãn hiệu
* ...

In [None]:
# Mean price per category_sub2 value, keeping the 20 highest means.
df = cat_train.groupby('category_sub2')['price'].mean().reset_index(name='mean')
df = df.sort_values('mean', ascending=False).head(20)

positions = range(len(df))
plt.figure(figsize=(20, 15))
plt.barh(positions, df['mean'], align='center', alpha=0.5, color='r')
plt.yticks(positions, df['category_sub2'], fontsize=15)

plt.xlabel('Price', fontsize=15)
plt.ylabel('Sub Category 2', fontsize=15)
plt.title('Top 20 2nd Category (Mean Price)', fontsize=20)
plt.show()

* Top 20 giá trung bình của các nhãn category_sub2

In [None]:
# Mean price per category_sub1 value, keeping the 20 highest means.
df = cat_train.groupby('category_sub1')['price'].mean().reset_index(name='mean')
df = df.sort_values('mean', ascending=False).head(20)

positions = range(len(df))
plt.figure(figsize=(20, 15))
plt.barh(positions, df['mean'], align='center', alpha=0.5, color='b')
plt.yticks(positions, df['category_sub1'], fontsize=15)

plt.xlabel('Price', fontsize=15)
plt.ylabel('Sub Category 1', fontsize=15)
plt.title('Top 20 1st Category (Mean Price)', fontsize=20)
plt.show()

* Top 20 giá trung bình của các nhãn category_sub1

In [None]:
# Replace price by log1p(price) for modelling.
# The raw price is stashed in 'price_raw' the first time this cell runs and
# the transform is always computed from that copy, so re-running the cell
# cannot apply the logarithm a second time.
if 'price_raw' not in train.columns:
    train['price_raw'] = train['price']
train['price'] = np.log1p(train['price_raw'])

In [None]:
# Number of listings for each value of the shipping flag.
train['shipping'].value_counts()

In [None]:
# Most frequent item descriptions.
train['item_description'].value_counts().head()

In [None]:
# Number of listings per item condition id.
train['item_condition_id'].value_counts()

In [None]:
# Most frequent brand names.
train['brand_name'].value_counts().head()

In [None]:
# Ten most frequent full category paths.
train['category_name'].value_counts().head(10)

In [None]:
# Re-check the frame shapes after the columns added above.
print ("Train data shape:", train.shape)
print ("Test data shape:", test.shape)

In [None]:
# Preview the training rows again, now including the derived columns.
train.head(10)

In [None]:
# Build a description/price frame and strip digits and punctuation from the text.
# Cast to str first so that missing descriptions can be processed like any other text.
combined['item_description'] = combined['item_description'].astype(str)

# .copy() makes `descr` an independent frame; assigning new columns to a bare
# slice of `combined` triggers SettingWithCopyWarning and may not stick.
descr = combined[['item_description', 'price']].copy()
# Length of the raw description in characters (measured before cleaning).
descr['count'] = descr['item_description'].str.len()

descr['item_description'] = (
    descr['item_description']
    .apply(remove_digits)
    .apply(remove_punctuation)
)
descr.head(3)

*Loại bỏ dấu câu và các chữ số trong item_description*


In [None]:
from nltk.stem.porter import PorterStemmer

from functools import lru_cache

porter = PorterStemmer()

# PorterStemmer.stem() works on a single word. Passing a whole description
# treats the entire text as one token, so the text is stemmed token by token.
# Stems are memoized because the same words recur across many descriptions.
_cached_stem = lru_cache(maxsize=None)(porter.stem)


def stem_text(text):
    """Return ``text`` with every whitespace-separated token Porter-stemmed.

    Tokens are re-joined with single spaces.
    """
    return ' '.join(_cached_stem(token) for token in text.split())


descr['item_description'] = descr['item_description'].apply(stem_text)

*Làm sạch item_description*

In [None]:
# Last ten rows of the cleaned description frame.
descr.tail(10)

In [None]:
# Mean price for each description length, with a fitted regression line.
# 'count' is len(item_description), i.e. a number of characters, so the
# x-axis is labelled as a character count rather than a word count.
df = descr.groupby('count')['price'].mean().reset_index()
sns.regplot(x=df["count"], y=(df["price"]))
plt.xlabel("description length (characters)")
plt.show()

**Có một mối quan hệ tuyến tính dương giữa độ dài mô tả (số ký tự) và giá trong khoảng 0-300 ký tự. Sau đó, mối quan hệ chuyển dần sang âm và giảm xuống ở khoảng 1000 ký tự.**

In [None]:
def category_split(category_name):
    """Split a '/'-separated category path into exactly three levels.

    Returns a list ``[main, sub, item]``:

    * a non-string value (e.g. NaN for a missing category) gives
      ``['Missing', 'Missing', 'Missing']``;
    * a path with fewer than three levels is padded with ``'Missing'``;
    * a path with more than three levels keeps its first three.

    Always returning three items lets callers unpack the result straight into
    three columns; a list of any other length would break that unpacking.
    """
    if not isinstance(category_name, str):
        return ['Missing', 'Missing', 'Missing']
    levels = category_name.split('/')[:3]
    return levels + ['Missing'] * (3 - len(levels))

In [None]:
# Derive the three category-level columns for both frames.
train_levels = train['category_name'].apply(category_split)
train['main_cat'], train['sub_cat'], train['item_cat'] = zip(*train_levels)

test_levels = test['category_name'].apply(category_split)
test['main_cat'], test['sub_cat'], test['item_cat'] = zip(*test_levels)

In [None]:
# Replace missing values in the training text columns with the literal 'Missing'.
for col in ('category_name', 'brand_name', 'item_description'):
    train[col] = train[col].fillna(value='Missing')

In [None]:
# Replace missing values in the test text columns with the literal 'Missing'.
for col in ('category_name', 'brand_name', 'item_description'):
    test[col] = test[col].fillna(value='Missing')

*Các giá trị null trong tập train và test được thay bằng giá trị "Missing"*

In [None]:
# Token-count features for the listing name. The vectorizer is fitted on the
# training names only and then reused to encode the test names.
cnt_vec = CountVectorizer()

X_train_name = cnt_vec.fit_transform(train['name'])
X_test_name = cnt_vec.transform(test['name'])

In [None]:
# Shapes of the name feature matrices for train and test.
print(X_train_name.shape)
print(X_test_name.shape)

In [None]:
# TF-IDF features for the item description: 1- to 3-grams, English stop words
# removed, max_features set to 50000. Fitted on train, then applied to test.
tfidf_descp = TfidfVectorizer(max_features=50000, ngram_range=(1, 3), stop_words='english')

X_train_descp = tfidf_descp.fit_transform(train['item_description'])
X_test_descp = tfidf_descp.transform(test['item_description'])

In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Binarize the simple categorical columns into sparse matrices. Each binarizer
# is fitted on the training column and reused unchanged on the test column.

# Brand name
lb_brand_name = LabelBinarizer(sparse_output=True)
X_train_brand = lb_brand_name.fit_transform(train['brand_name'])
X_test_brand = lb_brand_name.transform(test['brand_name'])

# Item condition id
lb_item_cond_id = LabelBinarizer(sparse_output=True)
X_train_item_condition_id = lb_item_cond_id.fit_transform(train['item_condition_id'])
X_test_item_condition_id = lb_item_cond_id.transform(test['item_condition_id'])

# Shipping flag
lb_shipping = LabelBinarizer(sparse_output=True)
X_train_shipping = lb_shipping.fit_transform(train['shipping'])
X_test_shipping = lb_shipping.transform(test['shipping'])

**Biến đổi Brand_name, Item_condition_id,shipping**

In [None]:
# Binarize the three category levels into sparse matrices (fit on train, apply to test).

# Main category
lb_main_cat = LabelBinarizer(sparse_output=True)
X_train_main_cat = lb_main_cat.fit_transform(train['main_cat'])
X_test_main_cat = lb_main_cat.transform(test['main_cat'])

# Sub category
lb_sub_cat = LabelBinarizer(sparse_output=True)
X_train_sub_cat = lb_sub_cat.fit_transform(train['sub_cat'])
X_test_sub_cat = lb_sub_cat.transform(test['sub_cat'])

# Item category
lb_item_cat = LabelBinarizer(sparse_output=True)
X_train_item_cat = lb_item_cat.fit_transform(train['item_cat'])
X_test_item_cat = lb_item_cat.transform(test['item_cat'])

**Biến đổi main_cat,sub_cat và item_cat**

In [None]:
# Print the container type of the brand / condition / shipping matrices (train, then test).
print(type(X_train_brand), type(X_train_item_condition_id), type(X_train_shipping))
print(type(X_test_brand), type(X_test_item_condition_id), type(X_test_shipping))

In [None]:
# Shapes of the encoded training feature matrices.
print('X_train_brand shape:', X_train_brand.shape)
print('X_train_item_cond_id shape:', X_train_item_condition_id.shape)
print('X_train_shipping shape:', X_train_shipping.shape)
print('X_train_main_cat shape:', X_train_main_cat.shape)
print('X_train_sub_cat shape:', X_train_sub_cat.shape)
print('X_train_item_cat shape:', X_train_item_cat.shape)

In [None]:
# Shapes of the encoded test feature matrices.
print('X_test_brand shape:', X_test_brand.shape)
print('X_test_item_cond_id shape:', X_test_item_condition_id.shape)
print('X_test_shipping shape:', X_test_shipping.shape)
print('X_test_main_cat shape:', X_test_main_cat.shape)
print('X_test_sub_cat shape:', X_test_sub_cat.shape)
print('X_test_item_cat shape:', X_test_item_cat.shape)

In [None]:
from scipy.sparse import hstack
import gc

In [None]:
# All training feature blocks, in the order they will be concatenated column-wise.
sparse_matrix_list = (X_train_name, X_train_descp, X_train_brand, 
                      X_train_item_condition_id, X_train_shipping, 
                      X_train_main_cat, X_train_sub_cat, X_train_item_cat)

In [None]:
# Concatenate the blocks side by side, convert to CSR, and print type and shape as a sanity check.
X_train = hstack(sparse_matrix_list).tocsr()
print(type(X_train), X_train.shape)

In [None]:
# Drop the sanity-check matrix and ask the garbage collector to reclaim its memory.
del X_train
gc.collect()

In [None]:
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between ``y`` and ``y_pred``."""
    log_diff = np.log1p(y) - np.log1p(y_pred)
    return np.sqrt(np.mean(log_diff ** 2))


def evaluate_orig_price(y_test, preds):
    """RMSLE for targets and predictions given on the log1p scale.

    Both inputs are mapped back with ``expm1`` before ``rmsle`` is applied.
    """
    return rmsle(np.expm1(y_test), np.expm1(preds))

In [None]:
def model_train_predict(model, matrix_list, test_size=0.2, random_state=42):
    """Fit ``model`` on a train split and predict the held-out split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    matrix_list : sequence of sparse matrices
        Feature blocks with one row per row of ``train``; they are
        concatenated column-wise.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int or None, default 42
        Seed for the split. A fixed seed means every call evaluates on the
        same held-out rows, so scores for different feature sets are directly
        comparable and stable across runs. Pass ``None`` for a random split.

    Returns
    -------
    (preds, y_test)
        Predictions for the held-out rows and the matching ``train['price']``
        values.
    """
    X = hstack(matrix_list).tocsr()
    X_train, X_test, y_train, y_test = train_test_split(
        X, train['price'], test_size=test_size, random_state=random_state
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Release the large intermediates before returning.
    del X, X_train, X_test, y_train
    gc.collect()

    return preds, y_test

In [None]:
# Compare Ridge on two feature sets: without and with the description TF-IDF.
linear_model = Ridge(solver='lsqr', fit_intercept=False)

# Run 1: every feature block except the item description.
sparse_matrix_list = (X_train_name, X_train_brand,
                      X_train_item_condition_id, X_train_shipping,
                      X_train_main_cat, X_train_sub_cat, X_train_item_cat)

linear_preds, y_test = model_train_predict(model=linear_model,
                                           matrix_list=sparse_matrix_list)

# This run excludes X_train_descp, so it is labelled accordingly (both runs
# used to print the same 'Item Description rmsle:' label).
print('Without Item Description rmsle:', evaluate_orig_price(y_test, linear_preds))

# Run 2: the same blocks plus the item description TF-IDF.
sparse_matrix_list = (X_train_name, X_train_descp, X_train_brand,
                      X_train_item_condition_id, X_train_shipping,
                      X_train_main_cat, X_train_sub_cat, X_train_item_cat)

linear_preds, y_test = model_train_predict(model=linear_model,
                                           matrix_list=sparse_matrix_list)
print('With Item Description rmsle:', evaluate_orig_price(y_test, linear_preds))

In [None]:
# Full training design matrix: all feature blocks concatenated column-wise, in CSR format.
sparse_matrix_list = (X_train_name, X_train_descp, X_train_brand, 
                      X_train_item_condition_id, X_train_shipping, 
                      X_train_main_cat, X_train_sub_cat, X_train_item_cat)

X_train = hstack(sparse_matrix_list).tocsr()
X_train

In [None]:
# Test design matrix, built from the test blocks in the same order as the training matrix.
sparse_matrix_list = (X_test_name, X_test_descp, X_test_brand, 
                      X_test_item_condition_id, X_test_shipping, 
                      X_test_main_cat, X_test_sub_cat, X_test_item_cat)
X_test = hstack(sparse_matrix_list).tocsr()

In [None]:
# Training target.
# NOTE(review): this relies on train['price'] having been replaced by
# log1p(price) in an earlier cell — confirm that cell ran exactly once.
y_train = train['price']
y_train

In [None]:
# Refit the Ridge model on the complete training matrix.
linear_model.fit(X_train, y_train)

In [None]:
# Predict for the test matrix; values are on the same scale as y_train.
preds = linear_model.predict(X_test)
preds

In [None]:
# Apply expm1, the inverse of log1p, to map the predictions back from the log scale.
preds = np.expm1(preds)
preds

In [None]:
# Load the stage-2 sample submission as the template for the output file.
submission = pd.read_csv('sample_submission_stg2.csv')
submission

In [None]:
# `preds` is in the row order of `test`. Check that the submission template
# has the same IDs in the same order before copying the prices positionally;
# otherwise prices would silently be attached to the wrong items.
assert len(submission) == len(preds), 'prediction count does not match submission rows'
assert (submission['test_id'].to_numpy() == test['test_id'].to_numpy()).all(), \
    'sample submission and test set are ordered differently'
submission.loc[:, 'price'] = preds
submission

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)