## Hypothesis: Categories have different price distribution
How well does `item_description` describe the category?

### Prepare data

In [1]:
# `pd` and `np` are used throughout this notebook but no visible cell imports
# them; import them here so the notebook runs from a fresh kernel.
import numpy as np
import pandas as pd

# Load the tab-separated training set.
df = pd.read_csv('./data/train.tsv', sep='\t')

In [2]:
# Count missing values per column.
df.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

In [3]:
# Give every distinct non-null category name an integer id starting at 1,
# in order of first appearance; id 0 is reserved for a missing category (NaN).
cate_unique = df['category_name'].dropna().unique()
cate_dict = dict(zip(cate_unique, range(1, len(cate_unique) + 1)))
cate_dict[np.nan] = 0

In [4]:
# category_id -> doc_id
# Translate each row's category name into its integer id from `cate_dict`.
# (`df.category_name.replace(cate_dict)` does the same but was too slow.)
# Missing categories are filled with 0 explicitly instead of relying on
# `cate_dict.get(nan)`: NaN never compares equal to itself, so that dict lookup
# only succeeds when the cell holds the very same object as `np.nan`.
df['cate_id'] = df.category_name.map(cate_dict).fillna(0).astype('int64')

In [5]:
# Issue 1: find rows without a usable description; they are not included in
# training. Two cases: a real null, or the 'No description yet' placeholder.
n_null_desc = df.item_description.isnull().sum()
n_placeholder_desc = (df.item_description == 'No description yet').sum()
print('total data:', len(df))
print('null values:', n_null_desc)
print('no description data:', n_placeholder_desc)

total data: 1482535
null values: 4
no description data: 82489


In [6]:
# Do all rows that have a real item_description also have a category?
# `idx_bool` flags rows whose description is null or the placeholder text.
idx_bool = df.item_description.isin([np.nan, 'No description yet'])
temp = df.loc[~idx_bool, ['cate_id', 'item_description']]
# Number of described rows whose category is unknown (cate_id == 0).
(temp.cate_id == 0).sum()

5705

In [7]:
# Keep only rows that have both a real description and a known category.
keep_rows = ~idx_bool & (df.cate_id != 0)
df_docs = df.loc[keep_rows, ['cate_id', 'item_description']]
# Sanity check: no unknown-category rows remain (both counts should be 0).
df_docs.loc[df_docs.cate_id == 0, :].count()

cate_id             0
item_description    0
dtype: int64

### First test: cluster categories using item names

In [8]:
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import TaggedDocument
from gensim.models import doc2vec
import gensim
from tqdm import tqdm
import os
import string as string_package
from pprint import pprint

In [9]:
# Where the tokenized listing names are cached on disk.
data_file_path = './data/docs_name.txt'
# Raw listing names, one per row of `df`.
names = df.name.values

In [9]:
# Issue 2: the '[rm]' placeholder has to be tokenized as a single word.
def tokenize_sentence(sentence):
    """Tokenize ``sentence`` with ``word_tokenize``, keeping '[rm]' as one token.

    The text is split on every literal '[rm]'; each non-empty piece between
    placeholders is stripped and passed to ``word_tokenize``, and one '[rm]'
    token is emitted for each placeholder, in order.

    Args:
        sentence: the text to tokenize.

    Returns:
        A list of string tokens.
    """
    if '[rm]' not in sentence:
        return word_tokenize(sentence)

    tokens = []
    # Splitting on the placeholder handles any number of occurrences; the
    # previous two-way unpacking raised ValueError as soon as a text contained
    # more than one '[rm]'.
    for position, segment in enumerate(sentence.split('[rm]')):
        if position > 0:
            tokens.append('[rm]')
        segment = segment.strip()
        if segment:
            tokens.extend(word_tokenize(segment))
    return tokens

def remove_puncutation(sentence):
    """Return ``sentence`` with every ASCII punctuation character deleted.

    The characters removed are those in ``string.punctuation``. That set
    includes the backtick and both quote characters, so the two-character
    quote tokens are covered as well, and it includes '[' and ']', so a
    literal '[rm]' comes out as 'rm'.
    """
    drop_table = str.maketrans('', '', string_package.punctuation)
    return sentence.translate(drop_table)

def preprocess_sentence(sentence, stop_words=None):
    """Clean, tokenize and lower-case one piece of text.

    Steps: strip punctuation with ``remove_puncutation``, tokenize with
    ``tokenize_sentence``, lower-case every token, then drop stop words.

    Args:
        sentence: the raw text.
        stop_words: optional iterable of words to drop. Matching is
            case-insensitive.

    Returns:
        A list of lower-cased tokens with stop words removed.
    """
    # Compare lower-cased tokens with lower-cased stop words. The filter used
    # to run on the original-case token, so 'The' slipped past a stop list
    # containing 'the' and was then emitted as 'the'.
    stop_set = {word.lower() for word in stop_words} if stop_words else set()
    cleaned = remove_puncutation(sentence)
    lowered = (token.lower() for token in tokenize_sentence(cleaned))
    return [token for token in lowered if token not in stop_set]

def make_data_file(docs, stop_words=None):
    """Preprocess every document in ``docs`` into a list of tokens.

    Args:
        docs: a list or NumPy array of strings.
        stop_words: optional stop words, forwarded to ``preprocess_sentence``.

    Returns:
        A list with one token list per document, in input order.

    Raises:
        AssertionError: if ``docs`` is neither a list nor an ``np.ndarray``.
    """
    # The previous check, `type(docs) == list or np.ndarray`, was always truthy
    # because the class object `np.ndarray` is itself truthy.
    assert isinstance(docs, (list, np.ndarray)), 'docs must be a list or np.array([something])'

    data = []
    for sentence in tqdm(docs, desc='making data files', total=len(docs)):
        data.append(preprocess_sentence(sentence, stop_words=stop_words))
    return data

In [11]:
# Tokenize every listing name.
data = make_data_file(names)

making data files: 100%|██████████| 1482535/1482535 [03:05<00:00, 7976.25it/s]


In [10]:
def write_data_file(file_path, data):
    """Write tokenized documents to ``file_path`` as UTF-8 text.

    Each document becomes one line, with its tokens joined by tab characters.
    Prints 'Done!' once the file has been written.

    Args:
        file_path: destination path; an existing file is overwritten.
        data: iterable of token lists (lists of strings).
    """
    lines = ('\t'.join(tokens) + '\n' for tokens in data)
    with open(file_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(lines)
    print('Done!')

In [13]:
# Cache the tokenized names on disk.
write_data_file(data_file_path, data)

Done!


In [11]:
def load_data_file(file_path):
    """Read tokenized documents from a tab-separated UTF-8 text file.

    Each line is one document whose tokens are separated by tabs. An empty
    line is read back as an empty token list.

    Args:
        file_path: path of the file to read.

    Returns:
        A list with one token list per line, in file order.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as in_file:
        for line in in_file:
            line = line.strip()
            # ''.split('\t') yields [''], which would turn an empty document
            # into one holding a single empty token.
            data.append(line.split('\t') if line else [])
    # This message used to sit after the return statement and never ran.
    print('Done!')
    return data

In [27]:
# Reload the tokenized names from the cache file.
data = load_data_file(data_file_path)

In [45]:
# Pair every listing's category id with its tokenized name.
# The token lists have different lengths, so keep them in an object Series
# (one list per row). `np.array(data)` on ragged lists raises on recent NumPy
# releases, and silently becomes a 2-D array if all lists share a length.
df_docs_name = pd.DataFrame({'cate_id': df.cate_id,
                             'doc_tokens': pd.Series(data, index=df.index)})
# Remove the 6327 rows with an unknown category (cate_id == 0).
df_docs_name_train = df_docs_name.loc[df_docs_name.cate_id != 0, :]

build model

In [42]:
# One TaggedDocument per listing name, tagged with the listing's category id.
tagged_docs = [
    TaggedDocument(tokens, [cate_id])
    for cate_id, tokens in df_docs_name_train.values
]

In [76]:
# File the name-based Doc2Vec model is saved to and loaded from.
model_path = './data/model/cate_name.model'

In [77]:
# Train a Doc2Vec model on the category-tagged names, then persist it.
# NOTE(review): `size` was renamed `vector_size` in later gensim releases;
# confirm against the installed version before upgrading.
model = doc2vec.Doc2Vec(documents=tagged_docs, size=300, alpha=0.05, min_alpha=0.025, 
                        window=5, min_count=1, workers=3)
# Create the target directory first so the save cannot fail on a missing
# folder once training has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

load model

In [78]:
# Reload the saved model from disk.
model = doc2vec.Doc2Vec.load(model_path)

test

In [28]:
def get_similar_doc(query, cate_dict, model, stop_words=None):
    """Print the five document tags most similar to ``query``.

    The query is preprocessed with ``preprocess_sentence``, turned into a
    vector with ``model.infer_vector`` and compared against the model's
    document vectors. Each hit is printed with its category name, looked up
    by inverting ``cate_dict``, and its similarity score.

    Args:
        query: raw text to look up.
        cate_dict: mapping of category name -> integer id.
        model: a trained Doc2Vec model.
        stop_words: optional stop words, forwarded to ``preprocess_sentence``.
    """
    id_to_category = dict((cate_id, name) for name, cate_id in cate_dict.items())
    tokens = preprocess_sentence(query, stop_words=stop_words)
    query_vec = model.infer_vector(tokens)
    nearest = model.docvecs.most_similar([query_vec], topn=5)

    banner = '=' * 5
    print('query: {}'.format(query))
    print(banner + 'most similar docs' + banner)
    for tag, score in nearest:
        category = id_to_category.get(tag)
        print('category: {0} | similarity: {1:.4f}'.format(category, score))

In [79]:
# Use the rows with an unknown category (cate_id == 0) as test queries.
df_test = df.loc[df.cate_id == 0, ['name', 'item_description']]

In [85]:
# Pick one uncategorized listing at random and query the model with its name.
idx = np.random.choice(len(df_test))
query = df_test.iloc[idx]['name']
get_similar_doc(query, cate_dict, model)

query: White Obey Long-Sleeve Shirt Size M
=====most similar docs=====
category: Vintage & Collectibles/Accessories/Handkerchief | similarity:0.7285
category: Kids/Gear/Playard Bedding | similarity:0.7244
category: Handmade/Candles/Sticker | similarity:0.7129
category: Handmade/Woodworking/Accessories | similarity:0.7120
category: Vintage & Collectibles/Furniture/Entertainment | similarity:0.7106


### using item description

In [12]:
# Preview the rows that have both a category and a real description.
df_docs.head()

Unnamed: 0,cate_id,item_description
1,2,This keyboard is in great condition and works ...
2,3,Adorable top with a hint of lace and a key hol...
3,4,New with tags. Leather horses. Retail for [rm]...
4,5,Complete with certificate of authenticity
5,6,"Banana republic bottoms, Candies skirt with ma..."


In [13]:
# NOTE: `data_file_path` and `data` are rebound here; from this cell on they
# refer to the item-description corpus, not the name corpus.
data_file_path = './data/docs_item.txt'
items = df_docs.item_description.values
# Tokenize every item description.
data = make_data_file(items)

making data files: 100%|██████████| 1394337/1394337 [05:21<00:00, 4334.64it/s]


In [16]:
# Cache the tokenized descriptions on disk.
write_data_file(data_file_path, data)

Done!


In [17]:
# Reload the tokenized descriptions from the cache file.
data = load_data_file(data_file_path)

In [18]:
# Pair each kept row's category id with its tokenized description.
# The token lists have different lengths, so store them as an object Series
# aligned to df_docs' index. `np.array(data)` on ragged lists raises on recent
# NumPy releases, and silently becomes a 2-D array if all lists share a length.
df_docs_cate_train = pd.DataFrame({'cate_id': df_docs.cate_id,
                                   'doc_tokens': pd.Series(data, index=df_docs.index)})

build model

In [21]:
# One TaggedDocument per item description, tagged with the row's category id.
tagged_docs = [
    TaggedDocument(tokens, [cate_id])
    for cate_id, tokens in df_docs_cate_train.values
]

In [22]:
# File the description-based Doc2Vec model is saved to and loaded from.
model_path = './data/model/cate_item.model'

In [23]:
# Train a Doc2Vec model on the category-tagged descriptions, then persist it.
# NOTE(review): `size` was renamed `vector_size` in later gensim releases;
# confirm against the installed version before upgrading.
model = doc2vec.Doc2Vec(documents=tagged_docs, size=300, alpha=0.05, min_alpha=0.025, 
                        window=5, min_count=1, workers=3)
# Create the target directory first so the save cannot fail on a missing
# folder once training has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

load model

In [24]:
# Reload the saved model from disk.
model = doc2vec.Doc2Vec.load(model_path)

test

In [26]:
# Use the rows with an unknown category (cate_id == 0) as test queries.
# NOTE(review): these rows are not filtered on item_description, so a sampled
# query may be null or the 'No description yet' placeholder.
df_test = df.loc[df.cate_id == 0, ['name', 'item_description']]

In [38]:
# Pick one uncategorized listing at random and query the model with its
# description; the name is printed only for context.
idx = np.random.choice(len(df_test))
picked = df_test.iloc[idx]
query = picked['item_description']
print('name: {}'.format(picked['name']))
get_similar_doc(query, cate_dict, model)

name: PREMIUM NFL FOOTBALL HOT PACK! 3 HITS!
query: Each HOT PACK includes the following. -1 to 2 Autographs -1 to 2 Jersey or Relic Cards -1 to 2 Numbered Insert or Parallels -10 Rookie Cards (all current players) -10 Base Cards You will definitely get more than [rm] value in this pack! I am trying to get rid of my collection and I don't have time to post every card. Players that could be included in the hot packs are: Brett Favre, Julian Edelman, Allen Robinson, Chad Johnson and more! [rm] SHIPPED FOR EACH PACK Let me know if you want more than one hot pack and i can lower the price to save on shipping! :) THIS IS A GREAT DEAL!! 3 HITS FOR [rm]
=====most similar docs=====
category: Home/Bath/Bathroom Shelves | similarity: 0.6268
category: Other/Automotive/Oils & Fluids | similarity: 0.6057
category: Handmade/Woodworking/Boxes | similarity: 0.6045
category: Handmade/Woodworking/Accessories | similarity: 0.6007
category: Handmade/Ceramics and Pottery/Coasters | similarity: 0.5835
