In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the tab-separated training file into a DataFrame.
data_folder = "../../Data/Tsv_data/"
train_file = "train.tsv"
test_file = "test.tsv"
train_path = data_folder + train_file
train_data = pd.read_csv(train_path, sep="\t")
# The test split is not loaded in this notebook:
#test_data = pd.read_csv(data_folder + test_file, delimiter = "\t")

Tokenize and serialize words for item description and names

In [4]:
# Replace the literal marker "[rm]" with the single token "special_rm" in both
# free-text columns.
#
# The marker is matched as a plain substring (regex=False). The earlier pattern
# "\[rm\]" only matched "[rm]" when str.replace interpreted it as a regular
# expression, and that default is not the same across pandas releases; a
# literal match does not depend on it. Missing values are left untouched by the
# .str accessor.
for text_column in ("name", "item_description"):
    train_data[text_column] = train_data[text_column].str.replace(
        "[rm]", "special_rm", regex=False)

In [5]:
# Build the tokenizer shared by the name and description fields.
# See the NLTK TweetTokenizer documentation for the exact effect of each flag.
from nltk.tokenize import TweetTokenizer, word_tokenize

tknzr = TweetTokenizer(
    preserve_case=False,
    strip_handles=False,
    reduce_len=True,
)

In [6]:
# A parenthesised single-string print behaves the same as a Python 2 print
# statement and as the Python 3 print function.
print("begin tokenizing names and items...")


def _tokenize_text(text):
    """Return the token list for one text field, or [] when the value is missing.

    Byte strings are decoded as UTF-8 before tokenizing; text that is already
    unicode is handed to the tokenizer unchanged.
    """
    if pd.isnull(text):
        return []
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return tknzr.tokenize(text)


# Both columns go through the same helper so a missing name is handled exactly
# like a missing description (empty token list) instead of raising.
splitted_names = train_data["name"].apply(_tokenize_text)
splitted_description = train_data["item_description"].apply(_tokenize_text)

print("finished tokenizing")

begin tokenizing names and items...
finished tokenizing


In [7]:
# Bag-of-words dictionary: token -> number of occurrences.
from collections import Counter

all_words = Counter()


def add_words_to_dict(wlist):
    """Add one occurrence to ``all_words`` for every token in ``wlist``."""
    for token in wlist:
        all_words[token] = all_words[token] + 1
        
# Feed every token list (names first, then descriptions) into the counter.
# apply() is used only for its side effect; its result is discarded into `_`.
for token_lists in (splitted_names, splitted_description):
    _ = token_lists.apply(add_words_to_dict)

In [8]:
# Build the vocabulary: every word seen more than `threshold` times gets an
# index, starting at 1 and following the iteration order of `all_words`.
# Two extra entries are appended after the regular words:
#   - "special_rare":    stands in for words at or below the threshold
#   - "special_missing": stands in for an empty / missing text
threshold = 50
frequent_words = [word for word, count in all_words.items() if count > threshold]
all_words_usable = dict(zip(frequent_words, range(1, len(frequent_words) + 1)))

# First index not used by a regular word.
idx = len(frequent_words) + 1
all_words_usable["special_rare"] = idx
all_words_usable["special_missing"] = idx + 1

In [9]:
# Substitute words with their vocabulary index.
def serialize_word_list(x):
    """Map a list of tokens to a list of indices from ``all_words_usable``.

    An empty list becomes a one-element list holding the "special_missing"
    index; a token absent from the vocabulary is mapped to the "special_rare"
    index.
    """
    vocabulary = all_words_usable
    if len(x) > 0:
        indices = []
        for token in x:
            if token in vocabulary:
                indices.append(vocabulary[token])
            else:
                indices.append(vocabulary["special_rare"])
        return indices
    return [vocabulary["special_missing"]]
    
# Turn the token lists of both text fields into lists of vocabulary indices.
splitted_names_seq = splitted_names.apply(serialize_word_list)
splitted_description_seq = splitted_description.apply(serialize_word_list)

Categorize item_condition_id, brand_name and category_name

In [10]:
# Shift the condition ids down by one.
# NOTE: this overwrites the column in place, so running this cell a second
# time shifts the values again.
train_data["item_condition_id"] = train_data["item_condition_id"] - 1

# All categories are kept, except that a category appearing fewer than
# `rare_threshold` times is treated as rare. A more refined treatment could be
# tested later.
rare_threshold = 5
# Missing categories are replaced by a special token (on a new Series, the
# DataFrame column is left as is).
categories = train_data["category_name"].fillna("special_missing")
categories_counts = categories.value_counts()
is_rare_count = categories_counts < rare_threshold
rare_cates = categories_counts[is_rare_count].index.values

In [11]:
# Collapse every rare category into the single "special_rare" token.
is_rare_category = categories.isin(rare_cates)
categories.loc[is_rare_category] = "special_rare"

In [12]:
# Brand names are treated like the categories: missing values are replaced by
# a special token on a new Series.
brand_name = train_data["brand_name"].fillna("special_missing")

In [13]:
# A brand appearing fewer than `rare_threshold` times is collapsed into the
# "special_rare" token. Note that this rebinds `rare_threshold`.
rare_threshold = 3
brand_counts = brand_name.value_counts()
is_rare_brand_count = brand_counts < rare_threshold
rare_brands = brand_counts[is_rare_brand_count].index.values
brand_name.loc[brand_name.isin(rare_brands)] = "special_rare"

In [14]:
# Integer-encode the two categorical text features.
# Each feature gets its own encoder: fit_transform() refits the encoder it is
# called on, so sharing a single instance would leave only the brand mapping
# available afterwards and the category mapping could not be recovered.
category_le = LabelEncoder()
brand_le = LabelEncoder()
categories_clean = category_le.fit_transform(categories)
brand_name_clean = brand_le.fit_transform(brand_name)
# Keep the original name bound; as before, `le` ends up fitted on brand_name.
le = brand_le

# Plain numpy views of the remaining columns.
item_condition_id = train_data["item_condition_id"].values
item_id = train_data["train_id"].values
shipping = train_data["shipping"].values
# Target: log(price + 1).
price = np.log(train_data["price"].values + 1)

Save all the cleaned data to a folder

In [15]:
output_folder = "../../Data/Model_data/train/"

# (file name stem, array) pairs, written in this order.
arrays_to_save = [
    ("X_name", splitted_names_seq.values),
    ("X_description", splitted_description_seq.values),
    ("X_category", categories_clean),
    ("X_brand", brand_name_clean),
    ("X_condition", item_condition_id),
    ("X_id", item_id),
    ("X_shipping", shipping),
    ("y_price", price),
]
for file_stem, array in arrays_to_save:
    np.save(output_folder + file_stem, array)