In [66]:
#https://www.kaggle.com/code/tunguz/wordbatch-ftrl-fm-lgb-lbl-0-42506

In [67]:
import gc
import time
import numpy as np
import pandas as pd

In [68]:
def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
         if len(x) > 1 and x not in stopwords])

def lowercase_text(text):
    return text.lower()

def wordCount(text):
    try:
        if text == 'No description yet':
            return 0
        else:
            text = text.lower()
            words = [w for w in text.split(" ")]
            return len(words)
    except: 
        return 0



# split category name into 3 parts
def split_cat(text):
    try: return text.split("/")
    except: return ("No Label", "No Label", "No Label")



# Filling missing values
def fill_missing_values(df):
    df.category_name.fillna(value="missing", inplace=True)
    df.brand_name.fillna(value="missing", inplace=True)
    df.item_description.fillna(value="missing", inplace=True)
    df.item_description.replace('No description yet',"missing", inplace=True)
    return df

def brandfinder(line, brand_set):
    brand = line[0]
    name = line[1]
    namesplit = name.split(' ')
    if brand == 'missing':
        for x in namesplit:
            if x in brand_set:
                return name
    if name in brand_set:
        return name
    return brand




In [69]:
train = pd.read_table('./data/raw/train.tsv', engine='c')
print(f"train len: {len(train)}")
test = pd.read_table('./data/raw/test.tsv', engine='c')
print(f"test len: {len(test)}")

train len: 1482535
test len: 693359


In [70]:
train = train.loc[train.price >= 1.0].reset_index(drop=True)
print(f"new length: {len(train)}")

new length: 1481661


In [71]:
train['desc_len'] = train['item_description'].apply(lambda x: wordCount(x))
train['name_len'] = train['name'].apply(lambda x: wordCount(x))


train['item_condition_id'] = train['item_condition_id'].astype('str')


train_brands = set(train['brand_name'].values)
train['brand_name'] = train[['brand_name','name']].apply(brandfinder, axis = 1, brand_set = train_brands)
train['subcategory_1'], train['subcategory_2'], train['subcategory_3'] = zip(*train['category_name'].apply(lambda x: split_cat(x)))
train = fill_missing_values(train)
train["subcategory_1"] = train["subcategory_1"].apply(lowercase_text)
train["subcategory_2"] = train["subcategory_2"].apply(lowercase_text)
train["subcategory_3"] = train["subcategory_3"].apply(lowercase_text)
train["brand_name"] = train["brand_name"].apply(lowercase_text)
train["item_description"] = train["item_description"].apply(lowercase_text)
train["name"] = train["name"].apply(lowercase_text)
train["is_brand_missing"] = np.where(train["brand_name"] == "missing", 1, 0)
train["is_item_description_missing"] = np.where(train["item_description"] == "missing", 1, 0)

########################################################

test['desc_len'] = test['item_description'].apply(lambda x: wordCount(x))
test['name_len'] = test['name'].apply(lambda x: wordCount(x))

test['item_condition_id'] = test['item_condition_id'].astype('str')


test_brands = set(test['brand_name'].values)
test['brand_name'] = test[['brand_name','name']].apply(brandfinder, axis = 1, brand_set = test_brands)
test['subcategory_1'], test['subcategory_2'], test['subcategory_3'] = zip(*test['category_name'].apply(lambda x: split_cat(x)))
test = fill_missing_values(test)
test["subcategory_1"] = test["subcategory_1"].apply(lowercase_text)
test["subcategory_2"] = test["subcategory_2"].apply(lowercase_text)
test["subcategory_3"] = test["subcategory_3"].apply(lowercase_text)

test["name"] = test["name"].apply(lowercase_text)
test["category_name"] = test["category_name"].apply(lowercase_text)
test["brand_name"] = test["brand_name"].apply(lowercase_text)
test["item_description"] = test["item_description"].apply(lowercase_text)

test["is_brand_missing"] = np.where(test["brand_name"] == "missing", 1, 0)
test["is_item_description_missing"] = np.where(test["item_description"] == "missing", 1, 0)

In [72]:
train.head()

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,desc_len,name_len,subcategory_1,subcategory_2,subcategory_3,is_brand_missing,is_item_description_missing
0,0,mlb cincinnati reds t shirt size xl,3,Men/Tops/T-shirts,missing,10.0,1,missing,0,7,men,tops,t-shirts,1,1
1,1,razer blackwidow chroma keyboard,3,Electronics/Computers & Tablets/Components & P...,razer,52.0,0,this keyboard is in great condition and works ...,36,4,electronics,computers & tablets,components & parts,0,0
2,2,ava-viv blouse,1,Women/Tops & Blouses/Blouse,target,10.0,1,adorable top with a hint of lace and a key hol...,29,2,women,tops & blouses,blouse,0,0
3,3,leather horse statues,1,Home/Home Décor/Home Décor Accents,missing,35.0,1,new with tags. leather horses. retail for [rm]...,32,3,home,home décor,home décor accents,1,0
4,4,24k gold plated rose,1,Women/Jewelry/Necklaces,missing,44.0,0,complete with certificate of authenticity,5,4,women,jewelry,necklaces,1,0


In [73]:
test.head()

Unnamed: 0,test_id,name,item_condition_id,category_name,brand_name,shipping,item_description,desc_len,name_len,subcategory_1,subcategory_2,subcategory_3,is_brand_missing,is_item_description_missing
0,0,"breast cancer ""i fight like a girl"" ring",1,women/jewelry/rings,missing,1,size 7,2,8,women,jewelry,rings,1,0
1,1,"25 pcs new 7.5""x12"" kraft bubble mailers",1,other/office supplies/shipping supplies,missing,1,"25 pcs new 7.5""x12"" kraft bubble mailers lined...",38,7,other,office supplies,shipping supplies,1,0
2,2,coach bag,1,vintage & collectibles/bags and purses/handbag,coach,1,brand new coach bag. bought for [rm] at a coac...,11,2,vintage & collectibles,bags and purses,handbag,0,0
3,3,floral kimono,2,women/sweaters/cardigan,missing,0,-floral kimono -never worn -lightweight and pe...,10,2,women,sweaters,cardigan,1,0
4,4,life after death,3,other/books/religion & spirituality,missing,1,rediscovering life after the loss of a loved o...,29,3,other,books,religion & spirituality,1,0


In [74]:
#unique train items 
train.name.nunique()/len(train)

0.7685185747617033

In [75]:
#unique train items 
test.name.nunique()/len(train)

0.38157513763269735

In [76]:
unique_test_names = set(test.name.unique()) - set(train.name.unique())

In [77]:
len(unique_test_names)

478512

In [81]:
train.to_parquet("./data/processed/train_processed.parquet")
test.to_parquet("./data/processed/test_processed.parquet")