In [10]:
from utils import *

In [11]:
# Load the raw tables. train/test are limited to their first five rows
# (nrows=5); product descriptions and attributes are read in full.
print('importing data')
df_train, df_test = [
    pd.read_csv(csv_path, nrows=5)
    for csv_path in ('../data/train.csv', '../data/test.csv')
]
df_pro_desc = pd.read_csv('../data/product_descriptions.csv')
df_attr = pd.read_csv('../data/attributes.csv')

importing data


In [12]:
# Inspect the column dtypes of the loaded training sample.
df_train.dtypes

id                 int64
product_uid        int64
product_title     object
search_term       object
relevance        float64
dtype: object

In [13]:
# Keep only the attribute rows named "MFG Brand Name" and expose their value
# as a `brand` column keyed by product_uid.
print('extracting brand attributes')
brand_rows = df_attr.name == "MFG Brand Name"
df_brand = (
    df_attr.loc[brand_rows, ["product_uid", "value"]]
    .rename(columns={"value": "brand"})
)



extracting brand attributes


In [14]:
# Stack the train rows followed by the test rows under a fresh 0..n-1 index,
# then left-join the description and brand tables on product_uid so that no
# train/test row is dropped when a product has no match.
print('merging predictor data')
df_all = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
    .merge(df_pro_desc, how='left', on='product_uid')
    .merge(df_brand, how='left', on='product_uid')
)

merging predictor data


In [15]:
# Ordered (pattern, replacement) pairs that rewrite "<number> <unit>" into a
# canonical "<number><abbr>. " token. Order matters: rules run top to bottom
# and later rules see the output of earlier ones.
# NOTE(review): the inch rule consumes a single quote first, so the `''`
# alternative of the foot rule can never fire on input like 5''. Left as-is
# because changing the quote mapping would alter existing features.
_UNIT_RULES = (
    (r"([0-9]+)( *)(inches|inch|in|')\.?", r"\1in. "),
    (r"([0-9]+)( *)(foot|feet|ft|'')\.?", r"\1ft. "),
    (r"([0-9]+)( *)(pounds|pound|lbs|lb)\.?", r"\1lb. "),
    (r"([0-9]+)( *)(square|sq) ?\.?(feet|foot|ft)\.?", r"\1sq.ft. "),
    (r"([0-9]+)( *)(gallons|gallon|gal)\.?", r"\1gal. "),
    (r"([0-9]+)( *)(ounces|ounce|oz)\.?", r"\1oz. "),
    (r"([0-9]+)( *)(centimeters|cm)\.?", r"\1cm. "),
    # Single rule for both spellings; two separate "mm" rules made the second
    # one re-match the first one's output and emit a doubled space.
    (r"([0-9]+)( *)(millimeters|milimeters|mm)\.?", r"\1mm. "),
    (r"([0-9]+)( *)(degrees|degree)\.?", r"\1deg. "),
    (r"([0-9]+)( *)(volts|volt)\.?", r"\1volt. "),
    (r"([0-9]+)( *)(watts|watt)\.?", r"\1watt. "),
    (r"([0-9]+)( *)(amperes|ampere|amps|amp)\.?", r"\1amp. "),
)

# Literal typo fixes, applied in order (the first feeds the following ones).
_TYPO_FIXES = (
    ("whirpool", "whirlpool"),
    ("whirlpoolga", "whirlpool"),
    ("whirlpoolstainless", "whirlpool stainless"),
)


def str_stem(s):
    """Normalise a text snippet and stem each space-separated token.

    Steps, in order:
      1. Split sentences glued together as ``word.Word`` (must happen before
         lower-casing, otherwise the capital letter is gone).
      2. Lower-case.
      3. Rewrite dimension separators (`` x ``, ``*``, `` by ``, ``3x4``)
         to a standalone ``xby`` token.
      4. Canonicalise number+unit expressions (see ``_UNIT_RULES``).
      5. Fix a few known brand typos.
      6. Collapse runs of spaces and stem every token with the module-level
         ``stemmer``.

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        The normalised, stemmed text. For any non-``str`` input the type and
        value are printed and the literal ``"null"`` is returned.
    """
    if not isinstance(s, str):
        # Single-argument print calls behave the same on Python 2 and 3.
        print(type(s))
        print(s)
        return "null"

    # Must precede lower(): the pattern keys on an upper-case letter.
    s = re.sub(r"(\w)\.([A-Z])", r"\1 \2", s)
    s = s.lower()

    s = s.replace(" x ", " xby ")
    s = s.replace("*", " xby ")
    s = s.replace(" by ", " xby ")  # trailing space keeps "xby" a separate token
    # "x<digit>" and "<digit>x" -> " xby " (replaces 20 literal replaces).
    s = re.sub(r"x([0-9])", r" xby \1", s)
    s = re.sub(r"([0-9])x", r"\1 xby ", s)

    for pattern, replacement in _UNIT_RULES:
        s = re.sub(pattern, replacement, s)

    for wrong, right in _TYPO_FIXES:
        s = s.replace(wrong, right)

    # Collapse runs of spaces (str.replace would treat " +" literally).
    s = re.sub(r" +", " ", s)
    return " ".join(stemmer.stem(token) for token in s.split(" "))

In [16]:
# Spot-check: a dimension string such as '6x6' should be split around an
# 'xby' token before stemming.
str_stem('6x6')

u'6 xbi 6'

In [17]:
# Spot-check str_stem on one real product title (the row labelled 2).
str_stem(df_all['product_title'][2])

u'behr premium textur deckov 1-gal. #sc-141 tugboat wood and concret coat'

In [18]:
# Preview the stemmed titles for every row; the result is only displayed,
# df_all itself is not modified here.
df_all['product_title'].map(str_stem)

0                       simpson strong-ti 12-gaug angl
1                       simpson strong-ti 12-gaug angl
2    behr premium textur deckov 1-gal. #sc-141 tugb...
3    delta vero 1-handl shower onli faucet trim kit...
4    delta vero 1-handl shower onli faucet trim kit...
5                       simpson strong-ti 12-gaug angl
6                       simpson strong-ti 12-gaug angl
7                       simpson strong-ti 12-gaug angl
8                       simpson strong-ti 12-gaug angl
9                       simpson strong-ti 12-gaug angl
Name: product_title, dtype: object

In [19]:
print('creating additional features')

# Normalise each free-text column in place. str() guarantees str_stem always
# receives a string (a missing value becomes the text "nan").
# NOTE: this overwrites the raw text, so re-running the cell stems text that
# has already been stemmed.
for text_col in ('search_term', 'product_title', 'product_description', 'brand'):
    df_all[text_col] = df_all[text_col].map(lambda x: str_stem(str(x)))

# Whitespace-delimited token counts of the normalised text.
for text_col, len_col in (
    ('search_term', 'len_of_query'),
    ('product_title', 'len_of_title'),
    ('product_description', 'len_of_description'),
    ('brand', 'len_of_brand'),
):
    df_all[len_col] = df_all[text_col].map(lambda x: len(str(x).split())).astype(np.int64)

search_terms = df_all['search_term']
titles = df_all['product_title']
descriptions = df_all['product_description']
brands = df_all['brand']

# The tab-joined helper columns are kept because they are part of the output
# frame, but the match features below read the source columns directly:
# splitting the joined string on "\t" would pick the wrong field whenever a
# text value itself contains a tab.
df_all['product_info'] = search_terms + "\t" + titles + "\t" + descriptions
df_all['query_in_title'] = [str_whole_word(q, t, 0) for q, t in zip(search_terms, titles)]
df_all['query_in_description'] = [str_whole_word(q, p, 0) for q, p in zip(search_terms, descriptions)]
df_all['word_in_title'] = [str_common_word(q, t) for q, t in zip(search_terms, titles)]
df_all['word_in_description'] = [str_common_word(q, p) for q, p in zip(search_terms, descriptions)]
df_all['ratio_title'] = df_all['word_in_title'] / df_all['len_of_query']
df_all['ratio_description'] = df_all['word_in_description'] / df_all['len_of_query']
df_all['attr'] = search_terms + "\t" + brands
df_all['word_in_brand'] = [str_common_word(q, b) for q, b in zip(search_terms, brands)]
df_all['ratio_brand'] = df_all['word_in_brand'] / df_all['len_of_brand']
df_all['search_term_feature'] = df_all['search_term'].map(lambda x: len(str(x)))

# Integer-encode brands as 1, 2, 3, ... in order of first appearance.
# NOTE(review): `df_brand` is rebound here from the brand DataFrame to an
# array of unique brand strings, so the earlier merge cell cannot be re-run
# after this one without re-running the brand-extraction cell first. The
# name is kept in case later cells rely on it.
df_brand = df_all['brand'].unique()
d = {brand: code for code, brand in enumerate(df_brand, start=1)}
df_all['brand_feature'] = df_all['brand'].map(d)

creating additional features


In [20]:
# Display the assembled feature frame. It is small here because train and
# test were each loaded with nrows=5.
df_all

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,brand,len_of_query,len_of_title,len_of_description,...,query_in_description,word_in_title,word_in_description,ratio_title,ratio_description,attr,word_in_brand,ratio_brand,search_term_feature,brand_feature
0,2,simpson strong-ti 12-gaug angl,100001,3.0,angl bracket,"not onli do angl make joint stronger, they als...",simpson strong-ti,2,4,126,...,0,1,1,0.5,0.5,angl bracket\tsimpson strong-ti,0,0.0,12,1
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"not onli do angl make joint stronger, they als...",simpson strong-ti,2,4,126,...,0,1,1,0.5,0.5,l bracket\tsimpson strong-ti,0,0.0,9,1
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck over,behr premium textur deckov is an innov solid c...,behr premium textur deckov,2,11,167,...,0,1,1,0.5,0.5,deck over\tbehr premium textur deckov,1,0.25,9,2
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat your bathroom with the delta vero single...,delta,3,13,103,...,0,1,1,0.333333,0.333333,rain shower head\tdelta,0,0.0,16,3
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat your bathroom with the delta vero single...,delta,3,13,103,...,0,3,2,1.0,0.666667,shower onli faucet\tdelta,0,0.0,18,3
5,1,simpson strong-ti 12-gaug angl,100001,,90deg. bracket,"not onli do angl make joint stronger, they als...",simpson strong-ti,2,4,126,...,0,0,0,0.0,0.0,90deg. bracket\tsimpson strong-ti,0,0.0,15,1
6,4,simpson strong-ti 12-gaug angl,100001,,metal l bracket,"not onli do angl make joint stronger, they als...",simpson strong-ti,3,4,126,...,0,1,1,0.333333,0.333333,metal l bracket\tsimpson strong-ti,0,0.0,15,1
7,5,simpson strong-ti 12-gaug angl,100001,,simpson sku abl,"not onli do angl make joint stronger, they als...",simpson strong-ti,3,4,126,...,0,1,1,0.333333,0.333333,simpson sku abl\tsimpson strong-ti,1,0.5,15,1
8,6,simpson strong-ti 12-gaug angl,100001,,simpson strong tie,"not onli do angl make joint stronger, they als...",simpson strong-ti,3,4,126,...,0,2,2,0.666667,0.666667,simpson strong tie\tsimpson strong-ti,2,1.0,19,1
9,7,simpson strong-ti 12-gaug angl,100001,,simpson strong tie hcc668,"not onli do angl make joint stronger, they als...",simpson strong-ti,4,4,126,...,0,2,2,0.5,0.5,simpson strong tie hcc668\tsimpson strong-ti,2,1.0,25,1


In [4]:
from utils import *

In [6]:
# Spot-check the check_typos helper (imported via `from utils import *`)
# on a sample query.
check_typos('storage shelve')

'storage shelf'