In [89]:
import time
start_time = time.time()

import numpy as np
import pandas as pd
import math as mt

from nltk.stem.porter import *
stemmer = PorterStemmer()

# load csv data
df_train = pd.read_csv('./input/train_subset.csv', encoding="ISO-8859-1")

In [95]:
#Preprocessing
from nltk.corpus import stopwords

def preprocess(row):
    searchterm = row["search_term"]
    return len(searchterm)

def removeStop(row):
    phraseArray = row["search_term"].split()
    stops = set(stopwords.words("english"))
    nonstopwords = [w for w in phraseArray if not w in stops]
    return len(nonstopwords)

def countWords(row):
    phaseArray = row["search_term"].split()
    return len(phaseArray)

df_train['char_count'] = df_train.apply(preprocess, axis=1)
df_train['no_stopwords'] = df_train.apply(removeStop, axis=1)
df_train['word_count'] = df_train.apply(countWords, axis=1)

print('All words')
print(df_train[['relevance', 'char_count']].corr(method='pearson'))
print('\nNo stopwords')
print(df_train[['relevance', 'no_stopwords']].corr(method='pearson'))
print('\nWord counts')
print(df_train[['relevance', 'word_count']].corr(method='pearson'))

#Strongest relationship is without stopwords

All words
            relevance  char_count
relevance    1.000000   -0.001533
char_count  -0.001533    1.000000

No stopwords
              no_stopwords  relevance
no_stopwords      1.000000  -0.032249
relevance        -0.032249   1.000000

Word counts
            relevance  word_count
relevance    1.000000   -0.004733
word_count  -0.004733    1.000000


In [101]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

shingle_size = 3
def cleanAndShingle(s, k = shingle_size):
    #remove stop words
    s = s.lower()
    #print(s)
    phraseArray = s.split()
    stops = set(stopwords.words("english"))
    stops = set([])
    #remove stopwords
    nonstopwords = [w for w in phraseArray if not w in stops]
    #remove meassure units
    units = set(["in.", "ft.", "&amp;"])
    nounits = [word for word in nonstopwords if not word in units]
    s = " ".join([stemmer.stem(word) for word in nounits])
    s = re.sub('[*]', 'x', s)
    s = re.sub('[by]', 'x', s)
    return [s[i:i + k] for i in range(len(s) - k + 1)]

In [102]:
df_train['search_shingles'] = df_train['search_term'].map(cleanAndShingle)
df_train['title_shingles'] = df_train['product_title'].map(cleanAndShingle)

In [103]:
def jaccard(row):
    search_shingles = set(row["search_shingles"])
    title_shingles = set(row["title_shingles"])
    
    union = search_shingles | title_shingles
    intersection = search_shingles & title_shingles
    
    return (len(intersection)/len(union))

df_train['jaccard'] = df_train.apply(jaccard, axis=1)
df_train[["search_shingles", "title_shingles", "jaccard"]]

Unnamed: 0,search_shingles,title_shingles,jaccard
0,"[ang, ngl, gl , l x, xr, xra, rac, ack, cke, ...","[sim, imp, mps, pso, son, on , n s, st, str, ...",0.055556
1,"[l x, xr, xra, rac, ack, cke, ket]","[sim, imp, mps, pso, son, on , n s, st, str, ...",0.000000
2,"[dec, eck, ck , k o, ov, ove, ver]","[xeh, ehr, hr , r p, pr, pre, rem, emi, miu, ...",0.027778
3,"[rai, ain, in , n s, sh, sho, how, owe, wer, ...","[del, elt, lta, ta , a v, ve, ver, ero, ro , ...",0.088608
4,"[sho, how, owe, wer, er , r o, on, onl, nli, ...","[del, elt, lta, ta , a v, ve, ver, ero, ro , ...",0.222222
5,"[con, onv, nve, vec, ect, ct , t o, ot, otr]","[whi, hir, irl, rlp, lpo, poo, ool, ol , l 1, ...",0.072289
6,"[mic, icr, cro, row, owa, wav, av , v o, ov, ...","[whi, hir, irl, rlp, lpo, poo, ool, ol , l 1, ...",0.142857
7,"[mic, icr, cro, row, owa, wav]","[whi, hir, irl, rlp, lpo, poo, ool, ol , l 1, ...",0.075000
8,"[eme, mer, erg, rg , g l, li, lig, igh, ght]","[lit, ith, tho, hon, oni, nia, ia , a l, li, ...",0.150943
9,"[mdf, df , f 3, 3/, 3/4]","[hou, ous, us , s o, of, of , f f, fa, far, ...",0.111111


In [104]:
print(df_train[['relevance', 'jaccard']].corr(method='pearson'))

           relevance   jaccard
relevance   1.000000  0.332853
jaccard     0.332853  1.000000


In [96]:
df_train

Unnamed: 0,id,product_uid,product_title,search_term,relevance,char_count,no_stopwords,word_count,search_shingles,title_shingles,jaccard
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.00,13,2,2,"[ang, ngl, gl , l b, br, bra, rac, ack, cke, ...","[sim, imp, mps, pso, son, on , n s, st, str, ...",0.055556
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.50,9,2,2,"[l b, br, bra, rac, ack, cke, ket]","[sim, imp, mps, pso, son, on , n s, st, str, ...",0.000000
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.00,9,1,2,"[dec, eck, ck , k o, ov, ove, ver]","[beh, ehr, hr , r p, pr, pre, rem, emi, miu, ...",0.027778
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,16,3,3,"[rai, ain, in , n s, sh, sho, how, owe, wer, ...","[del, elt, lta, ta , a v, ve, ver, ero, ro , ...",0.088608
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,18,2,3,"[sho, how, owe, wer, er , r o, on, onl, nli, ...","[del, elt, lta, ta , a v, ve, ver, ero, ro , ...",0.222222
5,18,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,convection otr,3.00,14,2,2,"[con, onv, nve, vec, ect, ct , t o, ot, otr]","[whi, hir, irl, rlp, lpo, poo, ool, ol , l 1, ...",0.072289
6,20,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwave over stove,2.67,20,2,3,"[mic, icr, cro, row, owa, wav, av , v o, ov, ...","[whi, hir, irl, rlp, lpo, poo, ool, ol , l 1, ...",0.142857
7,21,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwaves,3.00,10,1,1,"[mic, icr, cro, row, owa, wav]","[whi, hir, irl, rlp, lpo, poo, ool, ol , l 1, ...",0.075000
8,23,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,emergency light,2.67,15,2,2,"[eme, mer, erg, rg , g l, li, lig, igh, ght]","[lit, ith, tho, hon, oni, nia, ia , a l, li, ...",0.150943
9,27,100009,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,mdf 3/4,3.00,7,2,2,"[mdf, df , f 3, 3/, 3/4]","[hou, ous, us , s o, of, of , f f, fa, far, ...",0.111111
