In [14]:
import time
start_time = time.time()

import numpy as np
import pandas as pd

from nltk.stem.porter import *
stemmer = PorterStemmer()
import re

import random

random.seed(230)

# load csv data
df_train = pd.read_csv('./input/train_subset.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('./input/test.csv', encoding="ISO-8859-1")
df_attributes = pd.read_csv('./input/attributes.csv')
num_train = df_train.shape[0]   # get the length of df_train
print(str(num_train) + " rows read from train.csv")


100 rows read from train.csv


In [15]:
# concat train and test data
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_attributes, how='left', on='product_uid')
num_all = df_all.shape[0]
print(str(num_all) + " rows read from test.csv + train.csv")

3458774 rows read from test.csv + train.csv


In [16]:
# FOR TESTING
df_all = df_train[0:5]

In [17]:
def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):  # this function not used in current example
    return sum(int(str2.find(word)>=0) for word in str1.split())


In [18]:
def shingles(s, k = 5):
    return [s[i:i + k] for i in range(len(s) - k + 1)]

In [19]:
# apply str_stemmer to all search terms - converts to lower case
#df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))

# obtain shingles to all search terms
df_all['shingles'] = df_all['search_term'].map(shingles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
df_all

Unnamed: 0,id,product_uid,product_title,search_term,relevance,shingles
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"[angle, ngle , gle b, le br, e bra, brac, bra..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"[l bra, brac, brack, racke, acket]"
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,"[deck , eck o, ck ov, k ove, over]"
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,"[rain , ain s, in sh, n sho, show, showe, how..."
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,"[showe, hower, ower , wer o, er on, r onl, on..."


In [21]:
# create characteristic matrix
def bitvector(i, shingles, charMatrix):
    for shingle in shingles:
        charMatrix.ix[shingle, i] = 1

In [23]:
charMatrix = pd.DataFrame(columns = df_all['id'])
df_all.apply(lambda x: bitvector(x['id'], x['shingles'], charMatrix), axis=1)
charMatrix.fillna(0, inplace= True)
countShingles = len(charMatrix)
countDocuments = len(charMatrix.columns)
charMatrix['index'] = range(0, countShingles)
charMatrix

id,2,3,9,16,17,index
angle,1,0,0,0,0,0
ngle,1,0,0,0,0,1
gle b,1,0,0,0,0,2
le br,1,0,0,0,0,3
e bra,1,0,0,0,0,4
brac,1,1,0,0,0,5
brack,1,1,0,0,0,6
racke,1,1,0,0,0,7
acket,1,1,0,0,0,8
l bra,0,1,0,0,0,9


In [25]:
numberHashFunctions = 100
hashFunctions = pd.DataFrame(np.random.randint(countShingles, size=(2, numberHashFunctions)))
hashFunctions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,15,12,2,35,22,28,11,17,28,2,...,3,27,19,4,25,16,9,35,0,30
1,29,3,21,6,10,8,1,20,16,34,...,2,10,18,36,0,8,10,3,31,0


In [26]:
def calculateHash(row, hashRow):
    return (row['index'] * hashRow[0] + hashRow[1]) % countShingles

def createHash(row, minhashTable):
    hashColumn = str(row[0])+ "x + "+ str(row[1]) + " mod " + str(countShingles)
    minhashTable[hashColumn] = minhashTable.apply(calculateHash, args=(row, ), axis=1)

hashFunctions.apply(createHash, args=(charMatrix,))
charMatrix
    

id,2,3,9,16,17,index,15x + 29 mod 38,12x + 3 mod 38,2x + 21 mod 38,35x + 6 mod 38,...,3x + 2 mod 38,27x + 10 mod 38,19x + 18 mod 38,4x + 36 mod 38,25x + 0 mod 38,16x + 8 mod 38,9x + 10 mod 38,35x + 3 mod 38,0x + 31 mod 38,30x + 0 mod 38
angle,1,0,0,0,0,0,29,3,21,6,...,2,10,18,36,0,8,10,3,31,0
ngle,1,0,0,0,0,1,6,15,23,3,...,5,37,37,2,25,24,19,0,31,30
gle b,1,0,0,0,0,2,21,27,25,0,...,8,26,18,6,12,2,28,35,31,22
le br,1,0,0,0,0,3,36,1,27,35,...,11,15,37,10,37,18,37,32,31,14
e bra,1,0,0,0,0,4,13,13,29,32,...,14,4,18,14,24,34,8,29,31,6
brac,1,1,0,0,0,5,28,25,31,29,...,17,31,37,18,11,12,17,26,31,36
brack,1,1,0,0,0,6,5,37,33,26,...,20,20,18,22,36,28,26,23,31,28
racke,1,1,0,0,0,7,20,11,35,23,...,23,9,37,26,23,6,35,20,31,20
acket,1,1,0,0,0,8,35,23,37,20,...,26,36,18,30,10,22,6,17,31,12
l bra,0,1,0,0,0,9,12,35,1,17,...,29,25,37,34,35,0,15,14,31,4


In [92]:
hashFunctionNames = charMatrix.columns.values[countDocuments+1:]
documentIndices = charMatrix.columns.values[0:countDocuments]
sigMatrix = pd.DataFrame(2*countShingles,index=hashFunctionNames, columns=documentIndices)
sigMatrix

Unnamed: 0,2,3,9,16,17
15x + 29 mod 38,76,76,76,76,76
12x + 3 mod 38,76,76,76,76,76
2x + 21 mod 38,76,76,76,76,76
35x + 6 mod 38,76,76,76,76,76
22x + 10 mod 38,76,76,76,76,76
28x + 8 mod 38,76,76,76,76,76
11x + 1 mod 38,76,76,76,76,76
17x + 20 mod 38,76,76,76,76,76
28x + 16 mod 38,76,76,76,76,76
2x + 34 mod 38,76,76,76,76,76


In [102]:
hashFunctions = charMatrix.ix[:,countDocuments+1:]
documents = charMatrix.ix[:,0:countDocuments]

def compareHashes(hashColumn, docMatch):
    #print(charMatrix[hashColumn.name][docMatch] < sigMatrix[docMatch][1])
    if charMatrix[hashColumn.name][docMatch] < sigMatrix[docMatch][1]:
        sigMatrix[docMatch][1] = charMatrix[hashColumn.name][docMatch]
        
def calculateSignatureMat(docRow, sigMatrix):
    matches = docRow[docRow == 1].index.tolist()
    for match in matches:
        sigMatrix.apply(compareHashes, args=(match,))
        print(sigMatrix[match])
        
documents.apply(calculateSignatureMat, args=(sigMatrix,), axis=1)
sigMatrix

Unnamed: 0,2,3,9,16,17
15x + 29 mod 38,76,76,76,76,76
12x + 3 mod 38,0,0,0,0,0
2x + 21 mod 38,76,76,76,76,76
35x + 6 mod 38,76,76,76,76,76
22x + 10 mod 38,76,76,76,76,76
28x + 8 mod 38,76,76,76,76,76
11x + 1 mod 38,76,76,76,76,76
17x + 20 mod 38,76,76,76,76,76
28x + 16 mod 38,76,76,76,76,76
2x + 34 mod 38,76,76,76,76,76
