In [55]:
import time
start_time = time.time()

import numpy as np
import pandas as pd

from nltk.stem.porter import *
stemmer = PorterStemmer()
import re

import random

random.seed(230)

# load csv data
df_train = pd.read_csv('./input/train_subset.csv', encoding="ISO-8859-1")
df_test = pd.read_csv('./input/test.csv', encoding="ISO-8859-1")
df_attributes = pd.read_csv('./input/attributes.csv')
num_train = df_train.shape[0]   # get the length of df_train
print(str(num_train) + " rows read from train.csv")


100 rows read from train.csv


In [56]:
# concat train and test data
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_attributes, how='left', on='product_uid')
num_all = df_all.shape[0]
print(str(num_all) + " rows read from test.csv + train.csv")

3458774 rows read from test.csv + train.csv


In [91]:
# FOR TESTING
df_all = df_train[0:5]

In [92]:
def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

def str_common_word(str1, str2):  # this function not used in current example
    return sum(int(str2.find(word)>=0) for word in str1.split())


In [93]:
def shingles(s, k = 5):
    return [s[i:i + k] for i in range(len(s) - k + 1)]

In [96]:
# apply str_stemmer to all search terms - converts to lower case
#df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))

# obtain shingles to all search terms
df_all['shingles'] = df_all['search_term'].map(shingles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [97]:
df_all

Unnamed: 0,id,product_uid,product_title,search_term,relevance,shingles
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"[angle, ngle , gle b, le br, e bra, brac, bra..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"[l bra, brac, brack, racke, acket]"
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,"[deck , eck o, ck ov, k ove, over]"
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,"[rain , ain s, in sh, n sho, show, showe, how..."
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,"[showe, hower, ower , wer o, er on, r onl, on..."


In [98]:
# create characteristic matrix
def bitvector(i, shingles, charMatrix):
    for shingle in shingles:
        charMatrix.ix[shingle, i] = 1

In [119]:
charMatrix = pd.DataFrame(columns = df_all['id'])
df_all.apply(lambda x: bitvector(x['id'], x['shingles'], charMatrix), axis=1)
charMatrix.fillna(0, inplace= True)
countShingles = len(charMatrix)
countDocuments = len(charMatrix.columns)
charMatrix['index'] = range(0, count)
charMatrix

id,2,3,9,16,17,index
angle,1,0,0,0,0,0
ngle,1,0,0,0,0,1
gle b,1,0,0,0,0,2
le br,1,0,0,0,0,3
e bra,1,0,0,0,0,4
brac,1,1,0,0,0,5
brack,1,1,0,0,0,6
racke,1,1,0,0,0,7
acket,1,1,0,0,0,8
l bra,0,1,0,0,0,9


In [120]:
numberHashFunctions = 100
hashFunctions = pd.DataFrame(np.random.randint(count, size=(2, numberHashFunctions)))
hashFunctions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,6,8,10,15,29,10,14,14,25,23,...,35,20,6,19,7,8,25,18,29,37
1,19,10,4,2,15,37,25,22,2,22,...,12,15,17,6,8,30,34,34,7,9


In [121]:
def calculateHash(row, hashRow):
    return (row['index'] * hashRow[0] + hashRow[1]) % countShingles

def createHash(row, minhashTable):
    hashColumn = str(row[0])+ "x + "+ str(row[1]) + " mod " + str(countShingles)
    minhashTable[hashColumn] = minhashTable.apply(calculateHash, args=(row, ), axis=1)

hashFunctions.apply(createHash, args=(charMatrix,))
charMatrix
    

id,2,3,9,16,17,index,6x + 19 mod 38,8x + 10 mod 38,10x + 4 mod 38,15x + 2 mod 38,...,35x + 12 mod 38,20x + 15 mod 38,6x + 17 mod 38,19x + 6 mod 38,7x + 8 mod 38,8x + 30 mod 38,25x + 34 mod 38,18x + 34 mod 38,29x + 7 mod 38,37x + 9 mod 38
angle,1,0,0,0,0,0,19,10,4,2,...,12,15,17,6,8,30,34,34,7,9
ngle,1,0,0,0,0,1,25,18,14,17,...,9,35,23,25,15,0,21,14,36,8
gle b,1,0,0,0,0,2,31,26,24,32,...,6,17,29,6,22,8,8,32,27,7
le br,1,0,0,0,0,3,37,34,34,9,...,3,37,35,25,29,16,33,12,18,6
e bra,1,0,0,0,0,4,5,4,6,24,...,0,19,3,6,36,24,20,30,9,5
brac,1,1,0,0,0,5,11,12,16,1,...,35,1,9,25,5,32,7,10,0,4
brack,1,1,0,0,0,6,17,20,26,16,...,32,21,15,6,12,2,32,28,29,3
racke,1,1,0,0,0,7,23,28,36,31,...,29,3,21,25,19,10,19,8,20,2
acket,1,1,0,0,0,8,29,36,8,8,...,26,23,27,6,26,18,6,26,11,1
l bra,0,1,0,0,0,9,35,6,18,23,...,23,5,33,25,33,26,31,6,2,0


In [135]:
hashFunctionNames = charMatrix.columns.values[countDocuments+1:]
documentIndices = charMatrix.columns.values[0:countDocuments]
sigMatrix = pd.DataFrame(2*count,index=hashFunctionNames, columns=documentIndices)
sigMatrix

Unnamed: 0,2,3,9,16,17
6x + 19 mod 38,76,76,76,76,76
8x + 10 mod 38,76,76,76,76,76
10x + 4 mod 38,76,76,76,76,76
15x + 2 mod 38,76,76,76,76,76
29x + 15 mod 38,76,76,76,76,76
10x + 37 mod 38,76,76,76,76,76
14x + 25 mod 38,76,76,76,76,76
14x + 22 mod 38,76,76,76,76,76
25x + 2 mod 38,76,76,76,76,76
23x + 22 mod 38,76,76,76,76,76


In [140]:
hashFunctions = charMatrix.ix[:,countDocuments+1:]
documents = charMatrix.ix[:,0:countDocuments]

def compareHashes(hashColumn, docMatch):
    charMatrixVal = charMatrix[hashColumn.name][docMatch]
    print(charMatrixVal)
    return 0;

def calculateSignatureMat(docRow, sigMatrix):
    matches = docRow[docRow == 1].index.tolist()
    for match in matches:
        sigMatrix[match] = sigMatrix.apply(compareHashes, args=(match,))

#documents.apply(calculateSignatureMat, args=(sigMatrix,), axis=1)


id,6x + 19 mod 38,8x + 10 mod 38,10x + 4 mod 38,15x + 2 mod 38,29x + 15 mod 38,10x + 37 mod 38,14x + 25 mod 38,14x + 22 mod 38,25x + 2 mod 38,23x + 22 mod 38,...,35x + 12 mod 38,20x + 15 mod 38,6x + 17 mod 38,19x + 6 mod 38,7x + 8 mod 38,8x + 30 mod 38,25x + 34 mod 38,18x + 34 mod 38,29x + 7 mod 38,37x + 9 mod 38
angle,19,10,4,2,15,37,25,22,2,22,...,12,15,17,6,8,30,34,34,7,9
ngle,25,18,14,17,6,9,1,36,27,7,...,9,35,23,25,15,0,21,14,36,8
gle b,31,26,24,32,35,19,15,12,14,30,...,6,17,29,6,22,8,8,32,27,7
le br,37,34,34,9,26,29,29,26,1,15,...,3,37,35,25,29,16,33,12,18,6
e bra,5,4,6,24,17,1,5,2,26,0,...,0,19,3,6,36,24,20,30,9,5
brac,11,12,16,1,8,11,19,16,13,23,...,35,1,9,25,5,32,7,10,0,4
brack,17,20,26,16,37,21,33,30,0,8,...,32,21,15,6,12,2,32,28,29,3
racke,23,28,36,31,28,31,9,6,25,31,...,29,3,21,25,19,10,19,8,20,2
acket,29,36,8,8,19,3,23,20,12,16,...,26,23,27,6,26,18,6,26,11,1
l bra,35,6,18,23,10,13,37,34,37,1,...,23,5,33,25,33,26,31,6,2,0
