# Product Matching by Robby Jeffries
## Amazon and Sam's Club

This notebook will identify which products appear in both the Amazon and Sam's Club data sets.

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# set working directory; the default location can be overridden by setting the
# MSEA_DATA_DIR environment variable, so the notebook also runs on machines
# where the author's home directory does not exist
os.chdir(os.environ.get('MSEA_DATA_DIR', '/Users/robbyjeffries/MSEACapstone/Data'))

# load the raw product tables (paths are relative to the working directory):
# the Amazon file is a CSV, the Sam's Club file is an Excel sheet whose first
# column is used as the index
amz_raw = pd.read_csv('Products/products_with_first_review_date.csv')
sams_raw = pd.read_excel('Products/Products_Vendor_Start_Date.xlsx', index_col=0)

In [3]:
# preview the first rows of the Sam's Club table
sams_raw.head()

Unnamed: 0_level_0,Vendor_Name,UPC_Desc,Start Date
Primary_Desc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CG PORT CHARGER LRG,CONCEPT GREEN ENERGY SOLUTIONS,CHARGER,2014-07-01
48IN 720P LED TV,JSP LLC,48IN JVC TV,2015-10-14
"19"" MTS TV",SANYO MFG CORP,"19""REFURB TV",2014-11-28
BELKIN GRP CDY SHEER,BRIGHTSTAR US INC,PHONE CASE,2014-07-03
"TOSHIBA 27"" TV",TOSHIBA AMERICA INC,"TV 27"" PIP",2015-05-30


In [4]:
# preview the first rows of the Amazon table
amz_raw.head()

Unnamed: 0,title,firstReview
0,1000 Cd DVD Silver Aluminum Hard Case for Medi...,2014-01-01
1,Toshiba Satellite L670-17E Laptop Screen 17.3 ...,2014-01-01
2,ASUS GTX780TI-3GD5 Graphics Cards GTX780TI-3GD5,2014-01-01
3,12&quot; SATA 15-Pin Male to Female Power Exte...,2014-01-01
4,GreatShield LEAN Series Slim Bluetooth Keyboar...,2014-01-01


### Example from Towards Data Science
https://towardsdatascience.com/surprisingly-effective-way-to-name-matching-in-python-1a67328e670e

In [8]:
if False: # Change it to true if you haven't installed it
    !pip install cython
    !pip install git+https://github.com/ing-bank/sparse_dot_topn.git

In [9]:
#  Importing libraries and module and some setting for notebook
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct  # Leading Juice for us
import time
# show full cell contents when displaying frames; None disables truncation
# (passing -1 is deprecated and triggers a warning)
pd.set_option('display.max_colwidth', None)

# reading dataset as df (path is relative to the current working directory)
df = pd.read_csv('Products/products_with_first_review_date.csv')

# printing first five rows
df.head(5)

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,title,firstReview
0,1000 Cd DVD Silver Aluminum Hard Case for Media Storage Holder w/Hanger Sleeves,2014-01-01
1,Toshiba Satellite L670-17E Laptop Screen 17.3 LED BOTTOM LEFT WXGA++ 1600x900,2014-01-01
2,ASUS GTX780TI-3GD5 Graphics Cards GTX780TI-3GD5,2014-01-01
3,12&quot; SATA 15-Pin Male to Female Power Extension Cable,2014-01-01
4,GreatShield LEAN Series Slim Bluetooth Keyboard Leather Case with Sleep / Wake Cover for Google Nexus 7 FHD 2nd Gen (2013) - Black/Blue,2014-01-01


In [11]:
def ngrams(string, n=3):
    """Return the overlapping character n-grams of ``string`` as a list.

    Before slicing, every ``,``, ``-``, ``.`` and ``/`` is removed, as is any
    whitespace character immediately followed by ``BD``.  A cleaned string
    shorter than ``n`` yields an empty list.

    Parameters
    ----------
    string : str
        Text to split.
    n : int, default 3
        Length of each n-gram.

    Returns
    -------
    list of str
        The n-grams in left-to-right order.
    """
    cleaned = re.sub(r'[,-./]|\sBD',r'', string)
    # n copies of the text, each starting one character later; zipping them
    # lines up the characters of every window and stops at the shortest copy
    shifted = (cleaned[offset:] for offset in range(n))
    return list(map(''.join, zip(*shifted)))

# Sanity check: show the 3-grams produced for a sample word
print('All 3-grams in "Deluxroom":')
ngrams('Deluxroom')

All 3-grams in "Deluxroom":


['Del', 'elu', 'lux', 'uxr', 'xro', 'roo', 'oom']

In [12]:
# the product titles to be compared with each other
products = df['title']
# use the custom ngrams() function as the analyzer, so the TF-IDF features are
# character n-grams (3 characters by default) of each title
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
# one row per title, one column per distinct n-gram
tf_idf_matrix = vectorizer.fit_transform(products)

In [13]:
# inspect the stored TF-IDF entries of the first title
print(tf_idf_matrix[0])

  (0, 112899)	0.16136184017158206
  (0, 89806)	0.11716672116921079
  (0, 89011)	0.12539803554645057
  (0, 97904)	0.10793643099622584
  (0, 69347)	0.10887550257110412
  (0, 2436)	0.09460996511982211
  (0, 105404)	0.07229125110871673
  (0, 91737)	0.09204861623304697
  (0, 101220)	0.1148782024843828
  (0, 83004)	0.11882564321480643
  (0, 50266)	0.11135260940984015
  (0, 113594)	0.22331113023652624
  (0, 3789)	0.21155747597276103
  (0, 105436)	0.10403715095580454
  (0, 86987)	0.0909779238603611
  (0, 97828)	0.1152127994440927
  (0, 103058)	0.10731524355908936
  (0, 50477)	0.12878919897155897
  (0, 1718)	0.09772693157311348
  (0, 87572)	0.10315007221648095
  (0, 91675)	0.09947818720534739
  (0, 82653)	0.11233701705999256
  (0, 106360)	0.13630874069923593
  (0, 103395)	0.1351365534087081
  (0, 110810)	0.08013756350381719
  :	:
  (0, 101746)	0.13981788332382855
  (0, 95057)	0.1281269689604296
  (0, 99562)	0.10586824572869268
  (0, 111936)	0.12714886351628873
  (0, 98463)	0.13257058821880338
 

In [14]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Compute a sparse product of ``A`` and ``B`` via ``ct.sparse_dot_topn``.

    Both operands are converted to CSR, output buffers with room for ``ntop``
    entries per row of ``A`` are pre-allocated, and ``ct.sparse_dot_topn``
    fills them.  The buffers are then wrapped in a CSR matrix.

    Parameters
    ----------
    A, B : sparse matrices
        Anything exposing ``tocsr()``.
    ntop : int
        Number of output slots reserved per row of ``A``.
    lower_bound : number, default 0
        Passed straight through to ``ct.sparse_dot_topn``
        (presumably the minimum value kept -- TODO confirm against the
        sparse_dot_topn documentation).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(A.shape[0], B.shape[1])``.
    """
    # make sure both operands are CSR (per the original note, this adds no
    # overhead when they already are)
    left = A.tocsr()
    right = B.tocsr()
    n_rows = left.shape[0]
    n_cols = right.shape[1]

    index_type = np.int32

    # upper bound on the number of stored results: ntop per row
    capacity = n_rows * ntop

    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    # the index arrays are passed as int32 to match the output buffers
    ct.sparse_dot_topn(
        n_rows, n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        ntop,
        lower_bound,
        out_indptr, out_indices, out_data)

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))


In [None]:
#  Top 10 with similarity above 0.8
# The TF-IDF matrix is multiplied by its own transpose (every title against
# every title) with ntop=10 and lower_bound=0.8; the elapsed wall-clock time
# in seconds is printed afterwards.
t1 = time.time()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.time()-t1
print("SELFTIMED:", t)

In [None]:
# unpacks the resulting sparse matrix
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Unpack a sparse similarity matrix into a DataFrame of matched pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Entry ``(i, j)`` holds the similarity between item ``i`` and item ``j``.
    name_vector : indexable
        Names looked up as ``name_vector[i]`` for each row/column index.
    top : int or None, default 100
        Maximum number of pairs to return, in the matrix's storage order.
        A falsy value (``None`` or ``0``) returns every non-zero pair.  Values
        larger than the number of available pairs are clamped instead of
        raising ``IndexError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (the column
        name's spelling is kept because callers select on it).
    """
    # Take rows, columns and values from the same COO view and drop explicitly
    # stored zeros from all three together.  This keeps each value aligned with
    # its coordinates (nonzero() skips stored zeros, the raw .data array does not).
    coo = sparse_matrix.tocoo()
    stored = coo.data != 0
    sparserows = coo.row[stored]
    sparsecols = coo.col[stored]
    values = coo.data[stored]

    available = sparsecols.size
    # never ask for more pairs than the matrix actually contains
    nr_matches = min(top, available) if top else available

    left_side = np.empty([nr_matches], dtype=object)
    right_side = np.empty([nr_matches], dtype=object)
    similairity = np.zeros(nr_matches)
    similairity[:] = values[:nr_matches]

    for index in range(nr_matches):
        left_side[index] = name_vector[sparserows[index]]
        right_side[index] = name_vector[sparsecols[index]]

    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})

In [None]:
# store the matches in a new dataframe called matches_df and
# display 10 random samples
# `products` holds the titles the TF-IDF matrix was fitted on, so it is the
# name vector to pass here (the previous `room_types` is not defined in this notebook)
matches_df = get_matches_df(matches, products, top=200)
matches_df = matches_df[matches_df['similairity'] < 0.99999] # For removing all exact matches
matches_df.sample(10)

In [None]:
# show the 10 remaining matches with the highest similarity, best first
matches_df.sort_values(['similairity'], ascending=False).head(10)