In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
import difflib

import re, math
from collections import Counter

# from tqdm.auto import tqdm
# tqdm.pandas()

In [2]:
WORD = re.compile(r'\w+')

In [3]:
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

def get_cosine_similarity_string(string_, list_strings):
    
    subset_strings = difflib.get_close_matches(string_, list_strings, n=5, cutoff=0.5)
    cosine_scores = []
    vector1 = text_to_vector(string_)
    
    for text in subset_strings:    
        vector2 = text_to_vector(text)
        cosine_scores.append(get_cosine(vector1, vector2))
#         string_diff.append(difflib.get_close_matches(vector1, vector2))
    temp_df = pd.DataFrame({'input_string':string_, 'possible_string':subset_strings, 'cosine_score':cosine_scores}).sort_values('cosine_score', ascending=False)
    return temp_df.iloc[:1].to_dict(orient='records')

In [4]:
data_competitor = pd.read_csv('Amazon_Fragrance.csv') # Competitor Data

In [5]:
data_competitor.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,catalog_code,catalog_sku,catalog_sku_group,brand,title,category,model,color,...,gender,misc1,misc2,image_link,extra,product_rating,num_product_reviews,product_url,status,catalog_sku_merged
0,65476,65476,AMAZON,B07F929XMQ,,nike,"Nike Women's Power Gym Flutter Print Tights, W...",All Perfumes,,,...,Women,"{'Material Composition': '78', 'Brand Name': '...",,https://images-na.ssl-images-amazon.com/images...,,,,https://www.amazon.ae/random-string-no-prod/dp...,,
1,57518,57518,AMAZON,B00B406WDY,,gucci,Gucci Guilty Black Eau de Toilette Spray for H...,All Perfumes,,,...,,{},,https://images-na.ssl-images-amazon.com/images...,,,,https://www.amazon.ae/random-string-no-prod/dp...,,


In [5]:
data_competitor['title'] = data_competitor['title'].str.lower().apply(lambda x : re.sub(r' ml','ml',x))

In [6]:
data_competitor['title'] = data_competitor['title'].apply(lambda x: re.sub(r'EDP|EDT|', "", x, flags=re.IGNORECASE))

In [7]:
data_competitor['title'] = data_competitor['title'].apply(lambda x: re.sub(r'_|-', " ", x))

In [8]:
data_competitor[['title']].head()

Unnamed: 0,title
0,"nike women's power gym flutter print tights, w..."
1,gucci guilty black eau de toilette spray for h...
2,"aramis havana for men, 3.4 oz spray (gentlema..."
3,la petite robe noire eau fraiche by guerlain f...
4,blue lady 2 perfume for women by rasasi 35ml


In [9]:
competitor_title = data_competitor['title']
print(competitor_title[5:15]) # amazon sample rows

5                black aoud by montale  100ml  alish s 
6             cacharel lou lou for women, 1.7 oz  spray
7                 cacharel noa for women, 3.4 oz  spray
8           davidoff cool water for men, 1.35 oz  spray
9                gres cabotine for women, 1.7 oz  spray
10    guerlain samsara eau de toilette for women   50ml
11           guerlain mitsouko for women, 1.7 oz  spray
12         lacoste style in play for men, 2.5 oz  spray
13     poeme by lancome for women   eau de parfum, 50ml
14    ysatis by givenchy for women. eau de toilette ...
Name: title, dtype: object


In [10]:
data_client = pd.read_csv("Noon_Fragrance.csv") # Client Data

In [11]:
data_client['title_en'] = data_client['title_en'].apply(lambda x : re.sub("_|-", " ",x))

In [12]:
data_client['product_subtype'] = data_client['product_subtype'].apply(lambda x : re.sub("_|-", " ",x))

In [13]:
temp = pd.DataFrame([eval(data_client['attributes'][0])])
for i in data_client['attributes'][1:]:
    temp = pd.concat([temp, pd.DataFrame([eval(i)])], axis=0)
temp.reset_index(inplace=True,drop=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
temp['fragrance_size'] = temp['fragrance_size'].apply(lambda x: "ml ".join(str(x).replace(' ',"").split("-")))

In [16]:
data_client  = pd.concat([data_client ,temp[['fragrance_size']]], axis=1)

In [17]:
data_client['title_en'] = data_client['title_en'].apply(lambda x: re.sub(r'EDP|EDT',"", x, flags=re.IGNORECASE))

In [29]:
data_client[['brand_code', 'title_en', 'product_subtype','department','fragrance_size']].tail()

Unnamed: 0,brand_code,title_en,product_subtype,department,fragrance_size
723,moschino,Pink Fresh Couture,eau de toilette,Women,100ml 119ml
724,chanel,Gabrielle,eau de parfum,Women,100ml 119ml
725,bvlgari,Goldea The Roman Night,eau de parfum,Women,70ml 99ml
726,bvlgari,Bvlgari,eau de toilette,Men,100ml 119ml
727,aigner,White for Men,eau de toilette,Men,120ml 149ml


In [19]:
data_client['client_title'] = data_client[['brand_code', 'title_en', 'product_subtype','department', 'fragrance_size']].apply(lambda x : " ".join(x.str.lower()), axis=1)

In [20]:
client_title = data_client.client_title.str.lower()
print(client_title[:5]) # Client sample rows

0    mancera cedrat boise  eau de parfum unisex 120...
1    mancera aoud line  eau de parfum unisex 120ml ...
2    mancera wild leather  eau de parfum unisex 120...
3    mancera wild rose aoud  eau de parfum unisex 1...
4    geoffrey_beene grey flannel for men  eau de to...
Name: client_title, dtype: object


*__Similarity Measuring__*

In [16]:
# results_df = get_cosine_similarity_string(str1, competitor_title) # Get possible matches testing functions

In [21]:
%%time
test_results = data_client['client_title'].apply(lambda x: get_cosine_similarity_string(x, competitor_title))

Wall time: 3min 33s


In [280]:
temp = pd.concat([test_results, data_client[['brand_code','title_en']]],axis=1)
tempx = temp[temp.astype(str)['client_title'] != '[]']
tempx.reset_index(inplace=True, drop=True)

In [287]:
%%time
temp = pd.DataFrame(tempx.client_title[0])
i == 1
for i in tempx.client_title[1:]:
    temp = pd.concat([temp , pd.DataFrame(i)], axis=0)

temp.reset_index(inplace=True,drop=True)

Wall time: 684 ms


In [288]:
temp.reset_index(drop=True, inplace=True)

In [289]:
temp = pd.concat( [temp, tempx[['brand_code','title_en']]], axis=1)

In [290]:
temp.sort_values('cosine_score', ascending=False).head()

Unnamed: 0,cosine_score,input_string,possible_string,brand_code,title_en
230,0.909091,gucci flora by gorgeous gardenia eau de toile...,flora gorgeous gardenia by gucci for women e...,gucci,Flora By Gorgeous Gardenia
131,0.904534,burberry my burberry eau de parfum women 50ml...,burberry my burberry eau de parfum 50ml,burberry,My Burberry
593,0.897085,lalique perles de lalique eau de parfum women...,perles de lalique by lalique for women eau d...,lalique,Perles de Lalique
367,0.888889,tiverton fly free eau de parfum men 100ml 119ml,tiverton fly free for men 100ml eau de parfum,tiverton,Fly Free
155,0.888889,adidas team force eau de toilette men 100ml 1...,"adidas team force for men eau de toilette, 10...",adidas,Team Force


In [291]:
temp.sort_values('cosine_score', ascending=False).tail()

Unnamed: 0,cosine_score,input_string,possible_string,brand_code,title_en
546,0.158114,almas dalmoon bakhour oud incense unisex 50ml ...,montale aoud ambre unisex 100ml,almas,Dalmoon Bakhour
544,0.149071,almas ghalaty oud oud incense unisex nan,montale original aouds unisex 100ml,almas,Ghalaty Oud
545,0.13484,almas bakhoor ashek al oud oud incense unisex ...,montale original aouds unisex 100ml,almas,Bakhoor Ashek Al Oud
446,0.117851,victoria_s_secret pure seduction body mist bod...,coral escape seduction for unisex 100ml,victoria_s_secret,Pure Seduction Body Mist
547,0.11547,almas oud al.oud bakhour oud incense unisex un...,montale aoud ambre unisex 100ml,almas,Oud Al.Oud Bakhour


In [294]:
temp['brand_partial_match'] = temp[['brand_code','possible_string']].apply(lambda x: fuzz.partial_ratio(x[0], x[1]),axis=1)
temp['title_partial_match'] = temp[['title_en','possible_string']].apply(lambda x: fuzz.partial_ratio(x[0], x[1]),axis=1)

In [23]:
temp['token_set_ratio'] = temp[['input_string','possible_string']].apply(lambda x: fuzz.token_set_ratio(x[0],x[1]),axis=1)

HBox(children=(IntProgress(value=0, max=670), HTML(value='')))




In [415]:
# Write the results into df
# pd.concat([data_client, temp[['cosine_score','possible_string']]], axis=1).to_csv('Noon_Fragances_MatchScore.csv')

In [8]:
i = 588
# text1 = temp.input_string[i]
# text2 = temp.possible_string[i]

text2 = 'Alienware 25 Gaming monitor AW2518H'
text1 = 'Alienware 25 monitor AW2518H'

vector1 = text_to_vector(text1)
vector2 = text_to_vector(text2)

print('cosine_score', get_cosine(vector1,vector2))
print('partial_ratio',fuzz.partial_ratio(text1, text2))
print('token_set_ratio',fuzz.token_set_ratio(text1, text2))
print('token_sort_ratio',fuzz.token_sort_ratio(text1, text2))

cosine_score 0.8944271909999159
partial_ratio 75
token_set_ratio 100
token_sort_ratio 89


In [62]:
print(vector1,'\n',vector2)

Counter({'lalique': 2, 'de': 2, 'perles': 1, 'eau': 1, 'parfum': 1, 'women': 1, '100ml': 1, '119ml': 1}) 
 Counter({'de': 2, 'lalique': 2, 'perles': 1, 'by': 1, 'for': 1, 'women': 1, 'eau': 1, 'parfum': 1, '100ml': 1})


In [173]:
temp.to_csv('Noon_FragancesMatchScore_4.csv',index=False)

In [430]:
data_client.to_csv('Raw data used for mappin.csv',index=False)

In [167]:
temp = pd.DataFrame(test_results[0])
i == 1
for idx, i in enumerate(test_results[1:]):
    temp = pd.concat([temp , pd.DataFrame(i)], axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [296]:
temp.to_csv('Final Results.csv',index=False)

# Rough Workspace

In [14]:
data_competitor[['title','brand','gender','size']].head()

Unnamed: 0,title,brand,gender,size
0,"nike women's power gym flutter print tights, w...",nike,Women,
1,gucci guilty black eau de toilette spray for h...,gucci,,90 ml
2,"aramis havana for men, 3.4 oz spray (gentlema...",aramis,Men,
3,la petite robe noire eau fraiche by guerlain f...,guerlain,Women,50ml
4,blue lady 2 perfume for women by rasasi 35ml,rasasi,Women,35ml


In [16]:
'women' in data_competitor.title[0].split(' ')

False

In [17]:
data_competitor.title[0].split(' ')

['nike', "women's", 'power', 'gym', 'flutter', 'print', 'tights,', 'womens']

In [19]:
vecttext_to_vector(data_competitor.title[0])

Counter({'nike': 1,
         'women': 1,
         's': 1,
         'power': 1,
         'gym': 1,
         'flutter': 1,
         'print': 1,
         'tights': 1,
         'womens': 1})

In [20]:
from nltk.stem import WordNetLemmatizer

In [22]:
lemmatizer = WordNetLemmatizer()

In [33]:
from nltk import pos_tag, pos_tag_sents, word_tokenize

In [7]:
# function to lookup token in sentence
def lookup_text(sentence, token):
    if token in sentence.split(' '):
        return token
    else:
        return 'NaN'
sentence = data_competitor.title[0]
lookup_text(sentence, 'nike')

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 5)

In [123]:
amazon_data = data_competitor[['title','brand']]

In [130]:
amazon_data.loc[:,'brand_chk'] = data_competitor[['title','brand']].apply(lambda x:  lookup_text(x[0],x[1]), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [128]:
amazon_data.loc[:,'gender_chk'] = data_competitor[['title','gender']].apply(lambda x:  lookup_text(x[0],x[1]),axis=1)