#                                         Keyword Mapping

### Match free form text to master list hierarchy (manufacturer, brand, product) within nested dictionary

In [1]:
# Match free-form text against the eyeglass hierarchy stored as a nested
# dictionary: master = {'manufacturer': {'brand': product}}
def collect_key_words(master):
    """Flatten the nested hierarchy into a single list of keywords.

    Parameters
    ----------
    master : dict
        Mapping of manufacturer -> {brand: product}.

    Returns
    -------
    list of str
        Each manufacturer followed by its (brand, product) pairs, in
        dictionary order. Every entry is kept, not just the last one visited.
    """
    key_words = []
    for manufacturer, brand_products in master.items():
        key_words.append(manufacturer)
        for brand, product in brand_products.items():
            key_words.extend([brand, product])
    return key_words


def match_key_words(free_form, master):
    """Return the keywords from ``master`` that occur in ``free_form``.

    The comparison is a case-insensitive substring test, so a product code
    such as "VO5153" is still found when the text was typed in lowercase.
    Keywords are returned with their original spelling from ``master``.
    """
    haystack = free_form.lower()
    return [kw for kw in collect_key_words(master) if kw.lower() in haystack]


master = {"luxottica": {"vogue": "VO5153"}}

# In a notebook __name__ is "__main__", so the prompt runs as before; the
# guard only avoids blocking on input() if this code is imported elsewhere.
if __name__ == "__main__":
    free_form = input("Enter free form text: ")
    for key_word in match_key_words(free_form, master):
        print("keyword mapping is: ", key_word)

Enter free form text: vogue eyeglasses
keyword mapping is:  vogue


### What if there are no matches, or only partial matches? Use a fuzzy string matching Python package to handle data-entry inaccuracies, improve data quality, and reduce the workload for Data Integrity Specialists

In [2]:
# Use the fuzzywuzzy package to cope with inaccurate entries within PMS
from fuzzywuzzy import fuzz, process

choices = [
    'vogue',
    'valentino',
    'versace',
]
query = 'vog'

# Score every candidate in `choices` against the partial query
process.extract(query, choices)

[('vogue', 90), ('valentino', 33), ('versace', 30)]

In [3]:
# Choose the single keyword in `choices` that most closely matches the inaccurate entry in `query`
process.extractOne(query, choices)

('vogue', 90)

### Create a Pandas DataFrame of the entire product taxonomy master list. Additionally, build layered keyword mappings that combine each brand with common product add-ons, so that variations in free form text can still be keyword matched

In [4]:
# Common lens add-ons that may appear next to a brand name in free form text
add_ons = ['polycarbonate', 'high-index', 'anti-reflective', 'polarized', 'transitions', 'blue-light protection']

# Brands under the Luxottica manufacturer: full brand names first, followed
# by the individual words of the multi-word names so partial entries can match.
# The order is significant: row labels in the resulting DataFrame depend on it.
# Spelling fixes: 'emporio', 'giorgio', 'sferoflex' and 'tory burch'.
brands = [
    'alain mikli', 'armani exchange', 'arnette', 'bolon', 'brooks brothers',
    'burberry', 'bvlgari', 'chanel', 'coach', 'costa',
    'dolce gabbana', 'emporio armani', 'ferrari', 'foster grant', 'giorgio armani',
    'luxottica', 'michael kors', 'miu miu', 'molsion', 'oakley',
    'oliver peoples', 'persol', 'polo ralph lauren', 'prada', 'ralph lauren',
    'rayban', 'sferoflex', 'starck', 'tiffany', 'tory burch',
    'valentino', 'versace', 'vogue',
    'alain', 'mikli', 'armani', 'exchange', 'brooks', 'brothers',
    'dolce', 'gabbana', 'emporio', 'foster', 'grant', 'giorgio',
    'michael', 'kors', 'miu', 'oliver', 'polo', 'ralph', 'lauren',
    'tory', 'burch',
]

# For every brand/add-on pair emit two [brand, phrase] records:
# "<brand> <add-on>" first, then "<add-on> <brand>"
keywords_list = [
    [brand, phrase]
    for brand in brands
    for add_on in add_ons
    for phrase in (brand + ' ' + add_on, add_on + ' ' + brand)
]

# Inspect keyword list
from pprint import pprint
pprint(keywords_list[:5])

[['alain mikli', 'alain mikli polycarbonate'],
 ['alain mikli', 'polycarbonate alain mikli'],
 ['alain mikli', 'alain mikli high-index'],
 ['alain mikli', 'high-index alain mikli'],
 ['alain mikli', 'alain mikli anti-reflective']]


### Convert the product list of lists into a Pandas DataFrame with columns that correspond to Manufacturer, Brand, Product, Add-ons, etc.

In [5]:
# Convert the list of [brand, layered keyword] pairs into a DataFrame
import pandas as pd

# Each inner list becomes one row; the two columns are labelled 0 and 1 until they are renamed
keywords_df = pd.DataFrame.from_records(keywords_list)

# Show the first rows of the keywords DataFrame to explore it
keywords_df.head()

Unnamed: 0,0,1
0,alain mikli,alain mikli polycarbonate
1,alain mikli,polycarbonate alain mikli
2,alain mikli,alain mikli high-index
3,alain mikli,high-index alain mikli
4,alain mikli,alain mikli anti-reflective


In [6]:
# Label the two positional columns, then attach constant manufacturer and
# product columns to every row
keywords_df = (
    keywords_df
    .rename(columns={0: 'Brands', 1: 'Layered_Keyword_Mapping'})
    .assign(Manufacturer_Name='Luxottica', Product='sunglasses')
)
keywords_df.head()

Unnamed: 0,Brands,Layered_Keyword_Mapping,Manufacturer_Name,Product
0,alain mikli,alain mikli polycarbonate,Luxottica,sunglasses
1,alain mikli,polycarbonate alain mikli,Luxottica,sunglasses
2,alain mikli,alain mikli high-index,Luxottica,sunglasses
3,alain mikli,high-index alain mikli,Luxottica,sunglasses
4,alain mikli,alain mikli anti-reflective,Luxottica,sunglasses


### Create Python function to map Product taxonomy Data Frame to free form text entered into PMS

In [7]:
# Build one boolean mask per taxonomy column, marking the rows whose value
# is exactly one of the free form text entries from PMS
s = ['vogue','Luxottica','sunglasses','polarized vogue']
brands_mask = keywords_df['Brands'].isin(s)
manufacturer_mask = keywords_df['Manufacturer_Name'].isin(s)
add_ons_mask = keywords_df['Layered_Keyword_Mapping'].isin(s)

In [8]:
# Rows of the Product Taxonomy Master List whose brand matched the free form text
brands_mask[brands_mask]

384    True
385    True
386    True
387    True
388    True
389    True
390    True
391    True
392    True
393    True
394    True
395    True
Name: Brands, dtype: bool

In [9]:
# Manufacturer match flags, shown only for the rows whose brand matched
# (the filter here is brands_mask, not manufacturer_mask)
manufacturer_mask[brands_mask]

384    True
385    True
386    True
387    True
388    True
389    True
390    True
391    True
392    True
393    True
394    True
395    True
Name: Manufacturer_Name, dtype: bool

### Determine whether the free form text entered into PMS exactly matches an entry in the product taxonomy DataFrame and, if so, what the details of that match are

In [10]:
# Rows of the master list whose layered keyword equals a free form text entry
add_ons_mask[add_ons_mask]

391    True
Name: Layered_Keyword_Mapping, dtype: bool

In [11]:
# Take a look at the master-list row that corresponds to the free form text.
# The row is selected through add_ons_mask instead of a hard-coded label, so
# it stays correct if the brand list changes; .iloc[0] takes the first match
# and raises IndexError when nothing matched.
keywords_df.loc[add_ons_mask].iloc[0]

Brands                               vogue
Layered_Keyword_Mapping    polarized vogue
Manufacturer_Name                Luxottica
Product                         sunglasses
Name: 391, dtype: object

### Now that we have successfully mapped free form data to the product taxonomy data, utilize NLTK on the marketing text data to look for trends that can be used for marketing purposes

In [12]:
# import text pre-processing packages
import nltk
import nltk.corpus
import re
import string
from nltk.corpus import stopwords
import nltk as nlp

# Marketing copy used as the sample text; `tex` is a second name for the same string
text = 'Ray-Ban, one of the world’s leading lifestyle eyewear brands, and Oakley, a leader in the sport and performance category, serve as a strong base for our proprietary brand portfolio, complemented by Persol, Oliver Peoples and Alain Mikli at the high-end of the market, Costa del Mar and Arnette in the sport market, and Vogue Eyewear, Bolon, Molsion and Ossé in the affordable lifestyle market. The portfolio is rounded out by non-prescription reading glasses, including the brand Foster Grant'
tex = text

In [13]:
# Text normaliser: lowercase, strip ASCII punctuation, drop words containing
# digits, and trim leading/trailing whitespace
def clean_text(text):
    """Return a normalised copy of ``text``.

    Steps, in order:
    1. lowercase the text;
    2. delete every character listed in ``string.punctuation`` (ASCII only,
       so a typographic apostrophe such as ’ is left in place);
    3. delete every run of word characters that contains a digit;
    4. strip whitespace from both ends (inner whitespace is not collapsed).

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The cleaned text.
    """
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Raw string: '\w' and '\d' are regex classes, not Python string escapes
    text = re.sub(r'\w*\d\w*', '', text)
    return text.strip()

# Short alias used with Series.apply
clean = clean_text

In [14]:
# Split the marketing text on whitespace and load the words into a
# one-column DataFrame
List_of_Words = tex.split()
print(f'List of Words ={List_of_Words}')
df = pd.DataFrame({'Names': List_of_Words})
df.head()

List of Words =['Ray-Ban,', 'one', 'of', 'the', 'world’s', 'leading', 'lifestyle', 'eyewear', 'brands,', 'and', 'Oakley,', 'a', 'leader', 'in', 'the', 'sport', 'and', 'performance', 'category,', 'serve', 'as', 'a', 'strong', 'base', 'for', 'our', 'proprietary', 'brand', 'portfolio,', 'complemented', 'by', 'Persol,', 'Oliver', 'Peoples', 'and', 'Alain', 'Mikli', 'at', 'the', 'high-end', 'of', 'the', 'market,', 'Costa', 'del', 'Mar', 'and', 'Arnette', 'in', 'the', 'sport', 'market,', 'and', 'Vogue', 'Eyewear,', 'Bolon,', 'Molsion', 'and', 'Ossé', 'in', 'the', 'affordable', 'lifestyle', 'market.', 'The', 'portfolio', 'is', 'rounded', 'out', 'by', 'non-prescription', 'reading', 'glasses,', 'including', 'the', 'brand', 'Foster', 'Grant']


Unnamed: 0,Names
0,"Ray-Ban,"
1,one
2,of
3,the
4,world’s


In [15]:
# Run the cleaning helper over every word and keep the result as a DataFrame
from nltk.tokenize import word_tokenize
updated_text = df['Names'].apply(clean).to_frame()
updated_text.head()

Unnamed: 0,Names
0,rayban
1,one
2,of
3,the
4,world’s


In [16]:
# Further cleaning: remove English stop words and lemmatize what is left
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

text1 = updated_text['Names'].tolist()
text2 = str(text1)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def cleaner_text(text2):
    """Tokenize ``text2``, drop stop words and lemmatize the remaining tokens.

    The surviving lemmas are joined with single spaces; a string made up
    only of stop words comes back as the empty string.
    """
    kept_tokens = [token for token in word_tokenize(text2) if token not in stop_words]
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)


clean2 = lambda x: cleaner_text(x)
# Apply to every cleaned word and take a look at the updated DataFrame
updated = pd.DataFrame(updated_text['Names'].apply(clean2))
updated.head()

Unnamed: 0,Names
0,rayban
1,one
2,
3,
4,world ’


In [17]:
# Remove the rows left empty by stop-word removal, renumber the index from 0,
# and take a look at the result
blank_rows = updated['Names'].eq('')
i = updated[blank_rows].index
updated.drop(index=i, inplace=True)
updated.reset_index(drop=True, inplace=True)
updated.head()

Unnamed: 0,Names
0,rayban
1,one
2,world ’
3,leading
4,lifestyle


In [18]:
# Left-join the website words onto the product taxonomy (word == brand) so
# that trends can be analyzed; words without a matching brand keep NaN in the
# taxonomy columns
updated_new = pd.merge(
    updated,
    keywords_df,
    how='left',
    left_on='Names',
    right_on='Brands',
)
updated_new.head()

Unnamed: 0,Names,Brands,Layered_Keyword_Mapping,Manufacturer_Name,Product
0,rayban,rayban,rayban polycarbonate,Luxottica,sunglasses
1,rayban,rayban,polycarbonate rayban,Luxottica,sunglasses
2,rayban,rayban,rayban high-index,Luxottica,sunglasses
3,rayban,rayban,high-index rayban,Luxottica,sunglasses
4,rayban,rayban,rayban anti-reflective,Luxottica,sunglasses


In [19]:
# Summarize the matches per brand found in the marketing text data.
# NOTE(review): this counts merged rows, not mentions. Every brand has 12 rows
# in keywords_df (6 add-ons x 2 word orders), so one matching word yields 12.
summary = updated_new.groupby('Brands')['Names'].count()
print(summary)

Brands
alain      12
arnette    12
bolon      12
costa      12
foster     12
grant      12
mikli      12
molsion    12
oakley     12
oliver     12
persol     12
rayban     12
vogue      12
Name: Names, dtype: int64


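### This shows that Rayban, Alain Mikli, Arnette, Bolon, Costa, Foster Grant, Molsion, Oakley, Oliver Peoples, Persol and Vogue all appear in the marketing text data. Note that each count of 12 comes from the 12 layered keyword rows per brand in the taxonomy (6 add-ons × 2 word orders), not from 12 separate mentions, so the matched brands are tied rather than ranked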