In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.naive_bayes import MultinomialNB

In [31]:
product = pd.read_csv('product.csv')
product.head(25)

Unnamed: 0,product_id,product_group,product_category,product_type,product,product_description,unit_of_measure,current_wholesale_price,current_retail_price,tax_exempt_yn,promo_yn,new_product_yn
0,1,Whole Bean/Teas,Coffee beans,Organic Beans,Brazilian - Organic,It's like Carnival in a cup. Clean and smooth.,12 oz,14.4,$18.00,Y,N,N
1,2,Whole Bean/Teas,Coffee beans,House blend Beans,Our Old Time Diner Blend,Out packed blend of beans that is reminiscent ...,12 oz,14.4,$18.00,Y,N,N
2,3,Whole Bean/Teas,Coffee beans,Espresso Beans,Espresso Roast,Our house blend for a good espresso shot.,1 lb,11.8,$14.75,Y,N,N
3,4,Whole Bean/Teas,Coffee beans,Espresso Beans,Primo Espresso Roast,Our primium single source of hand roasted beans.,1 lb,16.36,$20.45,Y,N,N
4,5,Whole Bean/Teas,Coffee beans,Gourmet Beans,Columbian Medium Roast,A smooth cup of coffee any time of day.,1 lb,12.0,$15.00,Y,N,N
5,6,Whole Bean/Teas,Coffee beans,Gourmet Beans,Ethiopia,From the home of coffee.,1 lb,16.8,$21.00,Y,N,N
6,7,Whole Bean/Teas,Coffee beans,Premium Beans,Jamacian Coffee River,"Ya man, it will start your day off right.",1 lb,15.8,$19.75,Y,N,N
7,8,Whole Bean/Teas,Coffee beans,Premium Beans,Civet Cat,"The most expensive coffee in the world, the ca...",.5 lb,36.0,$45.00,Y,N,N
8,9,Whole Bean/Teas,Coffee beans,Organic Beans,Organic Decaf Blend,Our blend of hand picked organic beans that ha...,1 lb,18.0,$22.50,Y,N,N
9,10,Whole Bean/Teas,Coffee beans,Green beans,Guatemalan Sustainably Grown,Green beans you can roast yourself.,1 lb,8.0,$10.00,Y,N,N


In [12]:
product['attribute'] = product[['product_group','product_category']].agg(', '.join, axis=1)
product['attribute']

0     Whole Bean/Teas, Coffee beans
1     Whole Bean/Teas, Coffee beans
2     Whole Bean/Teas, Coffee beans
3     Whole Bean/Teas, Coffee beans
4     Whole Bean/Teas, Coffee beans
                  ...              
83                Add-ons, Flavours
84                Beverages, Coffee
85                Beverages, Coffee
86                Beverages, Coffee
87                     Food, Bakery
Name: attribute, Length: 88, dtype: object

In [13]:
product['attribute'] = product['attribute'].str.split(', ')
product['attribute'].to_frame()

Unnamed: 0,attribute
0,"[Whole Bean/Teas, Coffee beans]"
1,"[Whole Bean/Teas, Coffee beans]"
2,"[Whole Bean/Teas, Coffee beans]"
3,"[Whole Bean/Teas, Coffee beans]"
4,"[Whole Bean/Teas, Coffee beans]"
...,...
83,"[Add-ons, Flavours]"
84,"[Beverages, Coffee]"
85,"[Beverages, Coffee]"
86,"[Beverages, Coffee]"


In [6]:
product['product_type'] = product['product_type'].str.split(' ')

In [8]:
product['product_type'].to_frame()

Unnamed: 0,product_type
0,"[Organic, Beans]"
1,"[House, blend, Beans]"
2,"[Espresso, Beans]"
3,"[Espresso, Beans]"
4,"[Gourmet, Beans]"
...,...
83,"[Regular, syrup]"
84,"[Specialty, coffee]"
85,"[Barista, Espresso]"
86,"[Barista, Espresso]"


In [15]:
product['all_attr'] = product['attribute']+product['product_type']
product['all_attr']

0       [Whole Bean/Teas, Coffee beans, Organic, Beans]
1     [Whole Bean/Teas, Coffee beans, House, blend, ...
2      [Whole Bean/Teas, Coffee beans, Espresso, Beans]
3      [Whole Bean/Teas, Coffee beans, Espresso, Beans]
4       [Whole Bean/Teas, Coffee beans, Gourmet, Beans]
                            ...                        
83                  [Add-ons, Flavours, Regular, syrup]
84               [Beverages, Coffee, Specialty, coffee]
85               [Beverages, Coffee, Barista, Espresso]
86               [Beverages, Coffee, Barista, Espresso]
87                                [Food, Bakery, Scone]
Name: all_attr, Length: 88, dtype: object

In [20]:
product['all_attr'] = product['all_attr'].apply(', '.join)

In [21]:
cvr = CountVectorizer(
    tokenizer = lambda x:x.split(',')
)

p_attr = cvr.fit_transform(product['all_attr'])
p_attr.toarray()

array([[0, 0, 1, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 1],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0]])

In [22]:
print(len(cvr.get_feature_names()))

43


In [23]:
print(cvr.get_feature_names())

[' bakery', ' barista', ' beans', ' biscotti', ' black', ' blend', ' branded', ' brewed', ' chai', ' chocolate', ' clothing', ' coffee', ' coffee beans', ' drink', ' drinking', ' drinking chocolate', ' drip', ' espresso', ' flavours', ' free', ' gourmet', ' green', ' herbal', ' hot', ' house', ' housewares', ' loose tea', ' organic', ' packaged chocolate', ' pastry', ' premium', ' regular', ' scone', ' seasonal', ' specialty', ' sugar', ' syrup', ' tea', 'add-ons', 'beverages', 'food', 'merchandise', 'whole bean/teas']


In [24]:
cos_score = cosine_similarity(p_attr)
cos_score

array([[1.        , 0.67082039, 0.75      , ..., 0.        , 0.        ,
        0.        ],
       [0.67082039, 1.        , 0.67082039, ..., 0.        , 0.        ,
        0.        ],
       [0.75      , 0.67082039, 1.        , ..., 0.25      , 0.25      ,
        0.        ],
       ...,
       [0.        , 0.        , 0.25      , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.25      , ..., 1.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [25]:
cos_score.shape

(88, 88)

In [26]:
pd.DataFrame(cos_score, index = product['product'], columns=product['product'])

product,Brazilian - Organic,Our Old Time Diner Blend,Espresso Roast,Primo Espresso Roast,Columbian Medium Roast,Ethiopia,Jamacian Coffee River,Civet Cat,Organic Decaf Blend,Guatemalan Sustainably Grown,...,Jumbo Savory Scone,I Need My Bean! Toque,I Need My Bean! T-shirt,I Need My Bean! Diner mug,I Need My Bean! Latte cup,Chocolate syrup,Rio Nights,Ouro Brasileiro shot,Ouro Brasileiro shot promo,Ginger Scone promo
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Brazilian - Organic,1.00000,0.67082,0.75000,0.75000,0.75000,0.75000,0.75000,0.75000,1.00000,0.75000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
Our Old Time Diner Blend,0.67082,1.00000,0.67082,0.67082,0.67082,0.67082,0.67082,0.67082,0.67082,0.67082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
Espresso Roast,0.75000,0.67082,1.00000,1.00000,0.75000,0.75000,0.75000,0.75000,0.75000,0.75000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.250000,0.250000,0.0
Primo Espresso Roast,0.75000,0.67082,1.00000,1.00000,0.75000,0.75000,0.75000,0.75000,0.75000,0.75000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.250000,0.250000,0.0
Columbian Medium Roast,0.75000,0.67082,0.75000,0.75000,1.00000,1.00000,0.75000,0.75000,0.75000,0.75000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chocolate syrup,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000,0.000000,0.0
Rio Nights,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.612372,0.612372,0.0
Ouro Brasileiro shot,0.00000,0.00000,0.25000,0.25000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.612372,1.000000,1.000000,0.0
Ouro Brasileiro shot promo,0.00000,0.00000,0.25000,0.25000,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.612372,1.000000,1.000000,0.0


In [27]:
pd.DataFrame(cos_score, index = product['product'], columns=product['product']).loc['Espresso Roast'].sort_values(ascending=False).head(15)

product
Espresso Roast                  1.00000
Primo Espresso Roast            1.00000
Brazilian - Organic             0.75000
Ethiopia                        0.75000
Guatemalan Sustainably Grown    0.75000
Organic Decaf Blend             0.75000
Jamacian Coffee River           0.75000
Civet Cat                       0.75000
Columbian Medium Roast          0.75000
Our Old Time Diner Blend        0.67082
Spicy Eye Opener Chai           0.25000
Cappuccino Lg                   0.25000
Cappuccino                      0.25000
Latte Rg                        0.25000
Latte                           0.25000
Name: Espresso Roast, dtype: float64

In [38]:
product_like = input('Which product that u like?')
index_like = product[product['product']== product_like].index[0]
product_recom = list(enumerate(cos_score[index_like]))
# product_recom_sorted = sorted(product_recom, key =lambda x :x[1], reverse=True)
# product_recom_sorted[1:11]

Which product that u like? Organic Decaf Blend


In [39]:
product_recom_70 = list(filter(lambda x: x[1] > 0.7, product_recom))
product_recom_70_sorted = sorted(product_recom_70, key=lambda x: x[1], reverse=True)
product_recom_70_sorted 

[(0, 1.0),
 (8, 1.0),
 (2, 0.75),
 (3, 0.75),
 (4, 0.75),
 (5, 0.75),
 (6, 0.75),
 (7, 0.75),
 (9, 0.75)]

In [40]:
for i in product_recom_70_sorted[1:]:
    print(product.iloc[i[0]]['product'])

Organic Decaf Blend
Espresso Roast
Primo Espresso Roast
Columbian Medium Roast
Ethiopia
Jamacian Coffee River
Civet Cat
Guatemalan Sustainably Grown
