## Content-based Recommender System
_Author: Rachel Koenig_
_____

Imports

In [1]:
import pandas as pd

from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from bs4 import BeautifulSoup
from nltk.corpus import stopwords 

Read in Category csv.

In [2]:
categories = pd.read_csv('data/category_only.csv')
categories.head()
pd.set_option('display.max_colwidth',1000)

In [3]:
categories.drop(columns='Unnamed: 0', inplace=True)  #drop unnamed column 
categories.head()  # check first 5 rows of the dataset 

Unnamed: 0,asin,name,Arts_Crafts_Sewing,Automotive,Baby_Products,Beauty_Personal_Care,Cell_Phones_Accessories,Clothing_Shoes_Jewelry,Electronics,Health_Household,...,Trunks,Umbrellas,Underwear,Wallets,Wear_to_Work,Wrist_Watches,Arm_Warmers,Baseball_Caps,Berets,overall_mean
0,0000031887,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.61
1,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.17
2,1608299953,Learn French: Rosetta Stone French - Level 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.31
3,1617160377,Learn Italian: Rosetta Stone Italian - Level 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.62
4,B00001W0KA,Buzz Lightyear Boy's Deluxe Toy Story Costume,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.62


In [4]:
# Check number of rows and columns 
categories.shape

(13732, 814)

In [5]:
# change all columns to lowercase 
categories.columns = [x.lower() for x in categories.columns]

In [6]:
# check data types 
categories.dtypes.head(25)

asin                        object
name                        object
arts_crafts_sewing         float64
automotive                 float64
baby_products              float64
beauty_personal_care       float64
cell_phones_accessories    float64
clothing_shoes_jewelry     float64
electronics                float64
health_household           float64
home_kitchen               float64
industrial_scientific      float64
office_products            float64
purchase_circles           float64
software                   float64
sports_outdoors            float64
tools_home_improvement     float64
toys_games                 float64
accessories_supplies       float64
antivirus_security         float64
arts_crafts                float64
baby_toddler_toys          float64
bath                       float64
beading_jewelry_making     float64
bedding                    float64
dtype: object

In [7]:
# Check for nulls 
categories.isnull().sum().sum()

0

In [8]:
# Check column/row output 
categories['name'][143]

"Leading Lady Women's Plus Size Sleep Leisure Cotton Bra"

Create new dataframe with only category columns.

In [9]:
cats = categories.iloc[:, 2:]
cats.head()

Unnamed: 0,arts_crafts_sewing,automotive,baby_products,beauty_personal_care,cell_phones_accessories,clothing_shoes_jewelry,electronics,health_household,home_kitchen,industrial_scientific,...,trunks,umbrellas,underwear,wallets,wear_to_work,wrist_watches,arm_warmers,baseball_caps,berets,overall_mean
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.61
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.17
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.31
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.62
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.62


Set index to product name.

In [10]:
cats = cats.set_index(categories['name'])

In [11]:
cats.head()

Unnamed: 0_level_0,arts_crafts_sewing,automotive,baby_products,beauty_personal_care,cell_phones_accessories,clothing_shoes_jewelry,electronics,health_household,home_kitchen,industrial_scientific,...,trunks,umbrellas,underwear,wallets,wear_to_work,wrist_watches,arm_warmers,baseball_caps,berets,overall_mean
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.61
SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.17
Learn French: Rosetta Stone French - Level 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.31
Learn Italian: Rosetta Stone Italian - Level 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.62
Buzz Lightyear Boy's Deluxe Toy Story Costume,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.62


In [12]:
# check number of columns and rows 
cats.shape

(13732, 812)

Check a value count example.

In [13]:
cats['trunks'].value_counts()

0.0    13714
1.0       18
Name: trunks, dtype: int64

Convert to sparse matrix to make smaller file size 

In [14]:
categories_sparse = sparse.csr_matrix(cats.fillna(0))

Shape is still the same but each only cells with values are shown.

In [15]:
categories_sparse.shape

(13732, 812)

Check the first 5 categories.

In [16]:
print(categories_sparse[:5])

  (0, 13)	1.0
  (0, 72)	1.0
  (0, 396)	1.0
  (0, 725)	1.0
  (0, 733)	1.0
  (0, 757)	1.0
  (0, 811)	4.61
  (1, 5)	1.0
  (1, 70)	1.0
  (1, 197)	1.0
  (1, 368)	1.0
  (1, 582)	1.0
  (1, 811)	4.17
  (2, 12)	1.0
  (2, 31)	1.0
  (2, 377)	1.0
  (2, 811)	4.31
  (3, 12)	1.0
  (3, 31)	1.0
  (3, 377)	1.0
  (3, 811)	4.62
  (4, 15)	1.0
  (4, 30)	1.0
  (4, 755)	1.0
  (4, 811)	4.62


Check similiarity of every item to every other item with pairwise distances. 

In [17]:
recommender = pairwise_distances(cats, metric='cosine')

In [18]:
#Checking to make sure it's a square. 
recommender.shape

(13732, 13732)

In [19]:
type(recommender)

numpy.ndarray

Convert to DataFrame with the index from categories as the index for both rows and columns.

In [20]:
recommender_df = pd.DataFrame(recommender, index=cats.index, columns=cats.index)

In [21]:
# Check first 5 rows 
recommender_df.head()

name,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,Learn French: Rosetta Stone French - Level 1,Learn Italian: Rosetta Stone Italian - Level 1,Buzz Lightyear Boy's Deluxe Toy Story Costume,Woody Deluxe Child - Size: Child S(4-6),Lewis N. Clark Stash,"Lewis N. Clark Deluxe Neck Stash, Beige","Lewis N. Clark Add-A-Bag Travel Luggage Strap, Black, One Size",Buzz Lightyear Jet Pack,...,Sakkas Everyday Essentials Caftan Tank Dress/Cover Up,Sakkas Natalie Sequin Tie Dye Blouse,Sakkas Malvina Marbled Embroidery Cap Sleeves Blouse/Top,Lindy Bop 'Ophelia' Vintage 1950's Floral Spring Garden Party Picnic Dress,Free to Live Women's Fold Over High Waisted Flowy Floor Length Maxi Skirt,Rampage Womens Ultra Cute Embroidered Heather Jersey Night Shirt (Small-3X),14k Gold-Bonded Sterling Silver Tri-Color Hoop Earrings,Classic Designs Stretch Poplin Elastic Waist Cargo Capri,"TendzArt Azules Poly Span Floral Print Full Length Long Maxi Skirt - Made in USA (Medium, Purple+Mint+Yellow)","Kenneth Cole Reaction Easy To Remember, Black, One Size"
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.221747,0.180608,0.173118,0.173118,0.23327,0.18823,0.191571,0.210148,0.232939,...,0.166517,0.232997,0.187382,0.169129,0.113773,0.194185,0.20678,0.319598,0.130877,0.173562
SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,0.221747,0.0,0.182269,0.174794,0.174794,0.182397,0.148275,0.150688,0.164491,0.187664,...,0.164311,0.229286,0.184627,0.166848,0.14966,0.19127,0.166848,0.314547,0.166071,0.132241
Learn French: Rosetta Stone French - Level 1,0.180608,0.182269,0.0,0.000274,0.131173,0.194376,0.147051,0.150562,0.170081,0.194028,...,0.16311,0.246636,0.190082,0.166542,0.149071,0.198692,0.166542,0.349247,0.165493,0.131639
Learn Italian: Rosetta Stone Italian - Level 1,0.173118,0.174794,0.000274,0.0,0.123232,0.187013,0.139255,0.142798,0.162495,0.186661,...,0.155461,0.23975,0.182679,0.158924,0.141293,0.191368,0.158924,0.343299,0.157866,0.123702
Buzz Lightyear Boy's Deluxe Toy Story Costume,0.173118,0.174794,0.131173,0.123232,0.0,0.187013,0.139255,0.142798,0.162495,0.186661,...,0.155461,0.23975,0.182679,0.158924,0.141293,0.191368,0.158924,0.343299,0.157866,0.123702


### Ready to recommend!

A search query to find an exact item name

In [22]:
q = "Dress"
categories.loc[categories['name'].str.contains(q), 'name'][:10]


96                    Skagen Women's Ancher Stainless Steel Mesh Dress Quartz Watch
227    Timex Men's T56371 Ironman Triathlon 42 Lap Combo Analog Digital Dress Watch
322                                   Gold Toe Men's Metropolitan Dress Sock 3-Pack
324                    Gold Toe Men's 3-Pack Metropolitan Over-The-Calf Dress Socks
328                                          Gold Toe Men's Windsor Wool Dress Sock
329         Gold Toe Men's Windsor Wool-Blend Over-The-Calf Dress Sock (Three-Pack)
363              CTM Men's Elastic Button End Dress Suspenders with Silver Hardware
391                                       Seiko Women's SWZ054 Two-Tone Dress Watch
443                              Anne Klein Women's 104899SVTT Two-Tone Dress Watch
475                      50s Strapless Satin Bridesmaid Bridesmaid Dress Homecoming
Name: name, dtype: object

Search for a product's index # 

In [23]:
categories[categories['name'] == "Learn French: Rosetta Stone French - Level 1"]

Unnamed: 0,asin,name,arts_crafts_sewing,automotive,baby_products,beauty_personal_care,cell_phones_accessories,clothing_shoes_jewelry,electronics,health_household,...,trunks,umbrellas,underwear,wallets,wear_to_work,wrist_watches,arm_warmers,baseball_caps,berets,overall_mean
2,1608299953,Learn French: Rosetta Stone French - Level 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.31


Turn one row (a product) into a dictionary & take a closer look at its key value pairs.

In [24]:
foo = categories.loc[2].to_dict()

In [25]:
{k:v for k,v in foo.items() if v != 0}

{'asin': '1608299953',
 'name': 'Learn French: Rosetta Stone French - Level 1',
 'software': 1.0,
 'education_reference': 1.0,
 'languages': 1.0,
 'overall_mean': 4.31}

Create a second dictionary so we can compare two products and see if they have any value similarities.

In [26]:
bar = categories.loc[2888].to_dict()

In [27]:
{k:v for k,v in bar.items() if v != 0}

{'asin': 'B001FYWBOI',
 'name': 'Elegance Long Stem Ring Holder, 9", Silver',
 'home_kitchen': 1.0,
 'home_décor': 1.0,
 'overall_mean': 5.0}

Top 10 recommendations for products using only categories as features.

In [28]:
recommender_df["Buzz Lightyear Boy's Deluxe Toy Story Costume"].sort_values()[1:11]


name
Rubies Star Wars Classic Child's Deluxe Jedi Knight Costume, Medium                                                 0.000001
Halloween Concepts Children's Costumes Pirate King - Child's Medium                                                 0.000029
DC Super Heroes Child's Batgirl Costume, Large                                                                      0.000038
Super DC Heroes Deluxe Muscle Chest Superman Costume, Child's Large                                                 0.000038
Kids Army Camouflage Combat Vest - Fits Ages 5-13 Yrs                                                               0.000077
Melissa & Doug Chef Role-Play Costume Set (Pretend Play, Materials, Machine-Washable, 17.5” H x 24” W x 0.75” L)    0.000094
Spiderman Muscle - Size: Child M(7-8)                                                                               0.000133
California Costumes Toys Hercules                                                                                   0.00

In [29]:
recommender_df["NYDJ Women's Basic Pull on Leggings"].sort_values()[1:11]

name
NYDJ Women's Basic Pull on Leggings                                0.000000
Steve Madden Legwear Women's Basic Legging                         0.000002
Angelina Plush-Lined Leggings                                      0.000002
jntworld Women's Faux Leather High Waisted Leggings                0.000004
HUE Women's Solid Color Original Jeanz Denim Legging               0.000015
D&K Monarchy Women's Seamless Capri Thin Leggings                  0.000033
HUE Women's Solid Color Original Jeanz Denim Legging               0.000033
Modern Kiwi Cable Knit Leggings                                    0.000041
Sakkas Footless Liquid Wet Look Shiny Metallic Stretch Leggings    0.000050
Maidenform Flexees Women's Shapewear Legging                       0.000050
Name: NYDJ Women's Basic Pull on Leggings, dtype: float64

In [30]:
recommender_df["Russell Athletic Men's Basic Cotton T-Shirt"].sort_values()[1:11]

name
Hanes Ultimate Men's 3-Pack Classics Lightweight Slim-Fit V-Neck T-Shirt                          0.000506
Columbia Men's Low Drag Offshore Long Sleeve Shirt, UPF 40 Protection, Moisture Wicking Fabric    0.000927
Hanes Ultimate Men's 3-Pack Classics Lightweight Slim-Fit V-Neck T-Shirt                          0.001036
Noble Mount Mens 100% Cotton Casual Shirt - Regular Fit                                           0.019183
Charles River Apparel Men's Classic Rugby Shirt                                                   0.019183
Calvin Klein Men's Single Welt Pocket Polo                                                        0.019183
Russell Athletic Men's Big & Tall Dri Power Short-Sleeve Polo Shirt                               0.019183
Carhartt Men's Hines Solid Long-Sleeve Shirt Long-Sleeve Button-Front Twill                       0.019183
Carhartt Men's Chamois Button Front Original Fit Shirt                                            0.019183
PGA TOUR Men's Stacked Polo Shir

While it appears some of products are of a similar type, based on their names at least, the scores are too close to 0 and some are identical which means there must be a problem somewhere. Taking a closer look, I find that many items have the exact same features in common.  To fix this, I know I need to add more features.

## Iterations to make model better 

Read in text csv with product names to count vectorize.

In [31]:
names = pd.read_csv('data/names_to_vectorize.csv')

In [32]:
# check first 5 rows 
names.head()

Unnamed: 0.1,Unnamed: 0,asin,name,name_split
0,0,0000031887,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,mystiqueshapes girl ballet tutu neon lime green
1,23,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,shining image huge pink leather jewelry box case storage organizer travel case lock
2,29,1608299953,Learn French: Rosetta Stone French - Level 1,learn french rosetta stone french level 1
3,42,1617160377,Learn Italian: Rosetta Stone Italian - Level 1,learn italian rosetta stone italian level 1
4,58,B00001W0KA,Buzz Lightyear Boy's Deluxe Toy Story Costume,buzz lightyear boy deluxe toy story costume


In [33]:
# Check for nulls 
names.isnull().sum()

Unnamed: 0    0
asin          0
name          0
name_split    0
dtype: int64

In [34]:
# drop unnamed index column 
names.drop(columns='Unnamed: 0', inplace=True)

In [35]:
names.head()

Unnamed: 0,asin,name,name_split
0,0000031887,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,mystiqueshapes girl ballet tutu neon lime green
1,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,shining image huge pink leather jewelry box case storage organizer travel case lock
2,1608299953,Learn French: Rosetta Stone French - Level 1,learn french rosetta stone french level 1
3,1617160377,Learn Italian: Rosetta Stone Italian - Level 1,learn italian rosetta stone italian level 1
4,B00001W0KA,Buzz Lightyear Boy's Deluxe Toy Story Costume,buzz lightyear boy deluxe toy story costume


Read in colors csv.

In [36]:
colors = pd.read_csv('data/colors_split.csv')
colors.head()

Unnamed: 0.1,Unnamed: 0,asin,name,turquoise,nickel,diamond,cream,maize,sea green,dark green,...,copper,apricot,mustard,yellow,jade,bronze,cardinal,rose,golden brown,platinum
0,0,0000031887,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,23,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,29,1608299953,Learn French: Rosetta Stone French - Level 1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,42,1617160377,Learn Italian: Rosetta Stone Italian - Level 1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,58,B00001W0KA,Buzz Lightyear Boy's Deluxe Toy Story Costume,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# drop unnamed index column 
colors.drop(columns=['Unnamed: 0', 'asin',], inplace=True)

In [38]:
# list of all colors from colors dataset to use as stopwords when CountVectorizing the names 
list_of_colors = ['acid green', 'aero', 'african violet', 'alabaster', 'alice blue', 'almond', 'amaranth', 'amazon', 'amber', 'amethyst',
 'antique brass', 'antique bronze', 'antique white', 'apple green', 'apricot', 'aqua', 'aquamarine', 'army green', 'ash grey',
 'auburn', 'avocado', 'awesome', 'azure', 'azure mist', 'baby blue', 'baby pink', 'banana yellow', 'barn red', 'begonia',
 'beige', 'bisque', 'bittersweet', 'black', 'black olive', 'blond', 'blue', 'blue bell', 'blue lagoon', 'blue sapphire',
 'blueberry', 'blush', 'bole', 'bondi blue', 'bone', 'boysenberry', 'brass', 'brick red', 'bright green', 'bright lavender',
 'bright pink', 'bronze', 'brown sugar', 'brown yellow', 'bubble gum', 'buff', 'burgundy', 'burnished brown', 'burnt orange', 'byzantine', 'cadet', 'cadet blue', 'camel',
 'camouflage green', 'canary', 'canary yellow', 'candy pink', 'capri', 'cardinal', 'carmine', 'carnelian', 'carolina blue', 'ceil',
 'celeste', 'celestial blue', 'cerise', 'cerulean', 'cerulean blue', 'champagne', 'charcoal', 'charm pink', 'cherry', 'chestnut', 'chinese red',
 'citrine', 'citron', 'cobalt blue', 'cocoa brown', 'coconut', 'coffee', 'columbia blue', 'cool grey', 'copper', 'copper red',
 'coral', 'coral pink', 'coral red', 'coral reef', 'cordovan', 'corn', 'cornflower blue', 'cornsilk', 'cotton candy', 'coyote brown',
 'cream', 'crimson', 'crimson red', 'cultured', 'cyan', 'cyber yellow', 'daffodil', 'dandelion', 'dark blue', 'dark brown',
 'dark coral', 'dark green', 'dark khaki', 'dark pink', 'dark purple', 'dark red', 'dark tan', 'dark taupe', 'dark yellow', 'deep fuchsia',
 'deep green', 'deep red', 'deer', 'denim', 'denim blue', 'desert', 'desert sand', 'desire', 'diamond', 'dirt', 'dodger blue', 'drab','ebony',
 'ecru', 'eggplant', 'eggshell', 'electric blue', 'electric green', 'electric lime', 'electric purple', 'electric yellow', 'emerald',
 'fandango', 'fawn', 'fern green', 'flame', 'flax', 'flirt', 'folly', 'french blue', 'french pink', 'french rose', 'frostbite', 'fuchsia', 'fuchsia purple',
 'ginger', 'glitter', 'go green', 'gold fusion', 'golden brown', 'golden yellow', 'grape', 'gray', 'green sheen', 'grizzly', 'gunmetal', 'harvest gold', 'heart gold',
 'hollywood cerise', 'hot pink', 'hunter green', 'iceberg', 'imperial', 'imperial blue', 'indigo', 'iris', 'irresistible', 'ivory',
 'jade', 'jasper', 'jet', 'jungle green', 'kelly green', 'kiwi', 'lapis lazuli', 'lava', 'lavender purple', 'lemon', 'lemon yellow', 'liberty',
 'light blue', 'light brown', 'light gray', 'light green', 'light pink', 'light taupe', 'light yellow', 'lilac', 'lime green',
 'linen', 'liver', 'lust', 'magenta', 'magnolia', 'mahogany', 'maize', 'malachite', 'mandarin', 'mango tango', 'mardi gras',
 'marigold', 'mauve', 'medium blue', 'medium purple', 'melon', 'midnight', 'midnight blue', 'milk','ming', 'mint', 'mint green', 'moccasin',
 'moss green', 'mulberry', 'mustard', 'mystic', 'navy', 'neon green', 'new car', 'nickel', 'ocean blue', 'ocean green', 'ochre', 'old gold',
 'old lavender', 'old rose', 'old silver', 'olive', 'onyx', 'orchid', 'oxford blue', 'pacific blue', 'pale green', 'pale pink',
 'paradise pink', 'pastel brown', 'pastel green', 'pastel pink', 'pastel red', 'peach', 'pear', 'pearl', 'peridot', 'periwinkle', 'persian red',
 'persimmon', 'peru', 'pine green', 'pink', 'pink flamingo','pink lace', 'pink pearl','pistachio', 'platinum', 'plum', 'popstar',
 'powder blue', 'pumpkin', 'purple heart', 'quartz', 'quick silver', 'raspberry', 'raspberry rose', 'red', 'redwood', 'regalia',
 'rhythm', 'rose', 'rose dust', 'rose gold', 'rose pink', 'rose quartz', 'rose red', 'rosewood', 'royal blue', 'royal purple',
 'ruby', 'ruby red', 'rust', 'safety orange', 'safety yellow', 'saffron', 'sage', 'salmon', 'sand', 'sand dune', 'sangria', 'sap green', 'sapphire', 'sapphire blue', 'scarlet',
 'sea blue', 'sea green', 'sepia', 'shadow', 'shadow blue', 'shamrock green', 'shocking pink', 'sienna', 'silver', 'silver pink', 'silver sand', 'sky blue', 'slate blue', 
 'slate gray', 'smitten', 'smoke', 'snow', 'solid pink',
 'spring green', 'steel blue', 'steel pink', 'straw', 'strawberry', 'sugar plum', 'sunny', 'sunset', 'sweet brown', 'tan', 'tangelo', 'tangerine', 'taupe', 'teal',
 'teal blue', 'teal green', 'terra cotta', 'thistle', 'timberwolf', 'titanium yellow', 'tomato', 'topaz', 'true blue', 'tulip', 'turquoise', 'turquoise blue', 
 'turquoise green', 'tuscan', 'tuscany', 'ua blue', 'ube', 'ultra pink', 'ultramarine', 'ultramarine blue', 'umber', 'vanilla', 'vegas gold', 'veronica', 
 'violet', 'vivid yellow', 'volt', 'wenge', 'wheat', 'white', 'wild strawberry', 'wine','yale blue','yellow','yellow rose']


In [39]:
type(list_of_colors)

list

In [40]:
colors['lime green'].value_counts()

0    13724
1        8
Name: lime green, dtype: int64

In [41]:
colors.shape

(13732, 367)

In [42]:
colors.head()

Unnamed: 0,name,turquoise,nickel,diamond,cream,maize,sea green,dark green,amber,jet,...,copper,apricot,mustard,yellow,jade,bronze,cardinal,rose,golden brown,platinum
0,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Learn French: Rosetta Stone French - Level 1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Learn Italian: Rosetta Stone Italian - Level 1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Buzz Lightyear Boy's Deluxe Toy Story Costume,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## CountVectorizer 

In [43]:
# Import CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
# Instantiate the "CountVectorizer" object, which is scikit-learn's bag of words tool.
cvec = CountVectorizer(analyzer = "word",   # default, says features should be made of words 
                       tokenizer = None,    #default 
                       preprocessor = None, # default 
                       stop_words = list_of_colors,  # list of words I do not want counted 
                       max_features=10000,  # maximum number of features, n most frequent 
                       min_df=2,            # minimum number of documents a word must appear in   
                       ngram_range=(1, 3),  # a range of number of words to look at 
                       binary=True)         # returns a 1 or a 0 if a word is in the document 

In [45]:
names_features = cvec.fit_transform(names['name_split'])

  'stop_words.' % sorted(inconsistent))


In [46]:
# check the shape 
names_features.shape

(13732, 10000)

In [47]:
type(names_features)

scipy.sparse.csr.csr_matrix

In [48]:
# Check the feature/column names 
vocab = cvec.get_feature_names()
print(vocab)

['00', '00 carat', '00 carat total', '00g', '01', '02', '03', '05', '0mm', '10', '10 13', '10 cttw', '10 cttw color', '10 inch', '10 pack', '10 pack cotton', '10 running', '10 running shoe', '10 watch', '10 year', '10 year battery', '100', '100 cashmere', '100 cashmere long', '100 cashmere neck', '100 cotton', '100 cotton flannel', '100 facet', '100 facet collection', '100 leather', '1000', '1000 running', '1000 running shoe', '100cm', '100pc', '100pc belly', '1050', '1050 hd', '1050 hd tuff', '10k', '10k gold', '10kt', '10kt gold', '10mm', '11', '11 12', '11 inch', '12', '12 14', '12 24', '12 inch', '12 mm', '12 month', '12 pack', '12 pair', '12 piece', '12 running', '12 running shoe', '120', '12mm', '13', '13 inch', '13 running', '13 running shoe', '14', '14 16', '14 16 18', '14 40', '14 40 inch', '14 running', '14 running shoe', '1440', '1440 sport', '1440 sport digital', '14g', '14k', '14k ball', '14k gold', '14k gold accent', '14k gold bonded', '14k gold filled', '14k gold hoop', 

Remove words that are just integers

In [49]:
keep_words = []     # instantiate an empty list 
for word in vocab:  # loop through the list words/strings
    try:
        [int(i) for i in word.split(" ")]  # try to turn each word in the string into an integer 
    except:                               # if it gets an error = not an integer
        keep_words.append(word)           # add it tot he keep words list

In [50]:
keep_words[:10]

['00 carat',
 '00 carat total',
 '00g',
 '0mm',
 '10 cttw',
 '10 cttw color',
 '10 inch',
 '10 pack',
 '10 pack cotton',
 '10 running']

Turn newly created feaures matrix into a DataFrame.

In [51]:
feature_names_df = pd.DataFrame(names_features.toarray(),
                         columns= cvec.get_feature_names())

In [52]:
feature_names_df.head()

Unnamed: 0,00,00 carat,00 carat total,00g,01,02,03,05,0mm,10,...,zirconia halo,zirconia pendant,zirconia pendant necklace,zirconia ring,zirconia round,zirconia stud,zirconia stud earring,zombie,zoned,zoned support
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Check value counts of a specific feature 
feature_names_df['zombie'].value_counts()

0    13726
1        6
Name: zombie, dtype: int64

Find the column names that intersect with the words in the keep list 

In [54]:
feature_names_df.columns.intersection(keep_words)

Index(['00 carat', '00 carat total', '00g', '0mm', '10 cttw', '10 cttw color',
       '10 inch', '10 pack', '10 pack cotton', '10 running',
       ...
       'zirconia halo', 'zirconia pendant', 'zirconia pendant necklace',
       'zirconia ring', 'zirconia round', 'zirconia stud',
       'zirconia stud earring', 'zombie', 'zoned', 'zoned support'],
      dtype='object', length=9885)

Create a DataFrame with the just the non-integer words 

In [55]:
feature_names_df = feature_names_df[feature_names_df.columns.intersection(keep_words)]

Use a dictionary comprhension to check the features of a row.

In [56]:
box = feature_names_df.loc[1].to_dict()

{k:v for k,v in box.items() if v != 0}

{'box': 1,
 'box case': 1,
 'case': 1,
 'case lock': 1,
 'jewelry': 1,
 'jewelry box': 1,
 'jewelry box case': 1,
 'leather': 1,
 'leather jewelry': 1,
 'leather jewelry box': 1,
 'lock': 1,
 'organizer': 1,
 'organizer travel': 1,
 'organizer travel case': 1,
 'storage': 1,
 'storage organizer': 1,
 'storage organizer travel': 1,
 'travel': 1,
 'travel case': 1,
 'travel case lock': 1}

In [57]:
# Check number of rows and columns - eliminated an additional 115 columns by getting rid of ones names with integers.
feature_names_df.shape

(13732, 9885)

In [58]:
feature_names_df.head()

Unnamed: 0,00 carat,00 carat total,00g,0mm,10 cttw,10 cttw color,10 inch,10 pack,10 pack cotton,10 running,...,zirconia halo,zirconia pendant,zirconia pendant necklace,zirconia ring,zirconia round,zirconia stud,zirconia stud earring,zombie,zoned,zoned support
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Add name features 

Merge `categories` with `feature_names_df` on the index.

In [59]:
cats_feats = pd.merge(categories, feature_names_df, right_index=True, left_index=True)

In [60]:
cats_feats.head()

Unnamed: 0,asin,name_x,arts_crafts_sewing,automotive,baby_products,beauty_personal_care,cell_phones_accessories,clothing_shoes_jewelry,electronics,health_household,...,zirconia halo,zirconia pendant,zirconia pendant necklace,zirconia ring,zirconia round,zirconia stud,zirconia stud earring,zombie,zoned,zoned support
0,0000031887,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1608299953,Learn French: Rosetta Stone French - Level 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1617160377,Learn Italian: Rosetta Stone Italian - Level 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,B00001W0KA,Buzz Lightyear Boy's Deluxe Toy Story Costume,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


Check that the 814 category columns were successfully added.

In [61]:
cats_feats.shape

(13732, 10699)

Drop `asin` column and set `name` to index.

In [62]:
cats_feats = cats_feats.set_index('name_x').drop(columns='asin')

In [63]:
cats_feats.shape

(13732, 10697)

In [64]:
cats_feats_sparse = sparse.csr_matrix(cats_feats.fillna(0))

In [65]:
cats_feats_sparse

<13732x10697 sparse matrix of type '<class 'numpy.float64'>'
	with 220665 stored elements in Compressed Sparse Row format>

Check similiarity of every item to every other item with pairwise distances & newly added features. 

In [66]:
recommender2 = pairwise_distances(cats_feats, metric='cosine')

In [67]:
recommender2.round(3)

array([[0.   , 0.488, 0.332, ..., 0.566, 0.387, 0.371],
       [0.488, 0.   , 0.465, ..., 0.649, 0.528, 0.47 ],
       [0.332, 0.465, 0.   , ..., 0.587, 0.415, 0.342],
       ...,
       [0.566, 0.649, 0.587, ..., 0.   , 0.517, 0.573],
       [0.387, 0.528, 0.415, ..., 0.517, 0.   , 0.423],
       [0.371, 0.47 , 0.342, ..., 0.573, 0.423, 0.   ]])

In [68]:
recommender_df = pd.DataFrame(recommender2.round(3), index=cats_feats.index, columns=cats_feats.index)
recommender_df.head()

name_x,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,Learn French: Rosetta Stone French - Level 1,Learn Italian: Rosetta Stone Italian - Level 1,Buzz Lightyear Boy's Deluxe Toy Story Costume,Woody Deluxe Child - Size: Child S(4-6),Lewis N. Clark Stash,"Lewis N. Clark Deluxe Neck Stash, Beige","Lewis N. Clark Add-A-Bag Travel Luggage Strap, Black, One Size",Buzz Lightyear Jet Pack,...,Sakkas Everyday Essentials Caftan Tank Dress/Cover Up,Sakkas Natalie Sequin Tie Dye Blouse,Sakkas Malvina Marbled Embroidery Cap Sleeves Blouse/Top,Lindy Bop 'Ophelia' Vintage 1950's Floral Spring Garden Party Picnic Dress,Free to Live Women's Fold Over High Waisted Flowy Floor Length Maxi Skirt,Rampage Womens Ultra Cute Embroidered Heather Jersey Night Shirt (Small-3X),14k Gold-Bonded Sterling Silver Tri-Color Hoop Earrings,Classic Designs Stretch Poplin Elastic Waist Cargo Capri,"TendzArt Azules Poly Span Floral Print Full Length Long Maxi Skirt - Made in USA (Medium, Purple+Mint+Yellow)","Kenneth Cole Reaction Easy To Remember, Black, One Size"
name_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.488,0.332,0.318,0.306,0.362,0.316,0.344,0.435,0.322,...,0.347,0.415,0.395,0.37,0.34,0.442,0.429,0.566,0.387,0.371
SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,0.488,0.0,0.465,0.454,0.444,0.454,0.424,0.446,0.493,0.424,...,0.474,0.528,0.512,0.493,0.491,0.551,0.518,0.649,0.528,0.47
Learn French: Rosetta Stone French - Level 1,0.332,0.465,0.0,0.036,0.274,0.333,0.285,0.313,0.409,0.291,...,0.348,0.428,0.399,0.371,0.369,0.448,0.402,0.587,0.415,0.342
Learn Italian: Rosetta Stone Italian - Level 1,0.318,0.454,0.036,0.0,0.26,0.32,0.271,0.3,0.397,0.277,...,0.334,0.417,0.387,0.358,0.356,0.437,0.39,0.579,0.403,0.329
Buzz Lightyear Boy's Deluxe Toy Story Costume,0.306,0.444,0.274,0.26,0.0,0.265,0.258,0.254,0.386,0.264,...,0.323,0.406,0.377,0.347,0.345,0.427,0.38,0.571,0.392,0.317


In [69]:
recommender_df.shape

(13732, 13732)

A search query to find the exact title of an item.

In [131]:
q = "Gown"
categories.loc[categories['name'].str.contains(q), 'name'] 

432                                                                         Shadowline Women's Silhouette 53 Inch Sleeveless Long Gown
1643                                                                              3 Bone Hoop Skirt Bridal Wedding Gown Slip (CH130DS)
1761                                                        Shadowline Women's Plus-Size Silhouette 53 Inch Short Cap Sleeve Long Gown
1783                                     SeXy Sheer Stretch Lace Nightgown Long Gown Thong Panty Set Black or Red Color: Red, ONE SIZE
2889                                                                                Shadowline Women's Petals 53 Inch Sleeve Long Gown
4914                                                                      Del Rossa Womens Soild Color Satin Robe, Short Dressing Gown
4994                                                     Melissa & Doug Princess Role Play Costume Set (3 pcs)- Pink Gown, Tiara, Wand
5435                                                   

Check Buzz Lightyear kid's costume again to compare to original and to the no max features model.


In [71]:
recommender_df["Buzz Lightyear Boy's Deluxe Toy Story Costume"].sort_values()[1:11]


name_x
California Costumes Toys Hercules                             0.090
Storybook Cinderella Prestige                                 0.090
California Costumes Toys Vampire Girl                         0.107
The Elder Wand, The Wand of Professor Dumbledore              0.125
Child's Red and Black Spanish Princess Costume, Small         0.127
Rubie's Deluxe George Washington Children's Costume, Large    0.135
Disney Frozen Enchanting Dress - Anna                         0.135
Thor Child Movie Hammer                                       0.136
Spiderman Muscle - Size: Child M(7-8)                         0.147
DC Super Heroes Child's Batgirl Costume, Large                0.153
Name: Buzz Lightyear Boy's Deluxe Toy Story Costume, dtype: float64

A pink jewelry box. 

In [106]:
recommender_df['Fairy Tale Jewelry Box'].sort_values()[1:11]

name_x
Enchantmints Unicorn Music Jewelry Box                                                   0.035
Orb Factory Sticky Mosaics: Jewelry Box                                                  0.036
Schylling Iridescent Fairy Jewelry Box                                                   0.037
Enchantmints Horse Ranch Music Jewelry Box                                               0.051
Lenox Childhood Memories Ballerina Jewelry Box                                           0.051
Schylling - Pink Jewel Jewelry Box                                                       0.057
Schylling Iridescent Ballerina Jewelry Box                                               0.068
Mele & Co. Renee Jewelry Box - 10.5W x 2.25H in.                                         0.083
Sueded Jewelry Box with 24 Sections in Red - Maria - Jewelry Boxes by Mele - 0054522M    0.084
Travelon Jewelry Roll- Leopard                                                           0.104
Name: Fairy Tale Jewelry Box, dtype: float6

A baby beanie that looks like a football.

In [107]:
recommender_df["Mud Pie Baby Boys' Football Hat"].sort_values()[1:11]

name_x
Zutano Unisex Baby Cozie Shaggy Hat                               0.156
Babysoy Janey Baby Hat                                            0.191
Lauren Madison Baby Boy Christening Socks with Cross Appliques    0.196
Babylegs Baby Boys' Monsters Leg Warmer                           0.196
Sunday Afternoons Kids Play Hat                                   0.207
Mud Pie Jeweled Flower Socks                                      0.209
Mud Pie Baby Girl's Birthday Tutu Dress                           0.210
i play. Baby Boys' Classic Flap Sun Protection Hat                0.211
Mud Pie Baby-Girls Newborn Tulle Puff Socks                       0.213
Pearl iZUMi Transfer Hat                                          0.215
Name: Mud Pie Baby Boys' Football Hat, dtype: float64

Search by row index.

In [114]:
recommender_df["U.S. Polo Assn. Men's Short Snorkel Jacket"].sort_values()[1:11]

name_x
U.S. Polo Assn. Men's Basic Vest Small Horse                   0.157
U.S. Polo Assn. Men's Classic Polo Shirt                       0.162
U.S. Polo Assn. Men's Striped V-Neck T-Shirt                   0.165
Columbia Men's Bugaboo Interchange Jacket                      0.165
U.S. Polo Assn. Men's Striped T-shirt                          0.166
Free Country Men's 3 in 1 Systems Jacket                       0.168
Members Only Men's Iconic Classic Racer Jacket                 0.172
Down Under Australian Oilskin Drover                           0.174
U.S. Polo Assn. Men's Hoodie with Nubby Polar Fleece Lining    0.179
U.S. Polo Assn. Men's Side Stripe Fleece Pants                 0.182
Name: U.S. Polo Assn. Men's Short Snorkel Jacket, dtype: float64

In [124]:
recommender_df["Secret Wishes Star Wars Princess Leia Prisoner Adult Costume"].sort_values()[1:11]

name_x
Secret Wishes Star Wars Princess Leia Costume                 0.065
Secret Wishes Maid Costume                                    0.162
Secret Wishes Women's Adult Supergirl Costume                 0.173
InCharacter Wretched Witch Adult Costume-                     0.202
Secret Wishes Batman Arkham City Sexy Harley Quinn Costume    0.223
Secret Wishes Adult Cecilia Wig                               0.232
Secret Wishes Adult Cecilia Wig                               0.232
DC Comics Deluxe Batgirl Adult Costume                        0.238
Star Wars Princess Leia Costume                               0.243
elope Where's Waldo Adult Costume Kit                         0.257
Name: Secret Wishes Star Wars Princess Leia Prisoner Adult Costume, dtype: float64

In [125]:
categories[categories['name'] == "Secret Wishes Star Wars Princess Leia Prisoner Adult Costume"]

Unnamed: 0,asin,name,arts_crafts_sewing,automotive,baby_products,beauty_personal_care,cell_phones_accessories,clothing_shoes_jewelry,electronics,health_household,...,trunks,umbrellas,underwear,wallets,wear_to_work,wrist_watches,arm_warmers,baseball_caps,berets,overall_mean
2611,B001B6N6SK,Secret Wishes Star Wars Princess Leia Prisoner Adult Costume,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.55


In [75]:
foo = cats_feats.loc['Van Authentic'].to_dict()

{k:v for k,v in foo.items() if v != 0}

{'clothing_shoes_jewelry': 1.0,
 'skateboarding': 1.0,
 'women': 1.0,
 'shoes': 1.0,
 'athletic_x': 1.0,
 'overall_mean': 4.55,
 'authentic': 1.0,
 'van': 1.0}

Save to csv

In [76]:
# recommender_df.to_csv('data/recommender.csv')

# recommender_df = pd.read_csv('data/recommender.csv').set_index('name_x')

# list(recommender_df["Mud Pie Baby Boys' Football Hat"].sort_values()[1:11].index)

## TF-IDF

Import & Instantiate TfIdfVectorizer

In [77]:
# Import
from sklearn.feature_extraction.text import TfidfVectorizer

#Instantiate. Remove all color stop words 
tfidf = TfidfVectorizer(analyzer='word',
                        stop_words=list_of_colors,
                        binary=True,         # If True, all non-zero term counts are set to 1. however, outputs will not be only 0/1 values, only that the tf term in tf-idf is binary.
                        ngram_range=(1, 3),
                        max_features=10000,  # maximum number of features, n most frequent 
                        min_df=2,            # minimum number of documents a word must appear in   
                        norm='l2',           # scores without this parameter seemed overfit, l2 is better than l1 in this case because we already have a sparse matrix
#                        smooth_idf=False,
#                         use_idf=False
                   )

Fit transform the TFI-Df to the `name_split` column

In [78]:
tfidf_matrix = tfidf.fit_transform(names['name_split'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

  'stop_words.' % sorted(inconsistent))


(13732, 10000)

In [79]:
tfidf_vocab = tfidf.get_feature_names()

In [80]:
keep_words = []     # instantiate an empty list 
for word in tfidf_vocab:  # loop through the list words/strings
    try:
        [int(i) for i in word.split(" ")]  # try to turn each word in the string into an integer 
    except:                               # if it gets an error = not an integer
        keep_words.append(word)           # add it tot he keep words list

In [81]:
feature_names_tfidf = pd.DataFrame(tfidf_matrix.toarray(),
                         columns= tfidf.get_feature_names())

In [82]:
feature_names_tfidf.head()

Unnamed: 0,00,00 carat,00 carat total,00g,01,02,03,05,0mm,10,...,zirconia halo,zirconia pendant,zirconia pendant necklace,zirconia ring,zirconia round,zirconia stud,zirconia stud earring,zombie,zoned,zoned support
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
feature_names_tfidf['classic'].value_counts().head()

0.000000    13368
0.147891        5
0.132128        3
0.137835        3
0.175644        2
Name: classic, dtype: int64

In [84]:
feature_names_tfidf = feature_names_tfidf[feature_names_tfidf.columns.intersection(keep_words)]

In [85]:
feature_names_tfidf.shape

(13732, 9885)

In [86]:
df2 = pd.merge(categories, feature_names_tfidf, right_index=True, left_index=True)

In [87]:
df2 = df2.set_index('name_x').drop(columns='asin')

In [88]:
df2.head()

Unnamed: 0_level_0,arts_crafts_sewing,automotive,baby_products,beauty_personal_care,cell_phones_accessories,clothing_shoes_jewelry,electronics,health_household,home_kitchen,industrial_scientific,...,zirconia halo,zirconia pendant,zirconia pendant necklace,zirconia ring,zirconia round,zirconia stud,zirconia stud earring,zombie,zoned,zoned support
name_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mystiqueshapes Girls Ballet Tutu Neon Lime Green,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / CASE / STORAGE / ORGANIZER WITH TRAVEL CASE AND LOCK,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Learn French: Rosetta Stone French - Level 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Learn Italian: Rosetta Stone Italian - Level 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Buzz Lightyear Boy's Deluxe Toy Story Costume,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
df2.shape

(13732, 10697)

In [90]:
df2['classic'].value_counts().head()

0.000000    13368
0.147891        5
0.132128        3
0.137835        3
0.175644        2
Name: classic, dtype: int64

In [91]:
tfidf_sparse = sparse.csr_matrix(df2.fillna(0))

In [92]:
tfidf_sparse

<13732x10697 sparse matrix of type '<class 'numpy.float64'>'
	with 220665 stored elements in Compressed Sparse Row format>

In [93]:
recommender5 = pairwise_distances(df2, metric='cosine')

In [94]:
recommender5.shape

(13732, 13732)

In [95]:
recommender_tfidf = pd.DataFrame(recommender5.round(3), index=df2.index, columns=df2.index)

In [121]:
q = "Prom Dress"
categories.loc[categories['name'].str.contains(q), 'name'][:]

5570    Long Satin Bandage Evening Gown Formal Bridesmaid Prom Dress Brooch
Name: name, dtype: object

In [97]:
recommender_tfidf["Buzz Lightyear Boy's Deluxe Toy Story Costume"].sort_values()[1:11]

name_x
California Costumes Toys Hercules                                                                                   0.025
California Costumes Toys Vampire Girl                                                                               0.026
Rubies Star Wars Classic Child's Deluxe Jedi Knight Costume, Medium                                                 0.029
Rubie's Deluxe George Washington Children's Costume, Large                                                          0.031
Super DC Heroes Deluxe Muscle Chest Superman Costume, Child's Large                                                 0.032
DC Comics Wonder Woman Toddler Costume Red Caped T-Shirt                                                            0.033
Forum Novelties Sparkle Princess Costume, Toddler Size                                                              0.033
Child's Red and Black Spanish Princess Costume, Small                                                               0.033
Melissa & Doug Ch

In [130]:
recommender_tfidf["Secret Wishes Star Wars Princess Leia Prisoner Adult Costume"].sort_values()[1:10]

name_x
Secret Wishes Star Wars Princess Leia Costume                     0.013
Secret Wishes Maid Costume                                        0.025
Secret Wishes Women's Adult Supergirl Costume                     0.027
Secret Wishes Batman Arkham City Sexy Harley Quinn Costume        0.033
DC Comics Secret Wishes Wonder Woman Corset Costume               0.036
DC Comics Deluxe Batgirl Adult Costume                            0.038
InCharacter Wretched Witch Adult Costume-                         0.039
Secret Wishes Women's Dark Knight Rises Adult Catwoman Costume    0.042
California Costumes Deluxe Hooded Robe Adult Costume              0.043
Name: Secret Wishes Star Wars Princess Leia Prisoner Adult Costume, dtype: float64

In [122]:
recommender_tfidf["Long Satin Bandage Evening Gown Formal Bridesmaid Prom Dress Brooch"].sort_values()[1:11]

name_x
Ever-Pretty Sleeveless V-Neck Semi-Formal Maxi Evening Dress 09016                  0.030
Ever-Pretty Sleeveless V-Neck Semi-Formal Maxi Evening Dress 09016                  0.034
50s Strapless Satin Bridesmaid Bridesmaid Dress Homecoming                          0.034
Wild Zebra Inspired Graphic Print Beaded Halter Smocked Bodice Long/Maxi Dress      0.035
Sakkas Stonewashed Rayon Embroidered Adjustable Spaghetti Straps Long Dress         0.036
Arlyn Perez V Neck Floral Lace 3/4 Sleeve Cocktail Party Mini Dress                 0.036
Columbia Women's Saturday Trail Stretch Dress                                       0.036
Funfash Plus Size Women Blue Black Empire Waist A Line Block Long Maxi New Dress    0.036
Funfash Plus Size Women Empire Waist A Line Slimming Cocktail Dress Made in USA     0.037
Funfash Plus Size Women Empire Waist A Line Slimming Cocktail Dress Made in USA     0.037
Name: Long Satin Bandage Evening Gown Formal Bridesmaid Prom Dress Brooch, dtype: float64

In [128]:
recommender_tfidf["Kanu Surf Men's Havana Swim Trunk"].sort_values()[1:11]

name_x
Kanu Surf Men's Barracuda Swim Trunk                                                     0.007
Kanu Surf Men's Barracuda Swim Trunks (Regular & Extended Sizes)                         0.010
Speedo Surf Runner Volley Swim Trunks, Red Pepper, X-Large                               0.027
Speedo Men's Marina Swim Trunk                                                           0.029
Speedo Men's Marina Swim Trunk- Manufacturer Discontinued - Manufacturer Discontinued    0.032
Zehui Mens Swimwear Sexy Sport Shorts Tie Rope Swim Trunks                               0.032
Speedo Men's Poly Mesh Square Leg Swimsuit                                               0.048
Kanu Surf Men's CB Rashguard UPF 50+ Swim Shirt                                          0.060
Kanu Surf Men's Solid Rashguard UPF 50+ Swim Shirt                                       0.062
Mens Solid Color with Racing Stripe Skate Surf Board Short/Swim Trunks                   0.076
Name: Kanu Surf Men's Havana Swim Trunk, dt

In [138]:
categories[categories['name'] == "Kanu Surf Men's Havana Swim Trunk"]

Unnamed: 0,asin,name,arts_crafts_sewing,automotive,baby_products,beauty_personal_care,cell_phones_accessories,clothing_shoes_jewelry,electronics,health_household,...,trunks,umbrellas,underwear,wallets,wear_to_work,wrist_watches,arm_warmers,baseball_caps,berets,overall_mean
5205,B004DNVOW8,Kanu Surf Men's Havana Swim Trunk,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.86


In [100]:
foo = df2.loc["Cute Space Kitten Ladies Leggings"].to_dict()

{k:v for k,v in foo.items() if v != 0}

{'clothing_shoes_jewelry': 1.0,
 'leggings': 1.0,
 'women': 1.0,
 'clothing_x': 1.0,
 'overall_mean': 4.67,
 'cute': 0.4385577715363202,
 'kitten': 0.5380193559519271,
 'lady': 0.33487446394786097,
 'legging': 0.3414623253931198,
 'space': 0.5380193559519271}

In [101]:
bar = df2.loc["Yelete Women's Premium Color Warm Legging"].to_dict()

{k:v for k,v in bar.items() if v != 0}

{'clothing_shoes_jewelry': 1.0,
 'leggings': 1.0,
 'women': 1.0,
 'clothing_x': 1.0,
 'overall_mean': 4.36,
 'color': 0.31743096179852204,
 'legging': 0.39432616643279106,
 'premium': 0.4364655720922731,
 'warm': 0.47820533447493535,
 'woman': 0.12969510242520035,
 'woman premium': 0.5547441769166043}

In [102]:
#recommender_tfidf.to_csv('data/tfidf_recommender.csv')