# Simple Similarity based Recommmendations
> A beginner guide to understand the similarity based recommendations from the ground-up

- toc: true
- badges: true
- comments: true
- categories: [similarity]
- image: 

In [10]:
!pip install names
!pip install faker_music
!pip install faker

Collecting faker
[?25l  Downloading https://files.pythonhosted.org/packages/eb/70/b46c571bfd410d56b8794fa87257eb084f3a6b0c6783bf05188bd33276be/Faker-8.8.1-py3-none-any.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 4.1MB/s 
Installing collected packages: faker
Successfully installed faker-8.8.1


In [164]:
import numpy as np
import names
from faker import Faker
from faker_music import MusicProvider
from itertools import product
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

fake = Faker()
fake.add_provider(MusicProvider)

In [121]:
def generate_data():
  
  NUM_USER = 8
  NUM_ITEM = 8

  RATING_OPTIONS = np.arange(0,5.5,0.5)

  while True:
    
    users = [names.get_full_name() for i in range(NUM_USER)]
    items = [fake.music_genre() for i in range(NUM_ITEM)]

    data = pd.DataFrame(list(product(users, items)), columns=['USERS', 'ITEMS'])

    PROBS = [0.6]
    PROBS.extend([0.05]*(len(RATING_OPTIONS)-1))
    PROBS = [i/sum(PROBS) for i in PROBS]

    data['RATINGS'] = np.random.choice(RATING_OPTIONS, NUM_USER*NUM_ITEM, p=PROBS)

    data['RATINGS'] = data['RATINGS'].astype('float32')

    data = pd.pivot_table(data, index='ITEMS', columns='USERS', values='RATINGS')

    data = data.replace(0, np.nan)

    if len(np.unique(items))==NUM_ITEM:
      return data, users

In [130]:
data, users = generate_data()
data

USERS,Betty Wilkinson,Cynthia Doughty,Earl Williams,George Brooks,Halina Manganaro,Johnson Butera,Laurence Force,Ralph Papp
ITEMS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Alternative,3.5,,4.0,,2.0,5.0,,
Comedy,,3.5,0.5,4.5,,,3.5,
German Folk,,,,,4.5,,,
Jazz,,1.5,,0.5,0.5,1.5,,
Pop,,2.5,,,4.0,,3.0,
Rock,,,3.0,,,0.5,,1.5
Singer/Songwriter,,,2.0,,,,0.5,
Spoken Word,,1.5,1.0,,,1.5,5.0,


In [134]:
# with open('data.json', 'w') as f:
#   f.write(json.dumps([row.dropna().to_dict() for index, row in data.iterrows()]))
data.to_json('data.json')
pd.read_json('data.json')

Unnamed: 0,Betty Wilkinson,Cynthia Doughty,Earl Williams,George Brooks,Halina Manganaro,Johnson Butera,Laurence Force,Ralph Papp
Alternative,3.5,,4.0,,2.0,5.0,,
Comedy,,3.5,0.5,4.5,,,3.5,
German Folk,,,,,4.5,,,
Jazz,,1.5,,0.5,0.5,1.5,,
Pop,,2.5,,,4.0,,3.0,
Rock,,,3.0,,,0.5,,1.5
Singer/Songwriter,,,2.0,,,,0.5,
Spoken Word,,1.5,1.0,,,1.5,5.0,


In [135]:
import json

with open('data.json') as file:
    data = json.load(file)

data

{'Betty Wilkinson': {'Alternative': 3.5,
  'Comedy': None,
  'German Folk': None,
  'Jazz': None,
  'Pop': None,
  'Rock': None,
  'Singer/Songwriter': None,
  'Spoken Word': None},
 'Cynthia Doughty': {'Alternative': None,
  'Comedy': 3.5,
  'German Folk': None,
  'Jazz': 1.5,
  'Pop': 2.5,
  'Rock': None,
  'Singer/Songwriter': None,
  'Spoken Word': 1.5},
 'Earl Williams': {'Alternative': 4.0,
  'Comedy': 0.5,
  'German Folk': None,
  'Jazz': None,
  'Pop': None,
  'Rock': 3.0,
  'Singer/Songwriter': 2.0,
  'Spoken Word': 1.0},
 'George Brooks': {'Alternative': None,
  'Comedy': 4.5,
  'German Folk': None,
  'Jazz': 0.5,
  'Pop': None,
  'Rock': None,
  'Singer/Songwriter': None,
  'Spoken Word': None},
 'Halina Manganaro': {'Alternative': 2.0,
  'Comedy': None,
  'German Folk': 4.5,
  'Jazz': 0.5,
  'Pop': 4.0,
  'Rock': None,
  'Singer/Songwriter': None,
  'Spoken Word': None},
 'Johnson Butera': {'Alternative': 5.0,
  'Comedy': None,
  'German Folk': None,
  'Jazz': 1.5,
  'Pop':

In [136]:
def del_none(d):
    """
    Delete keys with the value ``None`` in a dictionary, recursively.

    This alters the input so you may wish to ``copy`` the dict first.
    """
    # For Python 3, write `list(d.items())`; `d.items()` won’t work
    # For Python 2, write `d.items()`; `d.iteritems()` won’t work
    for key, value in list(d.items()):
        if value is None:
            del d[key]
        elif isinstance(value, dict):
            del_none(value)
    return d  # For convenience

In [138]:
data = del_none(data)
data

{'Betty Wilkinson': {'Alternative': 3.5},
 'Cynthia Doughty': {'Comedy': 3.5,
  'Jazz': 1.5,
  'Pop': 2.5,
  'Spoken Word': 1.5},
 'Earl Williams': {'Alternative': 4.0,
  'Comedy': 0.5,
  'Rock': 3.0,
  'Singer/Songwriter': 2.0,
  'Spoken Word': 1.0},
 'George Brooks': {'Comedy': 4.5, 'Jazz': 0.5},
 'Halina Manganaro': {'Alternative': 2.0,
  'German Folk': 4.5,
  'Jazz': 0.5,
  'Pop': 4.0},
 'Johnson Butera': {'Alternative': 5.0,
  'Jazz': 1.5,
  'Rock': 0.5,
  'Spoken Word': 1.5},
 'Laurence Force': {'Comedy': 3.5,
  'Pop': 3.0,
  'Singer/Songwriter': 0.5,
  'Spoken Word': 5.0},
 'Ralph Papp': {'Rock': 1.5}}

### Minkowski Distance

In [124]:
# defining distance function
def minkowski(ratings1, ratings2, r=2):
    """
    Compute the Minkowski Distance between two users.
    If `r` is not given, it defaults to Euclidian.
    """

    mink_distance = 0

    for key in ratings1:
        if key in ratings2:
            mink_distance += abs(ratings1[key] - ratings2[key])**r

    mink_distance = mink_distance ** (1/r)

    return mink_distance

In [105]:
# finding nearest neighbors
def nearest_neighbors(username, users, r=2):
    """Create a sorted list of users
    based on their Minkowski Distance Metric
    (Euclidian by default) to username"""

    distances = []

    for user in users:
        if user != username:
            mnht_distance = minkowski(users[username], users[user], r)
            distances.append((mnht_distance, user))

    distances.sort()

    return distances

In [150]:
# the recommender system
def recommend_bands(username, users):
    """Recommend bands based on other users' ratings"""
    
    neighbor = nearest_neighbors(username, users)[0][1]

    print("{}'s neighbor is {}".format(username, neighbor))

    recom_bands = []

    for band, rating in users[neighbor].items():
        if not band in users[username]:
            print("{} gave {} stars to {} and {} not listened it before!"\
                  .format(neighbor, rating, band, username))
            recom_bands.append((rating, band))
        else:
            print("{} gave {} stars to {} but {} already listened it!"\
                  .format(neighbor, rating, band, username))
    
    recom_bands.sort(reverse=True)

    return [band[1] for band in recom_bands]

In [151]:
# testing our recommender
for user in users:
  print('Recommendations for {}: {}'.format(user, recommend_bands(user, data)))
  print("\n{}\n".format('='*50))

Johnson Butera's neighbor is Cynthia Doughty
Cynthia Doughty gave 3.5 stars to Comedy and Johnson Butera not listened it before!
Cynthia Doughty gave 1.5 stars to Jazz but Johnson Butera already listened it!
Cynthia Doughty gave 2.5 stars to Pop and Johnson Butera not listened it before!
Cynthia Doughty gave 1.5 stars to Spoken Word but Johnson Butera already listened it!
Recommendations for Johnson Butera: ['Comedy', 'Pop']


Cynthia Doughty's neighbor is Betty Wilkinson
Betty Wilkinson gave 3.5 stars to Alternative and Cynthia Doughty not listened it before!
Recommendations for Cynthia Doughty: ['Alternative']


Betty Wilkinson's neighbor is Cynthia Doughty
Cynthia Doughty gave 3.5 stars to Comedy and Betty Wilkinson not listened it before!
Cynthia Doughty gave 1.5 stars to Jazz and Betty Wilkinson not listened it before!
Cynthia Doughty gave 2.5 stars to Pop and Betty Wilkinson not listened it before!
Cynthia Doughty gave 1.5 stars to Spoken Word and Betty Wilkinson not listened it 

### Pearson Correlation Coefficient

In [157]:
def pearson_corrcoef(x, y):
    
    x_mean = x.mean()
    y_mean = y.mean()

    numer = np.sum( (x - x_mean) * (y - y_mean) )
    denom = ( np.sum( (x - x_mean)**2 ) )**0.5 * ( np.sum( (y - y_mean)**2 ) )**0.5

    return numer / denom

In [158]:
# defining a function to use with our users
def pearson_users(user1, user2):
    
    global data
    ratings1 = []
    ratings2 = []

    for key in data[user1]:
        if key in data[user2]:
            ratings1.append(data[user1][key])
            ratings2.append(data[user2][key])

    ratings1 = np.array(ratings1)
    ratings2 = np.array(ratings2)

    return pearson_corrcoef(ratings1, ratings2)

In [162]:
np.isnan(10)

False

In [165]:
for user1 in users:
  for user2 in users:
    if user1!=user2:
      pearson_corr = pearson_users(user1, user2)
      if not np.isnan(pearson_corr):
        print("Pearson correlation between {} and {} is {:.2f}"\
              .format(user1, user2, pearson_corr))

Pearson correlation between Johnson Butera and Halina Manganaro is 1.00
Pearson correlation between Johnson Butera and Earl Williams is 0.60
Pearson correlation between Cynthia Doughty and Halina Manganaro is 1.00
Pearson correlation between Cynthia Doughty and Earl Williams is -1.00
Pearson correlation between Cynthia Doughty and George Brooks is 1.00
Pearson correlation between Cynthia Doughty and Laurence Force is -0.72
Pearson correlation between Halina Manganaro and Johnson Butera is 1.00
Pearson correlation between Halina Manganaro and Cynthia Doughty is 1.00
Pearson correlation between Earl Williams and Johnson Butera is 0.60
Pearson correlation between Earl Williams and Cynthia Doughty is -1.00
Pearson correlation between Earl Williams and Laurence Force is -0.79
Pearson correlation between George Brooks and Cynthia Doughty is 1.00
Pearson correlation between Laurence Force and Cynthia Doughty is -0.72
Pearson correlation between Laurence Force and Earl Williams is -0.79


### Cosine Similarity

In [177]:
johnson = pd.read_json('data.json').fillna(0)['Johnson Butera'].values
halina = pd.read_json('data.json').fillna(0)['Halina Manganaro'].values

In [178]:
# comparing Johnson and Halina (perfect similarity) using cosine similarity
x_size = np.sqrt( np.sum(johnson**2) )
y_size = np.sqrt( np.sum(halina**2) )
dot_prod = np.dot(johnson, halina)

dot_prod / (x_size * y_size)

0.3096973345860341