# Recommendation System for chocolates.

### Import necessary dependencies. Adjust display settings.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from scipy.stats import norm

# Default figure size and font size for every plot drawn in this notebook.
plt.rcParams.update({
    'figure.figsize': [10, 10],
    'font.size': 16,
})

### Load the data.

In [None]:
# Location of the ratings CSV, relative to the notebook's working directory.
CACAO_CSV_PATH = '../input/chocolate-bar-ratings/flavors_of_cacao.csv'
cacao = pd.read_csv(CACAO_CSV_PATH)

### Change the column names to be more descriptive.

In [None]:
# Replace the headers from the CSV with short names. The assignment is
# positional, so this list must contain one entry per column, in file order.
cacao.columns = [
    'Company',
    'Name',
    'Ref',
    'Review Date',
    'Cocoa percent',
    'Country',
    'Rating',
    'Bean Type',
    'Bean Origin',
]

## Before making the recommendation system, I have a couple of burning questions.

### Is there a correlation between cocoa percentage and rating?

### Let's create a scatterplot of the two attributes.

In [None]:
# Strip the trailing '%' so the cocoa percentage can be plotted as a number.
cocoa_percent = cacao['Cocoa percent'].apply(lambda x: float(x.split('%')[0]))

# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally and raise a TypeError for positional x/y vectors.
# The trailing semicolon suppresses the Axes repr in the notebook output.
sns.scatterplot(x=cocoa_percent, y=cacao['Rating']);

### The correlation looks weak. To quantify it, let's measure the Pearson correlation coefficient.

In [None]:
# Pearson correlation between the numeric cocoa percentage and the rating.
cocoa_share = cacao['Cocoa percent'].map(lambda pct: float(pct.split('%')[0]))
pearsonr(cocoa_share, cacao['Rating'])

### Another question I had is whether the countries known for their chocolate really do have the best chocolate.

In [None]:
# Per-country summary: mean of 'Rating' and number of non-null 'Name' entries.
# Note that the count ends up in a column that is still called 'Name'.
rating_and_count = {'Rating': 'mean', 'Name': 'count'}
countries = cacao.groupby('Country').agg(rating_and_count)

In [None]:
# Ten countries with the highest mean rating.
ranked_by_rating = countries.sort_values('Rating', ascending=False)
ranked_by_rating.head(10)

### The 10 countries with the highest average rating for their cacao flavors are not the ones known for their chocolate.

In [None]:
# Ten countries with the most entries ('Name' holds the per-country count).
ranked_by_count = countries.sort_values('Name', ascending=False)
ranked_by_count.head(10)

### The countries with the most cacao flavors, however, are well known for their chocolate.

## Let's create a simple recommendation system.

### I'm going to use item similarity to find the N most similar items to recommend to a hypothetical user. For item similarity, I'm going to use cosine similarity.

### First let's do some preprocessing to derive the features we are going to use for the calculation of the similarity.

In [None]:
# Work on an independent (deep) copy so the preprocessing below does not
# modify the original `cacao` frame.
features = cacao.copy()

In [None]:
# Turn strings such as '70%' into plain floats.
features['Cocoa percent'] = features['Cocoa percent'].apply(lambda x: float(x.split('%')[0]))


def _encode_labels(column):
    """Replace every value of ``features[column]`` with an integer code.

    Codes are handed out in order of first appearance in the column, so the
    first distinct value becomes 0, the next one 1, and so on.

    Args:
        column: Name of the column in ``features`` to encode in place.

    Returns:
        The list of distinct values; a value's position in the list is its code.
    """
    labels = list(features[column].unique())
    # One dict lookup per row instead of a linear ``list.index`` scan per row.
    code_of = {label: code for code, label in enumerate(labels)}
    features[column] = features[column].apply(lambda label: code_of[label])
    return labels


all_companies = _encode_labels('Company')
all_countries = _encode_labels('Country')
all_beans = _encode_labels('Bean Origin')

In [None]:
# Preview the first five rows of the encoded feature frame.
features.head(n=5)

### Create a 2D matrix with the similarity for each cacao flavor with every other.

In [None]:
# Build the feature matrix from every column except 'Name' and 'Bean Type'.
# NOTE(review): the remaining columns go in unscaled and the categorical ones
# are plain integer codes, so large-valued columns weigh more in the cosine.
# Consider scaling / one-hot encoding; confirm this is intended.
features_as_array = features.drop(['Name', 'Bean Type'], axis=1).to_numpy()

# Called with a single matrix, every row is compared with every other row.
Similarities = cosine_similarity(features_as_array)

In [None]:
# Dimensions of the pairwise similarity matrix.
np.shape(Similarities)

### We need to be able to get the index of a cacao flavor in the dataframe by its name.

In [None]:
def get_index(cacao_name):
    """Return the index label of the first row named ``cacao_name``.

    If several rows of ``features`` share the same 'Name', only the first
    one is returned.

    Args:
        cacao_name: Value to look up in the 'Name' column of ``features``.

    Returns:
        The index label of the first matching row.

    Raises:
        IndexError: If no row of ``features`` has that name.
    """
    matches = features.loc[features['Name'] == cacao_name].index
    if matches.empty:
        # Same exception type as the bare ``[0]`` lookup, but with a message
        # that tells the caller which name was not found.
        raise IndexError('No cacao flavor named {!r} in the data.'.format(cacao_name))
    return matches[0]


print('Index of cacao flavor: {}'.format(get_index('Atsane')))

### Now we need to sort the similarity matrix and retrieve the first N items that the user requests.

In [None]:
def recommend(cacao_name, N=10):
    """Return the names of the ``N`` flavors most similar to ``cacao_name``.

    The row of ``Similarities`` belonging to the requested flavor is ranked
    from most to least similar, the requested flavor itself is removed, and
    the names of the first ``N`` remaining rows are returned.

    Args:
        cacao_name: Value of the 'Name' column identifying the flavor.
        N: Maximum number of recommendations to return.

    Returns:
        A list of at most ``N`` names, most similar first. Empty when
        ``N`` is zero or negative.

    Raises:
        Whatever ``get_index`` raises when ``cacao_name`` is not found.
    """
    if N <= 0:
        return []

    # NOTE(review): the label returned by get_index is used as a row position
    # in Similarities, which assumes `features` has a default 0..n-1 index.
    index = get_index(cacao_name)

    # Rank rows by descending similarity; the stable sort keeps rows with
    # equal scores in their original order.
    ranked = np.argsort(-Similarities[index], kind='stable')

    # Remove the query row explicitly instead of assuming it sorts first:
    # another row with an equal score can be ranked ahead of it.
    ranked = ranked[ranked != index][:N]

    # Translate row positions back to chocolate names.
    return list(features['Name'].iloc[ranked])

### It is important to be able to derive the name of a specific item given its index. This is why we use a pandas dataframe.

In [None]:
# Ten flavors most similar to 'Atsane'.
recommend('Atsane', N=10)