### Importing libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Loading transformed data

In [2]:
# Read the coffee dataset from disk
file_path = '../data_store/coffee.csv'
raw_coffee = pd.read_csv(file_path)

# Step 1: Data Preprocessing
# Every missing cell, in every column, is replaced by the literal string 'None'.
# This applies to numeric columns too (e.g. altitude), so such columns may hold
# the 'None' placeholder alongside numbers until they are converted again.
coffee_data = raw_coffee.fillna('None')

In [3]:
# Preview the first five rows of the pre-processed frame
coffee_data.head(n=5)

Unnamed: 0,roaster,name,link,price,altitude,varietal,processing,estate,roast_level,tasting_notes,description,country,scraped_at,transformed_at,location,producers,aroma,acidity,body,other_properties
0,bloom_coffee_roasters,ro-busta rhymes - dark roast,https://bloomcoffeeroasters.in/collections/cof...,440.0,1005.0,sln-795,"natural , washed","venkids valley, moganad",MEDIUM_DARK,,elevation : 3300 ft . – 4500ft . varietal : ar...,india,2024-09-02 04:48:32.811752,2024-09-25 13:26:35.566899,"coorg, karnataka, yercaud, tamil nadu","pavan nanjappa, msp coffee, navin rajes",,,,
1,bloom_coffee_roasters,vin van gogh - medium roast,https://bloomcoffeeroasters.in/collections/cof...,480.0,1135.0,"sln-5b, sln-6, sln-9",washed,"venkids valley, moganad",MEDIUM_DARK,"berries , almonds , spiced chocolate","elevation : 3200 - 4250 ft . varietal : sl6 , ...",india,2024-09-02 04:48:32.811752,2024-09-25 13:26:35.566899,"coorg, karnataka, yercaud, tamil nadu","pavan nanjappa, msp coffee, navin rajes",,,,fruity
2,bloom_coffee_roasters,venkids valley estate - the red honey - light ...,https://bloomcoffeeroasters.in/collections/cof...,525.0,1000.0,catuai,"yellow honey , red honey",venkids valley,LIGHT,"mango , hibiscus , butterscotch",elevation : 1000 masl variety : catuai process...,india,2024-09-02 04:48:32.811752,2024-09-25 13:26:35.566899,"coorg, karnataka",pavan nanjappa,,,richer,
3,bloom_coffee_roasters,venkids valley estate (washed) – coorg - mediu...,https://bloomcoffeeroasters.in/collections/cof...,490.0,1097.0,sln-6,washed,venkids valley,MEDIUM,"mexican chocolate , caramel , orange zest",elevation : 3600 ft . varietal : sln6 processi...,india,2024-09-02 04:48:32.811752,2024-09-25 13:26:35.566899,"coorg, karnataka",pavan nanjappa,,,syrupy,
4,bloom_coffee_roasters,cascara (coffee cherry tea) - salawara estate ...,https://bloomcoffeeroasters.in/collections/cof...,250.0,1050.0,,"washed , coffee cherry tea",salawara,,"sugar , honey , lemon","cascara , also known as “coffee cherry tea” is...",india,2024-09-02 04:48:32.811752,2024-09-25 13:26:35.566899,"chickmagaluru, karnataka",sharan gowda,,,,


### Using roast_level, estate, varietal, tasting notes and processing; this feature set will need to change based on the recommender type

* Use the five features above when the most similar coffee overall is needed
* But many times a recommendation is needed based on tasting notes alone!

In [4]:
# One-hot encode the categorical columns; dense output so the result can be
# stacked with the other numpy blocks further down.
categorical_columns = ['roast_level', 'estate']
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(coffee_data[categorical_columns])

# Free-text columns: each one gets its own TF-IDF vectorizer (and therefore its
# own vocabulary), fitted and applied in a single step.
tfidf_vectorizer_notes = TfidfVectorizer()
tasting_notes_tfidf = tfidf_vectorizer_notes.fit_transform(coffee_data['tasting_notes'])

tfidf_vectorizer_processing = TfidfVectorizer()
processing_tfidf = tfidf_vectorizer_processing.fit_transform(coffee_data['processing'])

tfidf_vectorizer_varietal = TfidfVectorizer()
varietal_tfidf = tfidf_vectorizer_varietal.fit_transform(coffee_data['varietal'])

# aroma, acidity and body are currently not part of the feature matrix; TF-IDF
# vectors for them can be built exactly like the three above if needed.

# Altitude: the 'None' placeholder is mapped to 0, then the column is cast to
# float. Note that this overwrites coffee_data['altitude'] itself.
altitude_numeric = coffee_data['altitude'].replace('None', 0).astype(float)
coffee_data['altitude'] = altitude_numeric

# Normalize the altitude using MinMaxScaler
scaler = MinMaxScaler()
altitude_scaled = scaler.fit_transform(coffee_data[['altitude']])

# Combine encoded features and TF-IDF vectors into a single feature matrix
# (one row per coffee, feature blocks laid side by side).
feature_blocks = [
    encoded_features,
    tasting_notes_tfidf.toarray(),
    processing_tfidf.toarray(),
    varietal_tfidf.toarray(),
    altitude_scaled,
]
features = np.concatenate(feature_blocks, axis=1)

# Step 3: Compute Cosine Similarity
similarity_matrix = cosine_similarity(features)

# Step 4: Build the Recommender System
def recommend_coffee(coffee_name, similarity_matrix, coffee_data, top_n=5):
    """Return the queried coffee followed by its ``top_n`` most similar coffees.

    Parameters
    ----------
    coffee_name : str
        Name of the coffee to look up; compared case-insensitively against
        ``coffee_data['name']``.
    similarity_matrix : array-like
        Square matrix of pairwise similarity scores whose row/column order
        matches the row order (position, not index label) of ``coffee_data``.
    coffee_data : pandas.DataFrame
        Coffee table; must contain ``name`` and the display columns listed in
        ``display_columns`` below.
    top_n : int, default 5
        Number of recommendations to return. Values below 1 yield no
        recommendations.

    Returns
    -------
    pandas.DataFrame or str
        All rows matching ``coffee_name`` followed by the recommended rows,
        restricted to the display columns. If no row matches, the string
        ``"Coffee not found in the dataset."`` is returned instead.
    """
    display_columns = ['roaster', 'name', 'altitude', 'varietal',
        'processing', 'estate', 'roast_level', 'tasting_notes', 'location', 'producers',
        'aroma', 'acidity', 'body', 'other_properties']

    # Boolean mask of rows whose name matches, computed once and reused below.
    name_matches = (
        coffee_data['name'].str.lower().eq(coffee_name.lower())
        .fillna(False)
        .to_numpy(dtype=bool)
    )
    matching_positions = np.flatnonzero(name_matches)
    if matching_positions.size == 0:
        return "Coffee not found in the dataset."

    # Work with the row *position*, not the index label: the similarity matrix
    # and ``iloc`` are positional, so labels would break on a non-default index.
    query_pos = matching_positions[0]

    scores = np.asarray(similarity_matrix[query_pos], dtype=float)
    # Stable descending sort keeps the original row order among tied scores.
    ranked_positions = np.argsort(-scores, kind='stable')
    # Drop the query row explicitly instead of assuming it sorts first; another
    # row with an identical score could otherwise take its place.
    ranked_positions = ranked_positions[ranked_positions != query_pos]
    # Clamp so a negative top_n cannot slice from the end of the ranking.
    top_positions = ranked_positions[:max(top_n, 0)]

    queried_set = coffee_data.loc[name_matches, display_columns]
    return_set = coffee_data.iloc[top_positions][display_columns]

    return pd.concat([queried_set, return_set])

In [6]:
# Example usage: look up coffees similar to the one named below
recommendations = recommend_coffee(
    coffee_name='Heisenberg',
    similarity_matrix=similarity_matrix,
    coffee_data=coffee_data,
    top_n=5,
)
recommendations

Unnamed: 0,roaster,name,altitude,varietal,processing,estate,roast_level,tasting_notes,location,producers,aroma,acidity,body,other_properties
189,savorworks,Heisenberg,1450.0,sln-9,"natural , fruit infused carbonic maceration",riverdale,LIGHT,"fig , cranberry , roasted pineapple , port wine","yercaud, tamil nadu",prakashan balaraman,,malic,heavy,very high aftertaste
42,blue_tokai,Riverdale Estate N72,1500.0,"sln-5b, sln-9",natural,riverdale,LIGHT,"milk chocolate , grapefruit , red apple","yercaud, tamil nadu",prakashan balaraman,dried fruits,medium high,"bright , syrupy",
66,corridor_seven,Riverdale Estate - Geisha,1450.0,geisha,carbonic maceration,riverdale,LIGHT,"jasmine tea , honeysuckle , macerated plums , ...","yercaud, tamil nadu",prakashan balaraman,,,,
64,corridor_seven,Melkodige Estate - LORD 1385/1386,1280.0,sln-9,natural,melkodige,LIGHT,"pineapple , raisin , white grape","chickmagaluru, karnataka","aveen rodrigues, yogitha",,,,
188,savorworks,Blue Sky - Mixed Naturals,1450.0,"sln-9, sln-795",mixed fruit fermented natural,riverdale,MEDIUM,"chamomile , red grape , blueberry , butterscotch","yercaud, tamil nadu",prakashan balaraman,,malic,creamy,very long aftertaste
61,corridor_seven,Melkodige Estate - Anaerobic Naturals,1280.0,sln-9,anaerobic natural,melkodige,LIGHT,"honeymelon , papaya , grapefruit","chickmagaluru, karnataka","aveen rodrigues, yogitha",,,,
