## For scaling the prototyped model in the previous step the following options can be considered

#### Please note that my dataset is not large and this is only an example of how I would scale the prototype in case I was dealing with bigger dataset like real world applications

#### 1. Batch Processing for Large Datasets (Using Dask)
Dask allows you to process larger-than-memory datasets by breaking them into manageable chunks. If the dataset is too large to fit in memory, we can use Dask to handle chunks of the data in parallel and perform operations like TF-IDF vectorization in batches.

#### 2. Sparse Matrices and Dimensionality Reduction
This option is useful when you're dealing with very high-dimensional data (e.g., after TF-IDF vectorization). Using sparse matrices helps reduce memory usage, and dimensionality reduction techniques like Truncated SVD can speed up the cosine similarity calculations.

In [1]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import os
import pickle
import io
from googleapiclient.http import MediaIoBaseDownload

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tabulate import tabulate

In [2]:
# Scopes for the Google Drive API
SCOPES = ['https://www.googleapis.com/auth/drive']

def authenticate_google_drive():
    creds = None
    if os.path.exists('./token.pickle'):
        with open('./token.pickle', 'rb') as token:
            creds = pickle.load(token)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'new_client_id.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('drive', 'v3', credentials=creds)
    return service

drive_service = authenticate_google_drive()

def download_file(file_id, file_name):
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.FileIO(file_name, 'wb')
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        print(f"Download {int(status.progress() * 100)}%.")

In [3]:
# All the datasets can be found here in this drive folder 
# https://drive.google.com/drive/folders/14J0u4AhUwkfKJKYnJzuD6dRW2y9uZqnD?usp=sharing
# dataset from beer advocate
# downloaded from https://www.kaggle.com/datasets/thedevastator/1-5-million-beer-reviews-from-beer-advocate
file_id = '1CmgbvYGtgp0b7wZU8z8FT8tbRCJ6WkYT'
file_name = 'beer_and_profile_ratings.csv'
download_file(file_id, file_name)

df = pd.read_csv(file_name)

Download 100%.


In [4]:
print(df.info())
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3197 entries, 0 to 3196
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               3197 non-null   object 
 1   Style              3197 non-null   object 
 2   Brewery            3197 non-null   object 
 3   Beer Name (Full)   3197 non-null   object 
 4   Description        3197 non-null   object 
 5   ABV                3197 non-null   float64
 6   Min IBU            3197 non-null   int64  
 7   Max IBU            3197 non-null   int64  
 8   Astringency        3197 non-null   int64  
 9   Body               3197 non-null   int64  
 10  Alcohol            3197 non-null   int64  
 11  Bitter             3197 non-null   int64  
 12  Sweet              3197 non-null   int64  
 13  Sour               3197 non-null   int64  
 14  Salty              3197 non-null   int64  
 15  Fruits             3197 non-null   int64  
 16  Hoppy              3197 

In [5]:
# checking for null values
df.isna().sum()

Name                 0
Style                0
Brewery              0
Beer Name (Full)     0
Description          0
ABV                  0
Min IBU              0
Max IBU              0
Astringency          0
Body                 0
Alcohol              0
Bitter               0
Sweet                0
Sour                 0
Salty                0
Fruits               0
Hoppy                0
Spices               0
Malty                0
review_aroma         0
review_appearance    0
review_palate        0
review_taste         0
review_overall       0
number_of_reviews    0
dtype: int64

In [6]:
df.columns

Index(['Name', 'Style', 'Brewery', 'Beer Name (Full)', 'Description', 'ABV',
       'Min IBU', 'Max IBU', 'Astringency', 'Body', 'Alcohol', 'Bitter',
       'Sweet', 'Sour', 'Salty', 'Fruits', 'Hoppy', 'Spices', 'Malty',
       'review_aroma', 'review_appearance', 'review_palate', 'review_taste',
       'review_overall', 'number_of_reviews'],
      dtype='object')

## Calculating feature wise similarities

In [7]:
vectorizer = TfidfVectorizer()
beer_style_vectors = vectorizer.fit_transform(df['Style'])
body_vectors = df['Body'].to_numpy().reshape(-1, 1)
beer_desc_vectors = vectorizer.fit_transform(df['Description'])

In [8]:
# Cosine similarities for these vectors
style_similarity = cosine_similarity(beer_style_vectors)
body_similarity = cosine_similarity(body_vectors)

# calculating combined similarity of style and body similarities
combined_similarity = (style_similarity + body_similarity)/2
combined_similarity_df = pd.DataFrame(combined_similarity, index=df.index, columns=df.index)

# calculating description similarities
desc_similarity = cosine_similarity(beer_desc_vectors)

desc_similarity_df = pd.DataFrame(desc_similarity, index=df.index, columns=df.index)

## Content Based Filtering

### Our problem is a cold start problem
#### In our implementation, we will be taking inputs from users on what kind of or style of beer they would like to try and ask if they would like a certain minimum level of abv in their beer
#### This means that we would need to use features based on the content of the user input and compare it to what we have in our dataset and return the closest matching recommendations.

In [9]:
def content_based_recs(style, abv=None, refresh=False):
    global start_idx
    #get similar styles of beers
    target_indices = df.index[df['Style'].str.contains(style, case=False, na=False)].tolist()
    num_recs = 5
    # If refresh is True, show next set of recommendations
    if refresh:
        start_idx += num_recs
    else:
        start_idx = 0  # Reset the index when not refreshing

    # Step 3: Get similar beers based on the description similarity
    for idx in target_indices:
        similar_beers = combined_similarity_df[idx].sort_values(ascending=False)
        similar_beers_by_desc = desc_similarity_df[idx].sort_values(ascending=False)
        
        top_recs = similar_beers.head(20)
        top_desc_recs = similar_beers_by_desc.head(20)
        
        beer_recommendations = df.loc[top_recs.index, ['Style', 'Name', 'ABV']]
        beer_recommendations['similarity_score'] = top_recs.values
        
        desc_recommendations = df.loc[top_desc_recs.index, ['Style', 'Name', 'ABV']]
        desc_recommendations['similarity_score'] = top_desc_recs.values
        
        if (abv is not None):
            if not isinstance(abv, (int, float)):
                print(f"{abv} is not a valid number")
                abv=0
                beer_recommendation = beer_recommendations.loc[df['ABV'] >= abv]
                desc_recommendations = desc_recommendations.loc[df['ABV'] >= abv]
            else:
                beer_recommendation = beer_recommendations.loc[df['ABV'] >= abv]
                desc_recommendations = desc_recommendations.loc[df['ABV'] >= abv]
    
    # Slice the DataFrame based on the start index and number of recommendations to show
    current_beer_recs = beer_recommendations.iloc[start_idx:start_idx + num_recs, :]
    current_desc_recs = desc_recommendations.iloc[start_idx:start_idx + num_recs, :]
        
    print("Here are the recommended beers for you! \n Cheers!! \n")
    # Print DataFrame in tabular format using 'tabulate'
    print(tabulate(current_beer_recs.iloc[:5,:]))
    
    

#### As the name suggests, this method returns the most popular beers based on number of reviews and overall review 

In [10]:
def popularity_based_recs(num_recs=5):
    popular_beers = df[['Name', 'review_overall', 'number_of_reviews']].sort_values(
        by=['review_overall', 'number_of_reviews'], ascending=False).head(num_recs)
    
    print(f"Top {num_recs} popular beers:")
    print(tabulate(popular_beers, headers="keys"))


#### This filters the beers based on the feature defined and returns the top beers for that beer

In [11]:
def top_beers_by_feature(feature, num_recs=5):
    top_beers = df[['Name', feature]].sort_values(by=feature, ascending=False).head(num_recs)
    
    print(f"Top {num_recs} beers by {feature}:")
    print(tabulate(top_beers, headers="keys"))


In [12]:
# Run all recommendation types
style = 'lager'
abv = 3

print("Content Based Recommendations:")
content_based_recs(style, abv)

print("\nPopularity-Based Recommendations:")
popularity_based_recs()

print("\nTop Beers by ABV:")
top_beers_by_feature('ABV')


Content Based Recommendations:
Here are the recommended beers for you! 
 Cheers!! 

----  --------------  ---------------------------------------  ----  -
1559  Lager - Vienna  Saranac Season's Best - Nut Brown Lager  5.3   1
1554  Lager - Vienna  Penn Pilsner                             5     1
1563  Lager - Vienna  Jenny Lake Lager                         4.8   1
1562  Lager - Vienna  The Raven Special Lager                  5.25  1
1561  Lager - Vienna  Rusty Chain                              5.2   1
----  --------------  ---------------------------------------  ----  -

Popularity-Based Recommendations:
Top 5 popular beers:
      Name                             review_overall    number_of_reviews
----  -----------------------------  ----------------  -------------------
1675  Lambik (2 Year Old Unblended)           5                          4
3138  Sang Noir                               4.80769                   13
 764  Helios Goya Dry                         4.75             

 ## Scaling using batch processing

In [13]:
pip install scipy dask

Note: you may need to restart the kernel to use updated packages.


In [14]:
import dask.dataframe as dd
import dask.array as da
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from scipy.spatial.distance import cosine
import numpy as np

In [17]:
# Load large dataset using Dask
df = dd.read_csv(file_name)

# Process the 'Style' and 'Description' columns in batches using Dask
def compute_tfidf_batch(df_chunk, column_name):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df_chunk[column_name])
    return tfidf_matrix

# Process the 'Style' and 'Description' columns
style_tfidf = df.map_partitions(lambda df_chunk: compute_tfidf_batch(df_chunk, 'Style')).compute()
description_tfidf = df.map_partitions(lambda df_chunk: compute_tfidf_batch(df_chunk, 'Description')).compute()

# Convert to sparse matrix
style_tfidf_sparse = csr_matrix(style_tfidf)
desc_tfidf_sparse = csr_matrix(description_tfidf)

# Normalize the vectors
style_tfidf_sparse = normalize(style_tfidf_sparse)
desc_tfidf_sparse = normalize(desc_tfidf_sparse)

# Define cosine similarity manually
def cosine_similarity_manual(X, Y):
    return 1 - cosine(X, Y)

# Now, instead of using dask cosine_similarity, we manually compute similarity
def compute_cosine_similarity_sparse(X_sparse, Y_sparse):
    # Convert sparse matrix to Dask Array
    X_dask = da.from_array(X_sparse.toarray(), chunks=(1000, 1000))
    Y_dask = da.from_array(Y_sparse.toarray(), chunks=(1000, 1000))
    
    # Compute dot products and norms using Dask
    dot_product = da.dot(X_dask, Y_dask.T)
    X_norm = da.linalg.norm(X_dask, axis=1)
    Y_norm = da.linalg.norm(Y_dask, axis=1)
    
    similarity = dot_product / (X_norm[:, None] * Y_norm[None, :])
    
    return similarity

# Compute cosine similarity for style and description in a scalable way
style_similarity = compute_cosine_similarity_sparse(style_tfidf_sparse, style_tfidf_sparse)
description_similarity = compute_cosine_similarity_sparse(desc_tfidf_sparse, desc_tfidf_sparse)

# Average the similarities to get the combined similarity
combined_similarity = (style_similarity + description_similarity) / 2

print(combined_similarity)


dask.array<truediv, shape=(3197, 3197), dtype=float64, chunksize=(1000, 1000), chunktype=numpy.ndarray>
