# Analytics for Unstructured Data: Group Assignment #2

### Authors: Luke Leon, Maxine Gardner, Kedar Godbole, Kimble Horsak, Sonali Hornick, Deeksha Koonadi
### Afternoon Cohort Fall 2024

# PART A

### Web Scraper Code. Note: this cell takes ~80 minutes to run

In [None]:
!pip install selenium


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException
import pandas as pd
import time

def get_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    return driver

def get_beer_urls(driver, main_url):
    driver.get(main_url)
    beer_urls = []
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table tbody tr")))
        links = driver.find_elements(By.CSS_SELECTOR, "table tbody tr a[href]")
        for link in links:
            href = link.get_attribute('href')
            if '/beer/' in href:
                product_name = href.split('/beer/')[1].split('/')[0].replace('-', ' ').title()  # Formatting the URL part into a readable product name
                beer_urls.append((product_name, href))
        print(f"Found {len(beer_urls)} beer links.")
        return beer_urls
    except TimeoutException:
        print("Failed to locate beer profile links within the given time.")
        return []


def set_reviews_per_page(driver):
    try:
        dropdown_input = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[contains(concat( " ", @class, " " ), concat( " ", "MuiInputBase-input", " " ))]'))
        )
        dropdown_input.click()
        option_100 = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//li[text()="100"]'))
        )
        option_100.click()
        print("Changed number of reviews per page to 100.")
        time.sleep(2)
    except NoSuchElementException:
        print("Failed to find dropdown or option to set reviews per page.")
    except Exception as e:
        print(f"An error occurred while setting reviews per page: {e}")

def click_next_page(driver):
    """
    Click the next page button using JavaScript to avoid element interception, and check if it's disabled.
    """
    try:
        # Locate the "Next" button using the correct XPath
        next_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[contains(concat( " ", @class, " " ), concat( " ", "MuiIconButton-colorInherit", " " )) and not(contains(@class, "Mui-disabled"))]'))
        )

        # Check if the button is disabled by class
        if "Mui-disabled" in next_button.get_attribute("class"):
            print("Next button is disabled. No more pages.")
            return False

        # Click the button using JavaScript to avoid interception
        driver.execute_script("arguments[0].click();", next_button)
        print("Navigated to the next page.")
        time.sleep(1)
        return True
    except NoSuchElementException:
        print("Next page button not found.")
        return False
    except ElementClickInterceptedException:
        print("Next button click intercepted by another element.")
        return False
    except Exception as e:
        print(f"An error occurred while clicking the next page: {e}")
        return False

def scrape_reviews(driver, url, product_name, max_reviews=100):
    driver.get(url)
    reviews_data = []
    # Set reviews per page to 100
    set_reviews_per_page(driver)

    while len(reviews_data) < max_reviews:
        # Wait for the reviews to be visible
        WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section[class*="BeerReviewListItem"]')))
        review_elements = driver.find_elements(By.CSS_SELECTOR, 'section[class*="BeerReviewListItem"]')

        for review_element in review_elements:
            if len(reviews_data) >= max_reviews:
                break  # Break the inner loop if we have collected enough reviews

            # Handle "Show more" if present
            try:
                show_more_buttons = review_element.find_elements(By.XPATH, ".//button[contains(., 'Show more')]")
                if show_more_buttons:
                    for button in show_more_buttons:
                        driver.execute_script("arguments[0].click();", button)
                    time.sleep(1)  # Wait for expansion of the text
            except Exception as e:
                print(f"Error clicking 'Show more': {e}")

            # Extract the review text
            try:
                review_text_element = review_element.find_element(By.CSS_SELECTOR, 'div[class*="MuiTypography-body"]')
                review_text = review_text_element.text.strip()
            except NoSuchElementException:
                review_text = "Review text not found."

            # Extract the rating
            try:
                rating_element = review_element.find_element(By.CSS_SELECTOR, 'span.MuiTypography-subtitle1')
                rating = rating_element.text.strip()
            except NoSuchElementException:
                rating = "Rating not found."

            reviews_data.append({
                'Product Name': product_name,
                'Rating': rating,
                'Review': review_text
            })

        if len(reviews_data) >= max_reviews:
            break  # Break the outer loop if we have collected enough reviews

        # Click the next page button
        if not click_next_page(driver):
            break  # Break the loop if no more pages

    print(f"Total reviews scraped for {url}: {len(reviews_data)}")
    return reviews_data



def main():
    main_url = 'https://www.ratebeer.com/top'
    driver = get_driver()
    try:
        beer_urls = get_beer_urls(driver, main_url)
        all_reviews = []
        for product_name, beer_url in beer_urls:
            reviews = scrape_reviews(driver, beer_url, product_name)
            all_reviews.extend(reviews)
        df = pd.DataFrame(all_reviews)
        df.to_csv('beer_reviews.csv', index=False)
        print(f"Scraping completed. Total reviews scraped: {len(all_reviews)}")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()



Navigated to the next page.
Navigated to the next page.
Total reviews scraped for https://www.ratebeer.com/beer/bells-black-note-stout/71826/: 100
An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver                        0x00000001

Navigated to the next page.
Navigated to the next page.
Total reviews scraped for https://www.ratebeer.com/beer/alesmith-speedway-stout/14232/: 100
An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver                        0x0000000

Total reviews scraped for https://www.ratebeer.com/beer/founders-kbs-kentucky-breakfast-stout/40544/: 100
An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver                        0x0000000106627635 chromedriver + 648757
10  chrome

An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver                        0x0000000106627635 chromedriver + 648757
10  chromedriver                        0x0000000106627e5e chromedriver + 650846
11  chromedriver                   

Navigated to the next page.
Navigated to the next page.
Navigated to the next page.
Total reviews scraped for https://www.ratebeer.com/beer/toppling-goliath-assassin/163168/: 100
An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver  

Navigated to the next page.
Navigated to the next page.
Total reviews scraped for https://www.ratebeer.com/beer/the-alchemist-heady-topper/32329/: 100
An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver                        0x0000

Total reviews scraped for https://www.ratebeer.com/beer/anchorage-blessed/838934/: 100
An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver                        0x0000000106627635 chromedriver + 648757
10  chromedriver             

An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver                        0x0000000106627635 chromedriver + 648757
10  chromedriver                        0x0000000106627e5e chromedriver + 650846
11  chromedriver                   

Navigated to the next page.
Navigated to the next page.
Navigated to the next page.
Total reviews scraped for https://www.ratebeer.com/beer/omnipollo-siren-david-strachan-lorelei-barrel-aged-extra-maple/496247/: 100
An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chr

Navigated to the next page.
Total reviews scraped for https://www.ratebeer.com/beer/alesmith-speedway-stout-vietnamese-coffee/159307/: 100
An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver                        0x0000000106627635

Total reviews scraped for https://www.ratebeer.com/beer/lost-abbey-cable-car/76897/: 100
An error occurred while setting reviews per page: Message: 
Stacktrace:
0   chromedriver                        0x0000000106a4cd18 chromedriver + 4996376
1   chromedriver                        0x0000000106a445da chromedriver + 4961754
2   chromedriver                        0x00000001065e7d10 chromedriver + 388368
3   chromedriver                        0x000000010663430f chromedriver + 701199
4   chromedriver                        0x00000001066343f1 chromedriver + 701425
5   chromedriver                        0x0000000106679464 chromedriver + 984164
6   chromedriver                        0x00000001066589dd chromedriver + 850397
7   chromedriver                        0x0000000106676a00 chromedriver + 973312
8   chromedriver                        0x0000000106658753 chromedriver + 849747
9   chromedriver                        0x0000000106627635 chromedriver + 648757
10  chromedriver           

### These next two code blocks clean the initial output from the scraper into a clean table

In [1]:
import pandas as pd

def transform_csv(input_file, output_file):
    # Load the CSV into a DataFrame
    df = pd.read_csv(input_file)

    # Iterate through the rows of the DataFrame
    row = 0
    while row < len(df) - 1:
        # Replace the value in the 3rd column of the current row with the value from the next row's 3rd column
        df.iloc[row, 2] = df.iloc[row + 1, 2]  # 3rd column is index 2 in zero-indexed Pandas

        # Drop the next row
        df.drop(row + 1, inplace=True)

        # Reset the index of the DataFrame
        df.reset_index(drop=True, inplace=True)

        # Move to the next row (because the next row was deleted, we stay in the same row index)
        row += 1

    # Save the transformed DataFrame back to a new CSV file
    df.to_csv(output_file, index=False)
    print(f"Transformation complete. Output saved to {output_file}.")

# Usage example
input_csv = "beer_reviews.csv"
output_csv = 'beer_reviews_cleaned.csv'

transform_csv(input_csv, output_csv)


Transformation complete. Output saved to beer_reviews_cleaned.csv.


In [2]:
import pandas as pd

# Function to remove rows with repeated values in the 3rd column
def remove_repeated_rows(input_file, output_file):
    # Read the CSV into a DataFrame
    df = pd.read_csv(input_file)

    # Keep track of seen values in the 3rd column
    seen_reviews = set()

    # Define a function to check for duplicates in the 3rd column
    def check_for_duplicates(row):
        review = row[2]  # The 3rd column (index starts from 0, so 2 is the 3rd column)
        if review in seen_reviews:
            return False  # Mark row for removal if the review was already seen
        seen_reviews.add(review)
        return True  # Keep the row if it's not a duplicate

    # Apply the function to filter the DataFrame
    df_filtered = df[df.apply(check_for_duplicates, axis=1)]

    # Save the filtered DataFrame to a new CSV
    df_filtered.to_csv(output_file, index=False)
    print(f"Filtered CSV saved to {output_file}")

# Example usage
input_csv = 'beer_reviews_cleaned.csv'  # Replace with the actual input CSV file path
output_csv = 'filtered_beer_reviews.csv'  # Replace with the desired output CSV file path
remove_repeated_rows(input_csv, output_csv)


Filtered CSV saved to filtered_beer_reviews.csv


  review = row[2]  # The 3rd column (index starts from 0, so 2 is the 3rd column)


### Translates all reviews to English

In [None]:
!pip install deep-translator

import pandas as pd
from deep_translator import GoogleTranslator

def translate_text(text):
    try:
        # Attempt translation using deep-translator
        translated = GoogleTranslator(source='auto', target='en').translate(text)
        if translated is None:  # Handle cases where the result is None
            raise ValueError("Translation API returned None")
        return translated
    except Exception as e:
        # Log the error and return the original text
        print(f"Error translating text: {e}, returning original text: {text}")
        return text  # Return the original text if translation fails

def translate_reviews(file_path, output_file):
    df = pd.read_csv(file_path)

    # Apply translation to the 'Review' column
    df['Review'] = df['Review'].apply(lambda x: translate_text(str(x)))

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_file, index=False)
    print("Translation completed and saved to", output_file)

# Usage
translate_reviews('filtered_beer_reviews.csv', 'translated_reviews.csv')






Error translating text: There are not hundreds of beers on my bucketlist anymore, throughout the years I have had the pleasure of tasting many of the world's great beers and I am grateful for each and every one of them, but if there is one beer I have been eager to sip for nearly a decade now, it must be this one: Toppling Goliath's Kentucky Brunch Brand Stout, or KBBS as it is sometimes abbreviated.  A coffee stout aged for over a year in bourbon barrels, it was first released in 2012, but the story goes that at that time, only six people showed up - after which founder Carl Lewey stopped selling it and kept the rest of the batch for private consumption.  I guess he must have known already back then that he was onto something, as the buzz around this beer began to grow and it quickly became one of those modern whales every beer lover in the world wants to taste at least once.  In 2015, KBBS reached number one in the list of top beers on this very website, stoking the hype even further

# PART B

### Calculating frequencies to determine attributes

In [9]:
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

def calculate_word_frequencies(file_path):
    df = pd.read_csv(file_path)
    stop_words = set(stopwords.words('english'))

    words = []
    for review in df['Review']:
        # Split into words and filter out stopwords and non-alphabetic terms
        words.extend([word.lower() for word in review.split() if word.lower() not in stop_words and word.isalpha()])

    word_counts = Counter(words)
    word_freq_df = pd.DataFrame(word_counts.items(), columns=['Word', 'Frequency'])
    word_freq_df = word_freq_df.sort_values(by='Frequency', ascending=False)

    # Save to a CSV file
    word_freq_df.to_csv('word_freq_beer.csv', index=False)
    print("Word frequencies saved to word_freq_beer.csv.")

# Assuming the file is stored locally as 'filtered_beer_reviews.csv'
calculate_word_frequencies('translated_reviews.csv')
attribute_freq = pd.read_csv("word_freq_beer.csv")
attribute_freq[:10]


Word frequencies saved to word_freq_beer.csv.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krgod\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Word,Frequency
0,dark,1013
1,black,622
2,aroma,608
3,brown,515
4,taste,514
5,roasted,487
6,bottle,473
7,sweet,454
8,pours,388
9,chocolate,380


### Calculate lift values to ensure attributes occur together (they do)

In [11]:
import pandas as pd
import itertools

def calculate_lift_matrix(file_path, attributes, window_size=7):
    df = pd.read_csv(file_path)
    num_reviews = len(df)

    # Prepare to count the occurrences and co-occurrences
    occurrences = {attr: 0 for attr in attributes}
    # Initialize co_occurrences for all possible pairs, including pairs with the same elements
    co_occurrences = {tuple(sorted(pair)): 0 for pair in itertools.product(attributes, repeat=2)}

    # Analyze each review
    for review in df['Review']:
        words = review.lower().split()
        seen_words = set()
        seen_pairs = set()

        # Find occurrences within the allowed window size
        for i, word in enumerate(words):
            if word in attributes:
                seen_words.add(word)  # Count each word only once per review
                for j in range(max(0, i - window_size), min(len(words), i + window_size + 1)):
                    if words[j] in attributes and j != i:
                        pair = tuple(sorted((word, words[j])))
                        seen_pairs.add(pair)  # Count each pair only once per review

        # Update global counts
        for word in seen_words:
            occurrences[word] += 1
        for pair in seen_pairs:
            co_occurrences[pair] += 1

    # Calculate lift values and create symmetric matrix
    lift_matrix = pd.DataFrame(index=attributes, columns=attributes, dtype=float)
    for (attr1, attr2), count in co_occurrences.items():
        if attr1 == attr2:
            lift_matrix.at[attr1, attr2] = 1.0
        else:
            lift = (num_reviews * count) / (occurrences[attr1] * occurrences[attr2]) if occurrences[attr1] and occurrences[attr2] else 0
            lift_matrix.at[attr1, attr2] = lift
            lift_matrix.at[attr2, attr1] = lift  # Symmetric entry

    lift_matrix.to_csv('lift_matrix_beer.csv')
    print("Lift matrix saved to 'lift_matrix_beer.csv'.")
    print(lift_matrix)

# Example usage with your specified attributes
attributes = ['dark', 'black', 'brown', 'roasted', 'sweet', 'chocolate']
calculate_lift_matrix('translated_reviews.csv', attributes)


Lift matrix saved to 'lift_matrix_beer.csv'.
               dark     black     brown   roasted     sweet  chocolate
dark       1.000000  0.477953  1.221880  0.913736  0.501171   0.947749
black      0.477953  1.000000  1.156222  0.431025  0.183159   0.228792
brown      1.221880  1.156222  1.000000  0.545946  0.323898   0.465899
roasted    0.913736  0.431025  0.545946  1.000000  0.547378   0.808075
sweet      0.501171  0.183159  0.323898  0.547378  1.000000   0.810198
chocolate  0.947749  0.228792  0.465899  0.808075  0.810198   1.000000


Based on the results from Part B, we will be using Dark, Roasted, and Chocolate as the 3 attributes for our analysis. Of elligible beer attributes, these words were in the top 6 most frequent in the corpus (we ruled out "aroma", "taste", "bottle", and "pours" as they are not attributes), and they all have lift values of 0.81 or greater when compared with one another meaning they're likely attributes that go together and are not diametrically opposed.

# PART C

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(reviews_file, output_file):
    # Define attributes and create DataFrame
    attributes = ['Dark', 'Roasted', 'Chocolate']
    attributes_df = pd.DataFrame(columns=attributes, data=[{attr: attr for attr in attributes}])
    attributes_df.to_csv('attributes.csv', index=False, header=False)

    # Load reviews
    reviews_df = pd.read_csv(reviews_file)

    # Prepare text data: attributes as a single string, reviews as a list of strings
    attributes_text = ' '.join(attributes_df.columns)  # Combining all attribute names into one string
    reviews = reviews_df['Review'].tolist()

    # Combine attributes and reviews for vectorization
    combined_text = [attributes_text] + reviews

    # Vectorize text data
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(combined_text)

    # Calculate cosine similarity
    similarity_scores = cosine_similarity(vectors[0:1], vectors[1:]).flatten()

    # Create a result DataFrame
    result_df = pd.DataFrame({
        'Product Name': reviews_df['Product Name'],
        'product_review': reviews_df['Review'],
        'similarity_score': similarity_scores
    })

    # Save to CSV
    result_df.to_csv(output_file, index=False)
    print("Similarity results saved to", output_file)

    return result_df

# Paths to your files
reviews_file = 'translated_reviews.csv'
output_file = 'similarity_results.csv'

# Execute the function
calculate_similarity(reviews_file, output_file)


# PART D

In [1]:
!pip install nltk
import nltk
nltk.download('vader_lexicon')


from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd
def perform_sentiment_analysis(reviews_file, output_file):
    reviews_df = pd.read_csv(reviews_file)
    vader_analyzer = SentimentIntensityAnalyzer()
    sentiment_results = []
    for idx, row in reviews_df.iterrows():
        review = row['Review']
        sentiment_score = vader_analyzer.polarity_scores(review)
        sentiment_results.append([row['Product Name'], review, sentiment_score['compound']])
    sentiment_df = pd.DataFrame(sentiment_results, columns=['Product Name', 'product_review', 'sentiment_score'])
    sentiment_df.to_csv(output_file, index=False)
    return sentiment_df
reviews_file = 'translated_reviews.csv'
perform_sentiment_analysis(reviews_file, 'sentiment_results.csv')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\krgod\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,Product Name,product_review,sentiment_score
0,Toppling Goliath Kentucky Brunch,"You need personal informations from companies,...",0.9442
1,Toppling Goliath Kentucky Brunch,"Bottle after MBCC 2024. Black colour, malty ar...",0.7955
2,Toppling Goliath Kentucky Brunch,Thank you for sharing this Chris - Black with ...,0.9042
3,Toppling Goliath Kentucky Brunch,"Boxed beer at home, proper glassware. Pitch bl...",0.9423
4,Toppling Goliath Kentucky Brunch,"From backlog. (As 2018 Vintage) 0,3 litre Bott...",0.7845
...,...,...,...
1493,Stone Imperial Russian Stout,"Bottled, 355ml 2010 Vintage given as a Birthda...",0.3167
1494,Stone Imperial Russian Stout,"Bomber, 2014 vintage. Rich roasted malt aroma ...",0.8689
1495,Stone Imperial Russian Stout,Pours inky black with a small white ring. Aro...,0.9100
1496,Stone Imperial Russian Stout,Superlative. \n\nJust what you expect for the...,0.8271


# PART E


In [3]:
import pandas as pd
def create_aggregated_evaluation_score(similarity_file, sentiment_file, output_file):
 
    similarity_df = pd.read_csv(similarity_file)
    sentiment_df = pd.read_csv(sentiment_file)


    combined_df = pd.merge(similarity_df, sentiment_df, on=['Product Name', 'product_review'])

    #create evaluation score by multiplying similarity and sentiment score
    combined_df['evaluation_score'] = combined_df['similarity_score'] * combined_df['sentiment_score']

    # Group by product_name and calculate the average evaluation score for each product
    aggregated_df = combined_df.groupby('Product Name').agg(
        avg_evaluation_score=('evaluation_score', 'mean')
    ).reset_index()

  
    aggregated_df = aggregated_df.sort_values(by='avg_evaluation_score', ascending=False)

  
    aggregated_df.to_csv(output_file, index=False)
    print(f"Aggregated evaluation results saved to {output_file}")

    
    top_3_recommendations = aggregated_df[['Product Name', 'avg_evaluation_score']].head(3)
    top_3_products = aggregated_df['Product Name'].tolist() #helps with Part F
    
    
    print("Top 3 product recommendations based on average evaluation score:")
    print(top_3_recommendations)

    return top_3_recommendations


similarity_file = 'similarity_results.csv'
sentiment_file = 'sentiment_results.csv'
output_file = 'aggregated_evaluation_results.csv'


top_3_products = create_aggregated_evaluation_score(similarity_file, sentiment_file, output_file)



Aggregated evaluation results saved to aggregated_evaluation_results.csv
Top 3 product recommendations based on average evaluation score:
                                    Product Name  avg_evaluation_score
33  Old Chimneys Good King Henry Special Reserve              0.141847
7                         Bells Expedition Stout              0.137780
44                     Toppling Goliath Assassin              0.127725


Top 3 Product Recommendations: 
1. Old Chimneys Good King Henry Special Reserve 
2. Bells Expedition Stout
3. Toppling Goliath Assassin 

# Part F

In [21]:
!pip install spacy --quiet
!python -m spacy download en_core_web_md --quiet
import spacy
import pandas as pd
import numpy as np

#load-in reviews file (built so that we can run this block without running the above blocks)
reviews_file = 'translated_reviews.csv'
reviews_df = pd.read_csv(reviews_file)

#load spacy model
nlp = spacy.load("en_core_web_md")

#Kept attributes consistent with above blocks
attributes_of_interest = ['chocolate', 'roasted', 'dark']

# Generate word vectors for each attribute using SpaCy
attribute_docs = {attr: nlp(attr) for attr in attributes_of_interest}

# Compute the mean vector representation of the attribute vectors
mean_attribute_vector = np.mean([doc.vector for doc in attribute_docs.values()], axis=0)
mean_attribute_doc = nlp(" ".join(attributes_of_interest))  # Create a doc object for mean vector

# get word vectors for reviews
reviews_df['Review_Vector'] = reviews_df['Review'].apply(lambda review: nlp(review))

# Calculate the mean vector representation for each product based on its reviews
product_mean_vectors = reviews_df.groupby('Product Name')['Review_Vector'].apply(
    lambda docs: nlp(" ".join([doc.text for doc in docs]))  # Create a combined doc for product reviews
).reset_index()

#calculates similarity based on Barua code.
product_mean_vectors['Similarity'] = product_mean_vectors['Review_Vector'].apply(
    lambda doc: doc.similarity(mean_attribute_doc)
)

#sort and select top 3 products
top_3_products = product_mean_vectors.sort_values(by='Similarity', ascending=False).head(3)['Product Name'].tolist()

# Filter the reviews DataFrame to include only reviews from the top 3 products
top_beer_reviews = reviews_df[reviews_df['Product Name'].isin(top_3_products)]

# Calculate the percentage of reviews mentioning each attribute for the top 3 beers
attribute_analysis = []
for rank, product in enumerate(top_3_products, start=1):
    product_reviews = top_beer_reviews[top_beer_reviews['Product Name'] == product]

    # Calculate mentions for each attribute
    attribute_counts = {attr: 0 for attr in attributes_of_interest}
    for review in product_reviews['Review']:
        review_doc = nlp(review)
        review_tokens = set([token.lemma_ for token in review_doc])
        for attr in attributes_of_interest:
            if attr in review_tokens:
                attribute_counts[attr] += 1

    #calc percentages and format
    total_reviews = len(product_reviews)
    if total_reviews == 0:
        continue
    attribute_percentages = {attr: f"{(count / total_reviews) * 100:.2f}%" for attr, count in attribute_counts.items()}

    # Store results for this product
    attribute_analysis.append({
        'Rank': rank,
        'Product Name': product,
        'Attribute Similarity': product_mean_vectors[product_mean_vectors['Product Name'] == product]['Similarity'].values[0],
        **attribute_percentages,
    })

#dataframe and print results (DF makes for easier/cleaner display)
attribute_df = pd.DataFrame(attribute_analysis).set_index('Rank').sort_index()
print("Top 3 Beers Analysis:")
print(attribute_df)




[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')




Top 3 Beers Analysis:
                                           Product Name  Attribute Similarity  \
Rank                                                                            
1                                     Anchorage Blessed              0.569042   
2                            Modern Times Monster Tones              0.559058   
3     Alesmith Speedway Stout Barrel Aged Vietnamese...              0.558306   

     chocolate roasted    dark  
Rank                            
1       73.33%  20.00%  40.00%  
2       73.33%  33.33%  50.00%  
3       73.33%  26.67%  50.00%  


The spaCy word vector package changes the recommendations based on the attributes that the user is interested in. The above code block is calculating the vectors of all the reviews for each beer, and then it calculates the average vector for each beer. It then calculates the vector of the three attributes that the user is interested in. It then calculates the beers that have a vector most similar to the attribute vector. This methodology is great for whoever is interested in these reviews because it takes a more in-depth look into the three attributes by calculating the percent of reviews that one attribute is mentioned. If someone is really more interested in the 'roasted', he/she may choose the Modern Times over the Anchorage Blessed since a higher percentage of the reviews mention chocolate. 
We did observe some differences between the BoW and spaCy similarity. We got different results with each of the methodologies. There is no sentiment analysis in our word vector model, whereas that is a part of our BoW model. This is leading to some differences in our results from the two models. 

# Part G

In [7]:
import pandas as pd
import numpy as np
import spacy

#Loads reviews file
reviews_file = 'translated_reviews.csv'
reviews_df = pd.read_csv(reviews_file)

# Load spaCy model
nlp = spacy.load("en_core_web_md")

# Calculate the average rating for each product
reviews_df['Rating'] = reviews_df['Rating'].astype(float)  # Ensure the rating is numeric
average_rating_df = reviews_df.groupby('Product Name').agg(
    avg_rating=('Rating', 'mean'),
    total_reviews=('Rating', 'count')
).reset_index()

# Sort the beers by overall average rating and pull in the top3. 
top_rated_beers = average_rating_df.sort_values(by='avg_rating', ascending=False).head(3)['Product Name'].tolist()

#Same attributes of interest from the other sections to keep it consistent
attributes_of_interest = ['chocolate', 'roasted', 'dark']

#We only want reviews of the top3 beers
top_beer_reviews = reviews_df[reviews_df['Product Name'].isin(top_rated_beers)]

# Generate word vectors for each review
top_beer_reviews['Review_Vector'] = top_beer_reviews['Review'].apply(lambda review: nlp(review).vector)

# Analyze the frequency of specified attributes in the top-rated beer reviews
attribute_analysis = []
for rank, product in enumerate(top_rated_beers, start=1):
    product_reviews = top_beer_reviews[top_beer_reviews['Product Name'] == product]
    
    # Calculate mentions for each attribute
    attribute_counts = {attr: 0 for attr in attributes_of_interest}
    for review in product_reviews['Review']:
        review_doc = nlp(review)
        review_tokens = set([token.lemma_ for token in review_doc])
        for attr in attributes_of_interest:
            if attr in review_tokens:
                attribute_counts[attr] += 1

    # Calculate the percentage for each attribute and format to two decimal places
    total_reviews = len(product_reviews)
    if total_reviews == 0:
        continue
    attribute_percentages = {attr: f"{(count / total_reviews) * 100:.2f}%" for attr, count in attribute_counts.items()}

    # Store results for this product
    attribute_analysis.append({
        'Rank': rank,
        'Product Name': product,
        'Avg Rating': round(average_rating_df[average_rating_df['Product Name'] == product]['avg_rating'].values[0], 2),
        **attribute_percentages,
    })

# convert to a dataframe so that is it easier to display
attribute_df = pd.DataFrame(attribute_analysis).set_index('Rank').sort_index()

#Print statements
print("Attribute Frequency Analysis for Top-Rated Beers:")
print(attribute_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_beer_reviews['Review_Vector'] = top_beer_reviews['Review'].apply(lambda review: nlp(review).vector)


Attribute Frequency Analysis for Top-Rated Beers:
                          Product Name  Avg Rating chocolate roasted    dark
Rank                                                                        
1     Toppling Goliath Kentucky Brunch        4.57    60.00%  36.67%  43.33%
2        Perennial Abraxas Barrel Aged        4.42    50.00%  16.67%  43.33%
3                      Westvleteren 12        4.41    31.03%   3.45%  34.48%


If we simply ignored the similarity and feature sentiment scores and chose the 3 highest rated products, we wouldn't be offering any customization to the person viewing the recommendations. We would simply be offering the same 3 products every time, regardless of what the user's desired attributes are. The overall ratings will not change, as it is simply the average score of all the reviews for that particular product. The only way the overall rating for a product can change is a new review pops up for it. Similarly, the sentiment score of a review will not change, since the sentiment analyzer will return a number between 0 and 1 for the entire review (not just parts of it). However, the similarity score is where the recommendations will start to differ, because the similarity score will change as the user changes their attributes. If the user wants 'chocolate' as one of his/her attributes, he/she will get different similarity scores than another user who may choose 'vanilla' as his/her attribute of choice. The flexibility of the similarity score is what makes our review system more personalized to what the user wants. 
As we can see, the 3 highest rated beers aren't necessarily what our users are looking for. For example, the Westvleteren 12 beer only has 'roasted' mentioned in about 3% of the reviews, and 'chocolate' is mentioned way less than our results in Part F. A user looking for specific attributes in reviews will prefer our customized system, as they will get beers that align with their tastes, rather than the three generic beers each time. 

# Part H

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

#Get translated review list and pull 10 beers and all their reviews
ten_beer_df = pd.read_csv('translated_reviews.csv')
top_10_beers = ten_beer_df['Product Name'].value_counts().head(10).index
ten_beer_review = ten_beer_df[ten_beer_df['Product Name'].isin(top_10_beers)][['Product Name','Rating','Review']]

#Concatendated all the reviews for a particular beer together since it is easier/faster to compare with one giant review rather than many individual reviews
top_10_reviews = ten_beer_review.groupby('Product Name')['Review'].apply(' '.join)

#print(top_10_beers) #turned off, used for debugging code

#We chose 'Anchorage Blessed' as our beer that we want to find similarity for
selected_beer = 'Anchorage Blessed'

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(top_10_reviews)

#Calculates cosine similarity for the selected beer against the others
similarity_matrix = cosine_similarity(tfidf_matrix)

#index of selected beer in our matrix
selected_beer_index = list(top_10_reviews.index).index(selected_beer)

#Get similarity scores for the selected beer
similarity_scores = similarity_matrix[selected_beer_index]

#removes the selected beer from the list so that we aren't comparing with itself. 
similarity_scores[selected_beer_index] = -1

# Find the most similar beer (max of the similarity scores for other beers)
most_similar_beer_index = similarity_scores.argmax()
most_similar_beer = top_10_reviews.index[most_similar_beer_index]

#results and print statements
result = f"The beer that is most similar to {selected_beer} is {most_similar_beer}."
print(result)

We chose the 10 beers at random from our beer list. The next step was getting a dataframe that only had the top 10 beers and their reviews, so that the most simlar beer would only be selected from one of the other 9 beers. We joined all the reviews for the beers so that we ended up with one big review for each of the 10 beers, so that we wouldn't be comparing inidividual reviews for each beer. We then converted the data into a TF-IDF matrix, and then calculated a cosine similarity for all the beers. We did not include sentiment in our analysis, as for this part we do not care for the sentiment, we simply want to find the most similar beer. Both beers may be horrible, or one may be good and one bad, but the aim was to find what was most similar. We found similarity scores for all the beers in the top 10 list. Before printing out the most similar beer, we had to make sure to remove the selected beer from the list, or else we would just return the same beer as the most similar beer. 