In [2]:
import streamlit as st
import pandas as pd
import numpy as np
import re
import json
import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

import networkx as nx

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

sys.path.append(os.path.abspath(os.path.join('..')))
from src import plots
from src import ml_processing

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jobandtalent/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


### Load data

In [3]:
def loadData(uploaded_file):
    """Read a CSV into a DataFrame.

    Args:
        uploaded_file: Anything ``pd.read_csv`` accepts (path or file-like
            object), or None.

    Returns:
        The parsed DataFrame, or None when ``uploaded_file`` is None.
    """
    # Guard clause: nothing was provided, so there is nothing to read
    if uploaded_file is None:
        return None
    return pd.read_csv(uploaded_file)

def extractPrefix(file_name):
    """Return the part of ``file_name`` that precedes the first "_ml".

    When "_ml" does not occur, the whole name is returned unchanged.
    """
    prefix, _separator, _rest = file_name.partition('_ml')
    return prefix

def loadJson(file_path):
    """Load a JSON document from disk.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content (e.g. a dict for a JSON object).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # encoding, which would garble non-ASCII text on some systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def reFormatEmbeddings(embedding_str):
    """Convert a stringified embedding into a float32 NumPy array.

    The expected text is a whitespace-separated list of numbers that may be
    wrapped in square brackets and broken over several lines, e.g.
    ``"[ 9.47e-05 -3.84e-01\\n 1.0 ]"``.

    Args:
        embedding_str: The embedding as text. Any non-string value (for
            example NaN for a missing embedding, or an array that has already
            been converted) is returned unchanged.

    Returns:
        A 1-D ``np.float32`` array for string input; otherwise the input itself.

    Raises:
        ValueError: If a token of the string cannot be parsed as a float.
    """
    # Missing or already-converted values cannot be regex-cleaned; hand them back as-is
    if not isinstance(embedding_str, str):
        return embedding_str

    # Drop brackets and line breaks so only whitespace-separated numbers remain
    cleaned_str = re.sub(r'[\[\]\n]', '', embedding_str)
    return np.array([float(token) for token in cleaned_str.split()], dtype=np.float32)

# Folder holding the processed reviews CSV and the generated insight/sample files
processed_path = '../data/processed/'
# Folder holding the raw per-place files
raw_path = '../data/raw/'

In [4]:
uploaded_file = 'hd_ml_processed_reviews.csv'

## Load all necessary data
# Load reviews data and extract place from the file name
reviews = loadData(os.path.join(processed_path, uploaded_file))
if 'embedding' in reviews.columns:
    # Convert embeddings from their string form to float32 NumPy arrays
    reviews['embedding'] = reviews['embedding'].apply(reFormatEmbeddings)

file_name = uploaded_file
place = extractPrefix(file_name)

# Paths for the JSON and additional CSV files
general_insights_file = os.path.join(processed_path, f"{place}_general_insights.json")
worst_periods_file = os.path.join(processed_path, f"{place}_worst_periods_insights.json")
sample_reviews_file = os.path.join(processed_path, f"{place}_sample_selected_reviews.csv")
resume_file = os.path.join(raw_path, f"resumme_{place}.csv")

# The companion files are optional. Each name is reset to None when its file is
# missing, so re-running this cell for a different place can never silently
# reuse data left in the kernel by a previous run.

# Load "place"_general_insights.json into a dictionary
general_insights = loadJson(general_insights_file) if os.path.exists(general_insights_file) else None

# Load "place"_worst_periods_insights.json into a dictionary
worst_periods_insights = loadJson(worst_periods_file) if os.path.exists(worst_periods_file) else None

# Load "place"_sample_selected_reviews.csv into a DataFrame
sample_reviews = pd.read_csv(sample_reviews_file) if os.path.exists(sample_reviews_file) else None

# Load resumme_"place".csv from ./data/raw into a DataFrame
resume = pd.read_csv(resume_file) if os.path.exists(resume_file) else None

### Dev

In [5]:
display(sample_reviews.sample(3))

# Columns kept for the review tables (in display order) and their display names
review_display_columns = {
    'date': 'Date',
    'rating_score': 'Rating',
    'review': 'Review',
    'food_score': 'Food',
    'service_score': 'Service',
    'atmosphere_score': 'Ambient',
    'meal_type': 'Meal',
}


def formatSampleReviews(sample_reviews, sample_type):
    """Return the rows of one sample type with display-friendly column names.

    Args:
        sample_reviews: DataFrame with a 'sample_type' column plus every key
            of ``review_display_columns``.
        sample_type: Value of 'sample_type' to keep.

    Returns:
        A new DataFrame (the input is not modified) restricted to the display
        columns and renamed according to ``review_display_columns``.
    """
    is_requested_type = sample_reviews['sample_type'] == sample_type
    selected = sample_reviews.loc[is_requested_type, list(review_display_columns)]
    return selected.rename(columns=review_display_columns)


# best_reviews
best_reviews = formatSampleReviews(sample_reviews, 'best_reviews_sample')

# worst_reviews
worst_reviews = formatSampleReviews(sample_reviews, 'worst_reviews_sample')

Unnamed: 0,review_id,review,local_guide_reviews,rating_score,service,meal_type,price_per_person_category,food_score,service_score,atmosphere_score,...,cleaned_review,vader_sentiment,sentiment_label,embedding,pca_cluster,umap_cluster,month,year,total_score,sample_type
18,4.0,"Un gofre frío y tieso 9,20€? Nunca me había se...",58.0,1.0,,,,1.0,1.0,2.0,...,gofre frío tieso 920 nunca tanto estafado 3 ba...,0.0,negative,[ 9.47576336e-05 -3.84988636e-01 -1.52471945e-...,0.0,0.0,,,1.27,worst_reviews_sample
16,194.0,¡Solo el ambiente salva este lugar! cóctel te...,14.0,1.0,,,,0.874,0.902,0.894,...,solo ambiente salir lugar cóctel terrible serv...,-0.4767,negative,[-1.21311672e-01 -4.76346105e-01 1.00005493e-...,0.0,0.0,2023-01,2023.0,1.18,worst_reviews_sample
22,,"Ya no es lo que era, vamos q hasta he encontra...",,3.0,,,,,,,...,,,,,,,2024-03,,,low_score_reviews


In [6]:
# Inspect the dictionary loaded from "<place>_general_insights.json"
general_insights

{'best': ['Customers frequently praise the quality and taste of the burgers.',
  'Many enjoy the overall ambiance and atmosphere of the dining space.',
  'The reasonable pricing of the food is consistently noted as a positive aspect.'],
 'worst': ['Several customers have experienced delays when trying to place orders.',
  'There are reports of subpar service that detract from the dining experience.',
  'Some diners have found certain menu items to be disappointing and not worth the price.'],
 'improve': ['Streamline the ordering process to reduce waiting times for customers.',
  'Enhance staff training to improve the quality and responsiveness of customer service.',
  'Review the menu offerings to ensure all items meet customer expectations and value.']}

In [7]:
# Inspect the dictionary loaded from "<place>_worst_periods_insights.json"
worst_periods_insights

{'2023-11': {'problems': ['Customers reported that the café environment was not inviting.',
   'There seems to be a lack of urgency in addressing customer concerns during visits.'],
  'improve': ['Enhance the ambiance of the café to make it more welcoming.',
   'Implement a training program focused on timely customer service response.']},
 '2024-02': {'problems': ['Service was described as poor, leading to negative customer experiences.',
   'Customers felt that their orders were not managed properly.'],
  'improve': ['Conduct regular service quality assessments to identify areas needing improvement.',
   'Increase staff training on order management and customer interaction.']},
 '2024-03': {'problems': ['High noise levels were reported, making the atmosphere uncomfortable.',
   'Customers expressed dissatisfaction with the consistency of their experiences.'],
  'improve': ['Consider soundproofing measures or altering the layout to reduce noise.',
   'Standardize service procedures to 

In [8]:
# Show three randomly drawn rows of the sample reviews table (unseeded, so rows vary per run)
display(sample_reviews.sample(3))

Unnamed: 0,review_id,review,local_guide_reviews,rating_score,service,meal_type,price_per_person_category,food_score,service_score,atmosphere_score,...,cleaned_review,vader_sentiment,sentiment_label,embedding,pca_cluster,umap_cluster,month,year,total_score,sample_type
18,4.0,"Un gofre frío y tieso 9,20€? Nunca me había se...",58.0,1.0,,,,1.0,1.0,2.0,...,gofre frío tieso 920 nunca tanto estafado 3 ba...,0.0,negative,[ 9.47576336e-05 -3.84988636e-01 -1.52471945e-...,0.0,0.0,,,1.27,worst_reviews_sample
22,,"Ya no es lo que era, vamos q hasta he encontra...",,3.0,,,,,,,...,,,,,,,2024-03,,,low_score_reviews
23,,La comida me resultó bastante mala. Los canelo...,,1.0,,,,,,,...,,,,,,,2024-08,,,low_score_reviews


In [9]:
# Count the non-null values of every column within each sample_type group
sample_reviews.groupby('sample_type').count()

Unnamed: 0_level_0,review_id,review,local_guide_reviews,rating_score,service,meal_type,price_per_person_category,food_score,service_score,atmosphere_score,...,avg_price_per_person,cleaned_review,vader_sentiment,sentiment_label,embedding,pca_cluster,umap_cluster,month,year,total_score
sample_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
best_reviews_sample,5,5,5,5,3,3,3,5,5,5,...,3,5,5,5,5,5,5,5,5,5
low_score_reviews,0,4,0,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,0,0
recent_best_reviews,5,5,5,5,2,2,2,5,5,5,...,2,5,5,5,5,5,5,5,5,5
recent_worst_reviews,5,5,5,5,2,2,2,5,5,5,...,2,5,5,5,5,5,5,5,5,5
worst_reviews_sample,5,5,5,5,1,1,1,5,5,5,...,1,5,5,5,5,5,5,4,4,5


In [10]:
# Low-score sample rows for the 2024-08 period, restricted to the columns of interest
period_reviews = sample_reviews.loc[
    (sample_reviews['sample_type'] == 'low_score_reviews') & (sample_reviews['month'] == '2024-08'),
    ['date', 'rating_score', 'review', 'food_score', 'service_score', 'atmosphere_score', 'meal_type'],
]
period_reviews

Unnamed: 0,date,rating_score,review,food_score,service_score,atmosphere_score,meal_type
23,,1.0,La comida me resultó bastante mala. Los canelo...,,,,


### Plots

In [11]:
def _isNextPeriod(previous_period, current_period):
    """Return True when ``current_period`` directly follows ``previous_period``.

    In current pandas, subtracting two ``Period`` objects yields a DateOffset
    whose step count is exposed as ``.n`` rather than a plain integer, so
    comparing the raw difference with ``1`` never matches. Plain numeric
    differences are still compared directly.
    """
    gap = current_period - previous_period
    return getattr(gap, 'n', gap) == 1


def plotTrend(reviews, label_mapping, app=False, filter_min=None, filter_max=None):
    """Plot the monthly average of the selected score columns as line traces.

    Side effect: ``reviews['date']`` is converted to datetime (unparseable
    values become NaT) and ``reviews['month']`` is overwritten with the monthly
    period of that date, both directly on the DataFrame passed in.

    Args:
        reviews: DataFrame with a 'date' column and every column named in
            ``label_mapping``.
        label_mapping: Maps score column names to the label used for the trace
            name and hover text. Must include 'rating_score', which drives the
            annotations. The first column is drawn with a thicker line.
        app: When True the figure is returned; otherwise it is shown and
            nothing is returned.
        filter_min: Optional lower bound (inclusive) compared against 'date'.
        filter_max: Optional upper bound (inclusive) compared against 'date'.
            When both bounds are None, the year ending at the most recent
            review date is plotted.

    Returns:
        The plotly figure when ``app`` is True, otherwise None.
    """
    # Convert date column to datetime format and create additional time columns
    # (done on the caller's DataFrame, see the side effect noted in the docstring)
    reviews['date'] = pd.to_datetime(reviews['date'], errors='coerce')
    reviews['month'] = reviews['date'].dt.to_period('M')

    # Filter data for the last periods based on filter_min and filter_max
    limit_date = reviews['date'].max()
    if filter_min is None and filter_max is None:
        # If both filters are None, select data from the last year
        start_date = limit_date - pd.DateOffset(years=1)
        selected_reviews = reviews[(reviews['date'] >= start_date) & (reviews['date'] <= limit_date)]
    else:
        # Apply the filters if provided
        selected_reviews = reviews
        if filter_min is not None:
            selected_reviews = selected_reviews[selected_reviews['date'] >= filter_min]
        if filter_max is not None:
            selected_reviews = selected_reviews[selected_reviews['date'] <= filter_max]

    # Compute averages for the required periods using label_mapping keys
    columns_to_average = list(label_mapping.keys())
    monthly_avg_scores = selected_reviews.groupby('month')[columns_to_average].mean()

    # Create a figure to plot the trends
    fig = make_subplots(rows=1, cols=1)

    # One trace per score column; the palette is cycled so that more than
    # four columns no longer run past the end of the list
    colors = ['#32CD32', 'rgba(31, 119, 180, 0.8)', 'rgba(107, 174, 214, 0.8)', 'rgba(158, 202, 225, 0.8)']
    for i, column in enumerate(monthly_avg_scores.columns):
        label = label_mapping[column]
        fig.add_trace(
            go.Scatter(x=monthly_avg_scores.index.astype(str), y=monthly_avg_scores[column],
                       mode='lines+markers', name=label,
                       text=[f"{label} - {val:.2f}" for val in monthly_avg_scores[column]],
                       hoverinfo="text", line=dict(color=colors[i % len(colors)], width=3 if i == 0 else 2)),
            row=1, col=1)

    # Analyze low scores on the full data set
    # NOTE(review): the returned periods are presumably pandas Periods matching
    # the 'month' index - confirm against ml_processing.analyzeLowScores
    _, low_score_periods = ml_processing.analyzeLowScores(reviews, 'rating_score', num_periods=3)
    rating_by_month = monthly_avg_scores['rating_score']

    # The low-score periods come from the unfiltered data, so some may fall
    # outside the plotted window; looking those up would raise KeyError
    visible_low_periods = [period for period in low_score_periods if period in monthly_avg_scores.index]

    # Add annotations for low scores
    for i, period in enumerate(visible_low_periods):
        if i > 0 and _isNextPeriod(visible_low_periods[i - 1], period):
            # Contiguous with the previous low period: mention both months
            drop_text = f"Drop in {visible_low_periods[i - 1].strftime('%B')} & {period.strftime('%B')}"
        else:
            drop_text = f"Drop in {period.strftime('%B')}"
        fig.add_annotation(x=str(period), y=rating_by_month.loc[period] + 0.5,
                           text=drop_text,
                           showarrow=True, arrowhead=2, ax=0, ay=-40, row=1, col=1)

    # Add annotation for high score (idxmax raises on an empty or all-NaN
    # series, which happens when the filters select no rated reviews)
    if rating_by_month.notna().any():
        high_score_period = rating_by_month.idxmax()
        high_score_value = rating_by_month.max()
        fig.add_annotation(x=str(high_score_period), y=high_score_value - 0.3,
                           text=f"High in {high_score_period.strftime('%B')}",
                           showarrow=True, arrowhead=2, ax=0, ay=40, row=1, col=1)

    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False, title_text='Average Score')
    fig.update_layout(showlegend=False,
                    #title="Rating Trends",
                    #title_font=dict(size=28),
                    margin=dict(l=50, r=50, t=100, b=50),
                    paper_bgcolor="white",
                    height=400, width=1200)

    # Show or return the figure depending on the context
    if app:
        return fig
    else:
        fig.show()

In [12]:
# Score columns to plot, mapped to the labels used for the traces.
# Order matters: the first entry is drawn as the highlighted line.
label_mapping = dict(
    rating_score='Rating',
    food_score='Food',
    service_score='Service',
    atmosphere_score='Ambient',
)

# No date filters: show the figure inline instead of returning it
plotTrend(reviews, label_mapping, app=False)
#plotTrend(reviews, label_mapping, app=False, filter_min="2024-01-01", filter_max="2024-05-01")

In [13]:
# Run the low-score analysis on the full reviews table for 'rating_score',
# asking for 3 periods; only the second returned value is kept
_, low_score_periods = ml_processing.analyzeLowScores(reviews, 'rating_score', num_periods=3)

In [14]:
# Inspect the per-period insights again; keys are "YYYY-MM" strings
worst_periods_insights

{'2023-11': {'problems': ['Customers reported that the café environment was not inviting.',
   'There seems to be a lack of urgency in addressing customer concerns during visits.'],
  'improve': ['Enhance the ambiance of the café to make it more welcoming.',
   'Implement a training program focused on timely customer service response.']},
 '2024-02': {'problems': ['Service was described as poor, leading to negative customer experiences.',
   'Customers felt that their orders were not managed properly.'],
  'improve': ['Conduct regular service quality assessments to identify areas needing improvement.',
   'Increase staff training on order management and customer interaction.']},
 '2024-03': {'problems': ['High noise levels were reported, making the atmosphere uncomfortable.',
   'Customers expressed dissatisfaction with the consistency of their experiences.'],
  'improve': ['Consider soundproofing measures or altering the layout to reduce noise.',
   'Standardize service procedures to 

In [15]:
filter_min = pd.to_datetime('2024/01/29')
filter_max = pd.to_datetime('2024/09/29')

# Keep only the periods whose "YYYY-MM" key falls inside [filter_min, filter_max];
# when either bound is missing the insights are used unfiltered
if filter_min is None or filter_max is None:
    worst_periods_insights_filtered = worst_periods_insights
else:
    worst_periods_insights_filtered = {
        period_key: insights
        for period_key, insights in worst_periods_insights.items()
        if filter_min <= pd.to_datetime(period_key) <= filter_max
    }
worst_periods_insights_filtered

{'2024-02': {'problems': ['Service was described as poor, leading to negative customer experiences.',
   'Customers felt that their orders were not managed properly.'],
  'improve': ['Conduct regular service quality assessments to identify areas needing improvement.',
   'Increase staff training on order management and customer interaction.']},
 '2024-03': {'problems': ['High noise levels were reported, making the atmosphere uncomfortable.',
   'Customers expressed dissatisfaction with the consistency of their experiences.'],
  'improve': ['Consider soundproofing measures or altering the layout to reduce noise.',
   'Standardize service procedures to ensure a consistent experience for all customers.']},
 '2024-08': {'problems': ['Seating arrangements were found to be uncomfortable by patrons.',
   'There were mixed feelings about the overall dining experience.'],
  'improve': ['Reevaluate the dining layout and invest in comfortable seating options.',
   'Solicit regular feedback from c

In [16]:
from datetime import datetime

# Parse the "YYYY-MM" keys; the most recent period anchors the default window
dates = [datetime.strptime(period_key, '%Y-%m') for period_key in worst_periods_insights]
limit_date = max(dates)

# Explicit bounds take precedence; otherwise fall back to the year that ends at
# the most recent period. pd.to_datetime makes sure both bounds are datetime objects.
start_date = pd.to_datetime(limit_date - pd.DateOffset(years=1) if filter_min is None else filter_min)
end_date = pd.to_datetime(limit_date if filter_max is None else filter_max)

# Filter the dictionary using the date bounds
filtered_insights = {
    period_key: insights
    for period_key, insights in worst_periods_insights.items()
    if start_date <= datetime.strptime(period_key, '%Y-%m') <= end_date
}

print(start_date.strftime('%Y-%m'))
print(end_date.strftime('%Y-%m'))
print(filtered_insights)



2024-01
2024-09
{'2024-02': {'problems': ['Service was described as poor, leading to negative customer experiences.', 'Customers felt that their orders were not managed properly.'], 'improve': ['Conduct regular service quality assessments to identify areas needing improvement.', 'Increase staff training on order management and customer interaction.']}, '2024-03': {'problems': ['High noise levels were reported, making the atmosphere uncomfortable.', 'Customers expressed dissatisfaction with the consistency of their experiences.'], 'improve': ['Consider soundproofing measures or altering the layout to reduce noise.', 'Standardize service procedures to ensure a consistent experience for all customers.']}, '2024-08': {'problems': ['Seating arrangements were found to be uncomfortable by patrons.', 'There were mixed feelings about the overall dining experience.'], 'improve': ['Reevaluate the dining layout and invest in comfortable seating options.', 'Solicit regular feedback from customers t

In [17]:
# Inspect the table loaded from "resumme_<place>.csv" in the raw data folder
resume

Unnamed: 0,stars,reviews
0,5,2290
1,4,1308
2,3,396
3,2,132
4,1,128
