In [128]:
import json
import os
import time

import openai
import tiktoken

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Count the prompt tokens a list of chat messages will consume.

    Args:
        messages: iterable of dicts (e.g. ``{"role": ..., "content": ...}``)
            whose values are all strings.
        model: model name. The floating names ``gpt-3.5-turbo`` and ``gpt-4``
            are counted as their pinned snapshots after printing a warning.

    Returns:
        The per-message overhead plus the encoded length of every value,
        plus 3 tokens for the primed assistant reply.

    Raises:
        NotImplementedError: if no per-message overhead is known for ``model``.
    """
    # Resolve the tokenizer first; unknown names fall back to cl100k_base.
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Floating model names -> (pinned snapshot, warning printed before delegating).
    floating_names = {
        "gpt-3.5-turbo": (
            "gpt-3.5-turbo-0301",
            "Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.",
        ),
        "gpt-4": (
            "gpt-4-0314",
            "Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.",
        ),
    }
    if model in floating_names:
        pinned_model, warning = floating_names[model]
        print(warning)
        return num_tokens_from_messages(messages, model=pinned_model)

    # Pinned snapshot -> (tokens_per_message, tokens_per_name).
    # 0301: every message follows <|start|>{role/name}\n{content}<|end|>\n,
    # and when a name is present the role is omitted (hence -1).
    overheads = {
        "gpt-3.5-turbo-0301": (4, -1),
        "gpt-4-0314": (3, 1),
    }
    if model not in overheads:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    tokens_per_message, tokens_per_name = overheads[model]

    total = 0
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(text)) for text in message.values())
        if "name" in message:
            total += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3

class SentimentAnalyzer:
    """Few-shot aspect-based sentiment classifier backed by a chat-completion API.

    The instance keeps a conversation made of the system prompt followed by
    the (review, sentiment) example pairs added with ``addSampleData``. Each
    call to ``analyze_sentiment`` sends that conversation plus the new review.

    Attributes:
        temperature: sampling temperature passed to the API.
        model: chat model name passed to the API.
        request_delay: seconds to sleep after each successful request.
        system_message: the system prompt describing the JSON output format.
        sample_data: list of ``(review, sentiment)`` example tuples.
        conversation: message dicts sent with every request.
        token_count: token count of ``conversation`` at the last update.
    """

    def __init__(self, openai_api_key, temperature=0.5,
                 api_base="http://subhagato-desktop.local:5001/v1",
                 model="gpt-3.5-turbo", request_delay=1):
        """Configure the ``openai`` module and start an empty conversation.

        Args:
            openai_api_key: API key assigned to ``openai.api_key``.
            temperature: sampling temperature for every request.
            api_base: endpoint assigned to ``openai.api_base``; the default
                is the address this notebook originally hard-coded.
            model: chat model requested in ``analyze_sentiment``.
            request_delay: seconds to wait after each request.
        """
        # These are module-level settings: they affect every user of `openai`
        # in this kernel, not just this instance.
        openai.api_key = openai_api_key
        openai.api_base = api_base
        self.temperature = temperature
        self.model = model
        self.request_delay = request_delay
        self.system_message = '''You are a NLP model that generates Aspect Based Sentiment in the form of a JSON string with the following format: {"Food Quality" : 1, "Price": -1, "Ambiance": 1, "Service": 0, "Comfort": 1} given a review. The values will be -1 if the sentiment is negative, it will be 0 if nothing is mentioned about the topic, will be 1 if the sentiment is positive'''
        self.sample_data = []
        self.conversation = [{"role": "system", "content": self.system_message}]
        self.token_count = self.count_tokens()

    def addSampleData(self, review, sentiment):
        """Append one few-shot example (a review and its expected JSON reply)."""
        self.sample_data.append((review, sentiment))
        self.conversation.append({"role": "user", "content": review})
        self.conversation.append({"role": "assistant", "content": sentiment})
        # Keep the running token count in sync with the conversation
        self.token_count = self.count_tokens()

    def deleteSamples(self):
        """Drop all few-shot examples, leaving only the system prompt."""
        self.sample_data = []
        self.conversation = [{"role": "system", "content": self.system_message}]
        self.token_count = self.count_tokens()

    def analyze_sentiment(self, review):
        """Classify one review and return the parsed JSON reply.

        Args:
            review: the review text to classify.

        Returns:
            The object produced by ``json.loads`` on the model's reply.

        Raises:
            json.JSONDecodeError: if the reply is not valid JSON.
            Exception: any error raised by the API call is propagated.
        """
        self.conversation.append({"role": "user", "content": review})
        try:
            self.token_count = self.count_tokens()

            response = openai.ChatCompletion.create(
                model=self.model,
                messages=self.conversation,
                temperature=self.temperature
            )
            # Simple client-side throttle between consecutive requests
            time.sleep(self.request_delay)
        finally:
            # Always remove the one-off review, even when the request fails;
            # otherwise it would leak into every subsequent prompt.
            del self.conversation[-1]

        sentiment = response['choices'][0]['message']['content']
        return json.loads(sentiment)

    def count_tokens(self):
        """Return the token count of the current conversation."""
        return num_tokens_from_messages(self.conversation)

In [129]:
def num_tokens(string, model="gpt-3.5-turbo-0301"):
    """Return how many tokens `string` encodes to for the given model.

    Args:
        string: the text to tokenize.
        model: model name passed to ``tiktoken.encoding_for_model``; the
            default keeps the model this helper originally hard-coded.

    Returns:
        The length of the encoded token sequence.
    """
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(string))


In [149]:
# Read the API key from the environment so the secret is never stored in the notebook.
# NOTE: the key that used to be hard-coded here has been exposed and should be revoked.
analyzer = SentimentAnalyzer(os.environ["OPENAI_API_KEY"], temperature=0.1)

# Few-shot examples: each pairs a review with the exact JSON reply expected from the model.
analyzer.addSampleData("The chicken burger was incredibly tasty but the service was pretty slow, and it's also quite expensive.", '''{"Food Quality" : 1, "Price": -1, "Ambiance": 0, "Service": -1, "Comfort": 0}''')
analyzer.addSampleData("There was a great vibe and the ambiance was great, with loud music and it was comfortable.", '''{"Food Quality" : 0, "Price": 0, "Ambiance": 1, "Service": 0, "Comfort": 1}''')
analyzer.addSampleData("I ordered fried rice. My food tastes good but the service is not great overall. Place was cozy", '''{"Food Quality" : 1, "Price": 0, "Ambiance": 0, "Service": -1, "Comfort": 1}''')

In [150]:
# Sanity check on a single unseen review: analyze_sentiment sends it after the
# few-shot examples and returns the model's reply parsed with json.loads.
result = analyzer.analyze_sentiment('''I was hesitant to write this review because I really love this place. The service is good and friendly. The food is delicious! However, I am so annoyed right now because I ordered jambalaya and I'm so confused. Are they making this dish with no prawns or shrimp now? I mean there was literally NOT ONE PRAWN in the bowl. That's my favorite part. Smh I am not happy with them today.''')

In [151]:
# Show the parsed aspect scores for the sanity-check review
print(result)

{'Food Quality': 0, 'Price': 0, 'Ambiance': 0, 'Service': -1, 'Comfort': 0}


In [133]:
import pandas as pd
# Load the review DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df_balanced_data = pd.read_pickle('df_top_fifty_restaurant.pkl')
# Preview the first rows
df_balanced_data.head()

Unnamed: 0,business_id,stars,text
0,9kv9JpLhzkN3UeWZDrsmEA,3.0,Ate at the bar late one evening. The food was...
1,8kUh6TROemLfbVR_ewVVLg,2.0,ive gone here more than 10 times. the first f...
2,9kv9JpLhzkN3UeWZDrsmEA,3.0,"Went here Saturday for drag brunch, we had 8 p..."
3,9kv9JpLhzkN3UeWZDrsmEA,3.0,Turns out they randomly close the pool down wh...
4,9kv9JpLhzkN3UeWZDrsmEA,3.0,"Totally awesome experience. Very New Orleans ""..."


In [160]:
# Draw a random sample of 2000 reviews and renumber the rows from 0.
# The fixed random_state makes the sample (and every downstream result)
# reproducible after a kernel restart.
df_sampled = df_balanced_data.sample(n=2000, random_state=42).reset_index(drop=True)


In [162]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

# Create a single LexRank summarizer instance; summarize_text below calls it for each long review
summarizer = LexRankSummarizer()

# Define a function for summarization
def summarize_text(text, max_tokens=400):
    """Return `text`, replaced by an extractive summary when it is too long.

    Texts under `max_tokens` tokens are returned unchanged. Longer texts are
    summarized with the module-level `summarizer`, using the largest sentence
    count (at most 50) whose joined summary fits in `max_tokens` tokens.

    Args:
        text: the review text.
        max_tokens: token budget, measured with `num_tokens`.

    Returns:
        The original text or its summary. If no non-empty summary fits the
        budget, the original text is returned unchanged.
    """
    # If the text is too short, don't summarize it
    tokens = num_tokens(text)
    if tokens < max_tokens:
        print("number of token in review: " + str(tokens))
        return text

    # Create a parser and tokenizer for the text
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    document = parser.document

    # Start large, but never ask for more sentences than the document has:
    # those requests would only repeat identical summarizer runs.
    num_sentences = min(50, len(document.sentences))

    # Shrink the summary one sentence at a time until it fits the budget
    while num_sentences > 0:
        summarized = summarizer(document, num_sentences)
        # Measure the text that is actually returned, not the repr of the
        # sentence tuple (which inflates the count).
        summary_text = " ".join(str(sentence) for sentence in summarized)
        if summary_text:
            summary_tokens = num_tokens(summary_text)
            if summary_tokens <= max_tokens:
                print("number of token in summarized review: " + str(summary_tokens))
                return summary_text
        num_sentences -= 1

    # Nothing usable came out of the summarizer: fall back to the full text
    return text

# Create a new column 'summary' in df_sampled by running summarize_text on every
# review: short reviews are kept as-is, long ones are replaced by their summary
df_sampled['summary'] = df_sampled['text'].apply(summarize_text)





number of token in review: 96
number of token in summarized review: 283
number of token in review: 143
number of token in review: 104
number of token in review: 208
number of token in review: 239
number of token in review: 40
number of token in review: 275
number of token in review: 399
number of token in review: 44
number of token in review: 107
number of token in review: 58
number of token in review: 28
number of token in review: 126
number of token in review: 149
number of token in review: 221
number of token in review: 67
number of token in review: 67
number of token in review: 35
number of token in review: 297
number of token in review: 29
number of token in review: 88
number of token in review: 108
number of token in review: 43
number of token in review: 133
number of token in review: 33
number of token in review: 111
number of token in review: 141
number of token in review: 153
number of token in review: 206
number of token in review: 37
number of token in review: 377
number of 

In [163]:
# Preview the first rows to compare 'text' with the new 'summary' column
df_sampled.head()

Unnamed: 0,business_id,stars,text,summary
0,aDgughL1vDootnXe5kUWGQ,3.0,"Not authentic but good flavor, cooked well, & ...","Not authentic but good flavor, cooked well, & ..."
1,ORVWMFwF5qTWy9Cbahf0Pg,4.0,Another Brennan Family restaurant! Café Adelai...,Another Brennan Family restaurant! The Swizzle...
2,nAbese1E1H5TwZntoEyfzg,4.0,"I order Uncle Julio's a lot, for one reason. T...","I order Uncle Julio's a lot, for one reason. T..."
3,U_hgCCf3CZs1no_BYXU8zg,3.0,Red Lion is decent establishment to grab drink...,Red Lion is decent establishment to grab drink...
4,o3X9fjmFXk0chl7PcXv5uQ,5.0,Pho Thai Nan is wonderful! The Coconut Shrimp ...,Pho Thai Nan is wonderful! The Coconut Shrimp ...


In [114]:
# df_sampled['aspect_sentiment']=df_sampled['summary'].apply(lambda x:analyzer.analyze_sentiment(x))

In [164]:
# Start with an all-None result column, then fill it one review at a time
# (each iteration is one API request).
df_sampled['aspect_sentiment'] = None

for row_label, review_summary in df_sampled['summary'].items():
    df_sampled.at[row_label, 'aspect_sentiment'] = analyzer.analyze_sentiment(review_summary)

APIConnectionError: Error communicating with OpenAI: HTTPConnectionPool(host='subhagato-desktop.local', port=5001): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f2b74f58790>: Failed to establish a new connection: [Errno 111] Connection refused'))

In [157]:
# Preview the first rows together with the 'aspect_sentiment' column.
# NOTE(review): this cell's execution count (157) is lower than that of the
# analysis cell above (164), so the saved output presumably comes from an
# earlier run — re-run before relying on it.
df_sampled.head()

Unnamed: 0,business_id,stars,text,summary,aspect_sentiment
0,C26xmwM2_OgJJi2BbL3KkQ,2.0,"The tar tar was tasteless and looked awful, th...","The tar tar was tasteless and looked awful, th...","{'Food Quality': -1, 'Price': 0, 'Ambiance': 0..."
1,2WGnykxiM-Mp-qIm2u7iAw,1.0,This place is definitely overrated. Not good w...,This place is definitely overrated. Not good w...,"{'Food Quality': -1, 'Price': -1, 'Ambiance': ..."
2,dK10D96iGeHuTQzR8aG90Q,5.0,W. O. W. !!! What an amazing establishment! T...,W. O. W. !!! What an amazing establishment! T...,"{'Food Quality': 1, 'Price': 0, 'Ambiance': 1,..."
3,wr4tUYkhdlWs6HJNbiipSg,5.0,"Awesome breakfast food! Great prices, good se...","Awesome breakfast food! Great prices, good se...","{'Food Quality': 1, 'Price': 1, 'Ambiance': 1,..."
4,GmXC3CBu172Bkb0c6RnO7w,2.0,I have no idea why a lot of people saying this...,I have no idea why a lot of people saying this...,"{'Food Quality': 0, 'Price': 0, 'Ambiance': 0,..."


In [158]:
# Expand the per-review sentiment dicts into one column per aspect
for aspect in ('Food Quality', 'Price', 'Ambiance', 'Service', 'Comfort'):
    df_sampled[aspect] = pd.Series([scores[aspect] for scores in df_sampled['aspect_sentiment']])


In [159]:
# Display the frame, now with one column per aspect score
df_sampled

Unnamed: 0,business_id,stars,text,summary,aspect_sentiment,Food Quality,Price,Ambiance,Service,Comfort
0,C26xmwM2_OgJJi2BbL3KkQ,2.0,"The tar tar was tasteless and looked awful, th...","The tar tar was tasteless and looked awful, th...","{'Food Quality': -1, 'Price': 0, 'Ambiance': 0...",-1,0,0,-1.0,0
1,2WGnykxiM-Mp-qIm2u7iAw,1.0,This place is definitely overrated. Not good w...,This place is definitely overrated. Not good w...,"{'Food Quality': -1, 'Price': -1, 'Ambiance': ...",-1,-1,0,0.0,0
2,dK10D96iGeHuTQzR8aG90Q,5.0,W. O. W. !!! What an amazing establishment! T...,W. O. W. !!! What an amazing establishment! T...,"{'Food Quality': 1, 'Price': 0, 'Ambiance': 1,...",1,0,1,1.0,1
3,wr4tUYkhdlWs6HJNbiipSg,5.0,"Awesome breakfast food! Great prices, good se...","Awesome breakfast food! Great prices, good se...","{'Food Quality': 1, 'Price': 1, 'Ambiance': 1,...",1,1,1,1.0,1
4,GmXC3CBu172Bkb0c6RnO7w,2.0,I have no idea why a lot of people saying this...,I have no idea why a lot of people saying this...,"{'Food Quality': 0, 'Price': 0, 'Ambiance': 0,...",0,0,0,-1.0,0
...,...,...,...,...,...,...,...,...,...,...
95,ny1N_Py01kVoGOvL9oKjMQ,2.0,This was my first time coming for brunch on a ...,This was my first time coming for brunch on a ...,"{'Food Quality': -1, 'Price': 0, 'Ambiance': 0...",-1,0,0,0.0,0
96,B2KGQwnxjpdY4TIVqmG3XA,2.0,Clean and friendly but the food seemed half-he...,Clean and friendly but the food seemed half-he...,"{'Food Quality': 0, 'Price': 0, 'Ambiance': 0,...",0,0,0,0.0,0
97,teFjQxUqT8c-yxQdoILDVQ,3.0,"Good food, good service, loud and expensive. M...","Good food, good service, loud and expensive. M...","{'Food Quality': 1, 'Price': -1, 'Ambiance': 0...",1,-1,0,0.0,0
98,jZd462wsUBOIB60UEjUOsw,5.0,Even though this place is known for its coffee...,Even though this place is known for its coffee...,"{'Food Quality': 1, 'Price': 0, 'Ambiance': 1,...",1,0,1,0.0,1
