In [16]:
import pandas as pd
import numpy as np
import stanza
from spellchecker import SpellChecker
from IPython.display import display, clear_output


In [2]:
# Load the tips dataset from a CSV file located in the working directory.
tip_df = pd.read_csv("tips_for_important_businesses.csv")

In [4]:
# Discard the '_id' column; the remaining columns are kept unchanged.
tip_df = tip_df.drop(columns=['_id'])

In [5]:
# Preview the first rows to check the columns that remain after the drop.
tip_df.head()

Unnamed: 0,business_id,date,compliment_count,text
0,Cjwb7VQGP0u2eWYj1wnO3g,2014-06-25 21:32:46,0,great rooftop
1,1N9WC6FwF8MSc7BHbIP35A,2016-12-07 22:08:12,0,great food lovely service! had the lentil sou...
2,_A2YtPqLxss0uLhRey5yCw,2014-09-18 00:21:05,0,great variety of food
3,qY-BUQY-SFBaSrFHowF3nA,2016-01-26 18:11:02,0,great meal i really like their hot pot
4,y6-J_UjNk69VNLb39c_5CA,2011-04-12 21:55:52,0,champagne toast cheese oysters.


In [8]:
# Parse the 'date' strings into timestamps. The column is assigned by name so
# it is replaced outright and ends up with a datetime64 dtype; writing through
# `tip_df.loc[:, 'date'] = ...` sets the values into the existing object column
# in place on pandas >= 2.0, which leaves the dtype as object.
tip_df['date'] = pd.to_datetime(tip_df['date'])

In [10]:
# Create one shared pyspellchecker SpellChecker (default arguments) that
# correct_spelling below uses for every word lookup.
spell = SpellChecker()
 
# Define a user-defined function (UDF) to correct the spelling
def correct_spelling(text, checker=None):
    """Return ``text`` with each whitespace-separated word spell-corrected.

    Trailing periods are split off before the lookup and re-attached unchanged
    afterwards, so the checker only ever sees the bare word and the original
    punctuation is neither lost nor duplicated. Words for which the checker
    has no suggestion (``correction`` returns ``None``) are kept as written.

    Args:
        text: The text to correct. Anything that is not a ``str`` (``None``,
            NaN from pandas, ...) is treated as missing.
        checker: Optional object exposing ``correction(word)``. Defaults to
            the module-level ``spell`` instance.

    Returns:
        The corrected words joined by single spaces, or ``''`` when ``text``
        is missing.
    """
    if not isinstance(text, str):
        return ''
    if checker is None:
        checker = spell

    corrected_words = []
    for word in text.split():
        core = word.rstrip('.')
        trailing = word[len(core):]
        if not core:
            # The token consists only of periods; there is nothing to look up.
            corrected_words.append(word)
            continue
        # Look the word up once and reuse the answer.
        suggestion = checker.correction(core)
        if suggestion is None:
            suggestion = core
        corrected_words.append(suggestion + trailing)
    return ' '.join(corrected_words)

In [11]:
# Remove rows with a missing 'text' value, modifying tip_df in place, before the
# per-row text processing below.
tip_df.dropna(subset=['text'], inplace=True)

In [12]:
# Spell-correct every tip and keep the result next to the original text.
tip_df['corrected_text'] = tip_df['text'].apply(correct_spelling)

In [17]:
# Total number of tips currently in the frame.
number_of_records = len(tip_df)

# English Stanza pipeline providing the tokens, POS tags and sentence-level
# sentiment that the noun-sentiment extraction relies on.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,sentiment',
    use_gpu=True,
)

def get_noun_sentiment(review, record_count):
    """Map each noun in ``review`` to the sentiment of the sentence containing it.

    The review is run through the module-level ``nlp`` pipeline. Every word
    tagged ``NOUN`` is stored under its surface text with the ``sentiment``
    value of its sentence; when the same noun occurs in several sentences the
    value from the last one wins.

    Args:
        review: Text passed to ``nlp``.
        record_count: Mutable two-item sequence ``[processed, total]``.
            ``record_count[0]`` is incremented by one on every call and both
            items are shown in the progress message.

    Returns:
        dict mapping noun text to the sentence sentiment value.
    """
    noun_sentiment_map = {}
    for sentence in nlp(review).sentences:
        sentiment = sentence.sentiment
        for word in sentence.words:
            # Compare against the exact tag: a prefix test on 'N' also accepts
            # the universal tag 'NUM', which recorded numerals as nouns.
            # NOTE(review): proper nouns ('PROPN') are still excluded, as
            # before -- confirm whether they should be included.
            if word.pos == 'NOUN':
                noun_sentiment_map[word.text] = sentiment
    record_count[0] += 1
    # Replace the previous progress message instead of stacking a new line
    # under it for every record.
    clear_output(wait=True)
    display(f'Processed {record_count[0]} out of {record_count[1]}.')
    return noun_sentiment_map


2023-06-27 18:45:54 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-27 18:45:54 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| sentiment | sstplus  |

2023-06-27 18:45:54 INFO: Using device: cpu
2023-06-27 18:45:54 INFO: Loading: tokenize
2023-06-27 18:45:54 INFO: Loading: pos
2023-06-27 18:45:55 INFO: Loading: sentiment
2023-06-27 18:45:55 INFO: Done loading processors!


In [29]:
# Count the corrected tips belonging to one specific business.
tip_df.loc[tip_df['business_id'] == "Rt4xYQBWC8i2xqLp9dP7XQ", 'corrected_text'].shape

(10,)

In [37]:

# [processed, total] counter that get_noun_sentiment updates on every call.
record_count = [0, len(tip_df)]
tip_df['noun_sentiment'] = tip_df['corrected_text'].apply(
    get_noun_sentiment,
    args=(record_count,),
)


'Processed 3251 out of 3251.'

In [38]:
def write_file_to_disk(df, filename):
    """Save ``df`` as a CSV file at ``filename`` without writing the index."""
    csv_options = {"index": False}
    df.to_csv(filename, **csv_options)

In [39]:
# Persist the tips table, including the derived columns, to CSV.
write_file_to_disk(tip_df, 'tip_sentiment_extracted.csv')
