In [78]:
import pandas as pd
import numpy as np
import stanza
import time
from IPython.display import display, clear_output


In [2]:
# Load the review export whose `corrected_text` column feeds sentiment extraction.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
review_df = pd.read_csv(
    filepath_or_buffer='D:\\DATA603\\project\\new_MongoDB\\review\\review_corrected_text.csv',
)

In [3]:
# stanza.download('en')


In [4]:
# Build the English Stanza pipeline once; every review is pushed through this object.
# The recorded load log lists only tokenize, pos and sentiment (no mwt model was loaded)
# and reports "Using device: cpu" even though a GPU is requested here.
nlp = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,sentiment',
    use_gpu=True,
)


2023-06-26 19:07:51 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-06-26 19:07:52 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| sentiment | sstplus  |

2023-06-26 19:07:52 INFO: Using device: cpu
2023-06-26 19:07:52 INFO: Loading: tokenize
2023-06-26 19:07:52 INFO: Loading: pos
2023-06-26 19:07:52 INFO: Loading: sentiment
2023-06-26 19:07:53 INFO: Done loading processors!


In [79]:
# Retrieve the sentiment of each noun from a sentence
# Total number of rows in review_df (presumably one row per review — confirm against the CSV).
number_of_records = len(review_df)

def get_noun_sentiment(review, record_count):
    """Map every noun in ``review`` to the sentiment of the sentence it appears in.

    The text is run through the module-level Stanza pipeline ``nlp``; for each
    sentence, every word tagged as a common noun is stored with that sentence's
    ``sentence.sentiment`` value. Progress is shown in the output cell after
    every call.

    Parameters
    ----------
    review : str
        Review text to analyse. Non-string or blank values (for example the NaN
        pandas produces for an empty CSV cell) are not sent to the pipeline and
        yield an empty mapping, so one bad row cannot abort a long batch.
    record_count : list
        Mutable two-element progress counter ``[processed, total]``.
        ``record_count[0]`` is incremented in place on every call.

    Returns
    -------
    dict
        ``{noun_text: sentence.sentiment}``. If the same noun text occurs in
        several sentences, the value from the last such sentence is kept.
    """
    noun_sentiment_map = {}
    if isinstance(review, str) and review.strip():
        for sentence in nlp(review).sentences:
            sentiment = sentence.sentiment
            for word in sentence.words:
                # `word.pos` holds the universal POS tag. Compare exactly:
                # startswith('N') also matched 'NUM', so numerals were being
                # recorded as nouns. ('PROPN' was never matched and still is not.)
                if word.pos == 'NOUN':
                    noun_sentiment_map[word.text] = sentiment
    record_count[0] += 1
    clear_output(wait=True)  # Replace the previous progress line instead of appending
    display(f'Processed {record_count[0]} out of {record_count[1]}.')
    return noun_sentiment_map


1. Sentiment extraction from the reviews is taking a very long time.
2. The spelling checker took 6 hours to run.
3. So I need to batch the sentiment extraction.
   
   1. Write the corrected_text table to a CSV file.
   2. Run sentiment extraction by batching the data by business_id.
   3. After completing every run, write the file to disk with an appropriate filename.
   4. Use whichever files are ready, and let the rest run in the background for however long it needs.
   5. Full sentiment extraction should be done in a little more than 73 hours.

In [45]:
# Business ids, ordered by ascending count of non-null `stars` values per business
business_ids = (
    review_df
    .groupby('business_id')['stars']
    .count()
    .sort_values()
    .index
    .to_list()
)


In [64]:
# Smoke test: run the extraction on the first two reviews of a single business.
sample_mask = review_df['business_id'] == "Rt4xYQBWC8i2xqLp9dP7XQ"
sample_reviews = review_df.loc[sample_mask, 'corrected_text'].iloc[:2]
# `record_count` was previously picked up from leftover kernel state; define it here
# so this cell also runs on a fresh kernel. Layout is [processed, total].
record_count = [0, len(sample_reviews)]
review_df.loc[:, 'review_sentiment'] = sample_reviews.apply(lambda x: get_noun_sentiment(x, record_count))


Processed 25 out of 28401. Time: 7.920673608779907.
Processed 26 out of 28401. Time: 10.565166473388672.


In [97]:

def write_file_to_disk(df, filename):
    """Write ``df`` to ``filename`` as CSV without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    filename : str or os.PathLike or file-like
        Destination. When it is a filesystem path, any missing parent
        directories are created first, so the write does not fail just because
        the output folder has not been created yet. File-like objects are passed
        straight to ``DataFrame.to_csv``.
    """
    import os  # local import keeps this cell self-contained

    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(filename, index=False)

# File map: business number (position in `business_ids`) -> business_id whose file was written
business_file_map = {}
def batch_sentiment_extraction(b_id, business_number, business_file_map):
    """Extract noun sentiments for one business and write that batch to disk.

    Every ``corrected_text`` value of the rows in the module-level ``review_df``
    whose ``business_id`` equals ``b_id`` is passed to ``get_noun_sentiment``.
    The results are stored in the ``review_sentiment`` column of those rows, the
    rows are written to ``sentiment/b_num_{business_number}.csv``, and the
    business is then recorded in ``business_file_map``.

    Parameters
    ----------
    b_id : str
        Business id to process.
    business_number : int
        Number used in the output filename and as the key in ``business_file_map``.
    business_file_map : dict
        Progress map, mutated in place: ``business_file_map[business_number] = b_id``
        is set only after the file has been written.
    """
    # Build the row selector once; it is needed for sizing, processing and writing.
    batch_mask = review_df['business_id'] == b_id
    batch_size = int(batch_mask.sum())
    record_count = [0, batch_size]
    print(f'Sentiment Extraction of {b_id} Beginning.')
    print(f'Batch size: {batch_size}.')
    batch_sentiment = review_df.loc[batch_mask, 'corrected_text'].apply(lambda x: get_noun_sentiment(x, record_count))
    # Make sure the target column exists with object dtype so it can hold dicts.
    if 'review_sentiment' not in review_df.columns:
        review_df['review_sentiment'] = pd.Series(index=review_df.index, dtype='object')
    # Assign to this batch's rows only. `.loc[:, col] = partial_series` re-aligns
    # against the whole index and overwrites every other row with NaN, which
    # discarded the results of all previously processed businesses.
    review_df.loc[batch_mask, 'review_sentiment'] = batch_sentiment
    print(f'Sentiment Extraction of {b_id} Complete.')
    print('Writing the file to disk...')
    processed_sentiment = review_df.loc[batch_mask]
    write_file_to_disk(processed_sentiment, f'sentiment/b_num_{business_number}.csv')
    business_file_map[business_number] = f'{b_id}'

In [98]:
# Start from an empty progress map (business number -> business_id).
# Re-running this cell forgets which businesses have already been processed.
business_file_map = dict()

In [104]:
def begin_batch_processing(business_ids, business_file_map):
    """Run sentiment extraction for every business not yet present in the map.

    Parameters
    ----------
    business_ids : iterable of str
        Business ids to process; the position of each id in this iterable is
        passed on as its business number.
    business_file_map : dict
        Progress map of business number -> business_id. Ids already among its
        values are skipped; it is handed to ``batch_sentiment_extraction``,
        which updates it.
    """
    for position, pending_id in enumerate(business_ids):
        already_done = pending_id in business_file_map.values()
        if not already_done:
            batch_sentiment_extraction(pending_id, position, business_file_map)
    
    

1. Write a few processed records at a time to the file.
2. Keep track of which indexes are done.
3. Resume on restart.

In [125]:
# Long-running: processes every business in `business_ids` that is not yet in the map.
begin_batch_processing(
    business_ids=business_ids,
    business_file_map=business_file_map,
)

'Processed 3427 out of 3427.'

Sentiment Extraction of IkY2ticzHEn4QFn8hQLSWg Complete.
Writing the file to disk...


In [126]:
# Persist the progress map: one row per processed business, indexed by business number.
business_ids_df = pd.DataFrame(
    {'business_id': list(business_file_map.values())},
    index=list(business_file_map.keys()),
)
business_ids_df.to_csv("business_ids_processed.csv")

In [127]:
# Show the full progress map (business number -> business_id); the output grows with every processed business.
business_file_map

{0: 'Rt4xYQBWC8i2xqLp9dP7XQ',
 1: 'k37i1EW_x848o_n1ATcErg',
 2: '5t-25pkm9ovVcbH3BBIpCw',
 3: 'P5Wq5OwEJ0Zz7piLoToXDA',
 4: 'YkXedtqYoQGXgNFHLClYIQ',
 5: 'C809UuprygJyEgJw4wr2Pg',
 6: 'Ri2S4YEre3xTmWUCO2XEGQ',
 7: 'UNWb37aMC3nuWdszceGMxg',
 8: 'roKcuykq_7CNMcTGtNUmYg',
 9: 'RGzIHIhOIZzsjLRVDvSjRQ',
 10: 'UwI-vRH7Mu6PmsPR67MXyw',
 11: '-tRhEoFo5viTTPX316ut6w',
 12: 'UoDicg0wO3Q1JPUymA-91w',
 13: 'Rv6P37KiiuowrXti2JHZNQ',
 14: 'tNOLXgYTykXmLaAZnvo1vg',
 15: 'XX0xyWDpFc6Diq-XVHUckg',
 16: 'AlnAoqsqChTn1Eg3dzLmMw',
 17: '38yHZCHWgPZlDj5SqdNcpA',
 18: 'VVarKkODJTs_qx_gz4Hxbg',
 19: '6rRGA1V-mLl7f2BqqZ_AmA',
 20: '8ynnZAfsIHltv72hiM-rlA',
 21: 'zJoT9gFvGtZAhbdcIF4Xmw',
 22: 'QUshQjkKA_s0yTrxOGbRrQ',
 23: 'UMHuKs1sO-wq3XqKaejXeA',
 24: 'pXVGayL-fCoAMci3G1dRzA',
 25: 'jF3RPKNsdcb4657pNRbGxQ',
 26: '8bUZSK2GPfwkGRbh06r07w',
 27: 'UhjfJpaAzgSYrTZ_dMMF1Q',
 28: 'PmFBiD-KW4U_L1MS9qcIUQ',
 29: 'YRw8RmnSc1olsEFTf5H6Eg',
 30: 'uf4gH2bLBox8bCHw30kJxQ',
 31: 'yxQs5gSf0-8cEd6YMgSjbw',
 32: 'rF0xI_3jjlsE

In [123]:
# Show the processed-business table.
# NOTE(review): the recorded output has an "Unnamed: 0" column, so the frame shown was
# presumably read back from the CSV without index_col — confirm.
business_ids_df

Unnamed: 0,business_id
0,Rt4xYQBWC8i2xqLp9dP7XQ
1,k37i1EW_x848o_n1ATcErg
2,5t-25pkm9ovVcbH3BBIpCw
3,P5Wq5OwEJ0Zz7piLoToXDA
4,YkXedtqYoQGXgNFHLClYIQ
...,...
95,tOPDno-cu5NQO56FeOBg-g
96,CnhmThuteYExAEvBSzL0qg
97,ZfOS7Mz-iGseNxBhlhXm_w
98,qY-BUQY-SFBaSrFHowF3nA
