In [1]:
import pandas as pd
import re
import numpy as np
import stanza
import os
import logging

# Build the log path from components so it resolves on every OS; the previous
# '..\logs\...' literal depended on Windows separators and on "\l" being an
# unrecognised (and now warned-about) escape sequence.
log_file_path = os.path.abspath(os.path.join('..', 'logs', 'lemmatization.log'))

# logging's file handler does not create missing directories.
os.makedirs(os.path.dirname(log_file_path), exist_ok=True)

logging.basicConfig(
  filename=log_file_path,
  level=logging.INFO,
  format='%(asctime)s - %(levelname)s - %(message)s',
)

# Stanza pipeline restricted to the two processors used in this notebook:
# tokenization (sentence splitting) and sentence-level sentiment.
nlp = stanza.Pipeline(processors='tokenize,sentiment', lang='en', use_gpu=True)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm
2024-01-25 17:38:58 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json: 370kB [00:00, 5.99MB/s]                    
2024-01-25 17:38:59 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combin

In [2]:
# Processed business table; its business_id column drives the review filter below.
business_df = pd.read_csv('../data/processed/business.csv')

# Read the raw review dump lazily (JSON lines) and keep only the first
# 100-row chunk for this notebook.
df_iter = pd.read_json(
  '../data/raw/yelp_academic_dataset_review.json',
  lines=True,
  chunksize=100,
  encoding='utf-8',
)
df = next(df_iter)

In [3]:
# Distinct business ids present in the processed business table; used to
# restrict the reviews to those businesses.
restaurant_b_ids = list(business_df['business_id'].unique())

In [4]:
# Keep only the attributes the rest of the notebook works with.
relevant_columns = ['business_id', 'stars', 'useful', 'text', 'date']
df = df[relevant_columns]

In [5]:
# Reviews of restaurants: keep rows whose business_id is in the business table.
# .copy() makes the result an independent frame, so the column assignments that
# clean_data performs on it later do not trigger SettingWithCopyWarning or act
# on an ambiguous view of df.
filtered_df = df[df['business_id'].isin(restaurant_b_ids)].copy()

In [6]:
def validate_datetime(chunk):
  """Check that the 'date' column of ``chunk`` holds valid, non-null datetimes.

  Args:
    chunk: DataFrame containing a 'date' column.

  Returns:
    True when every check passes (matching validate_numerical_col and
    validate_data, which also return True).

  Raises:
    AssertionError: if the column has nulls, is not a timezone-naive
      datetime64 column, or has a month/day outside its calendar range.
  """
  dates = chunk['date']
  assert not dates.isnull().any(), "AssertionError: Null values found in the \"date\" column"
  # Accept every datetime64 resolution (ns, us, ms, s) rather than only
  # 'datetime64[ns]': pandas can produce non-nanosecond columns, and those are
  # just as valid for the .dt checks below.
  assert pd.api.types.is_datetime64_dtype(dates.dtype), "AssertionError: dtype mismatch of date column"
  assert not ((dates.dt.month > 12) | (dates.dt.month < 1)).any(), "AssertionError: Month should be between 1 and 12 (inclusive)"
  assert not ((dates.dt.day > 31) | (dates.dt.day < 1)).any(), "AssertionError: Date should be between 1 and 31 (inclusive)"
  return True
  

In [7]:
def validate_numerical_col(chunk):
  """Validate the numeric review columns 'stars' and 'useful'.

  Both columns must be int64 and free of missing values; in addition the
  smallest and largest 'stars' value must lie between 1 and 5.

  Args:
    chunk: DataFrame containing 'stars' and 'useful' columns.

  Returns:
    True when every check passes.

  Raises:
    AssertionError: on the first violated rule.
  """
  for col in ('stars', 'useful'):
    series = chunk[col]
    # The dtype check runs first, so a float column with NaN reports the dtype problem.
    assert series.dtype == 'int64', f"AssertionError: {col} should have data type 'int64'"
    assert not series.isnull().any(), f"AssertionError: {col} should not have missing values"

  stars = chunk['stars']
  lowest = stars.min()
  highest = stars.max()
  assert (1.0 <= lowest <= 5.0), "AssertionError: 'stars' should be in the range of 1 to 5"
  assert (1.0 <= highest <= 5.0), "AssertionError: 'stars' should be in the range of 1 to 5"

  return True

In [8]:
def validate_data(chunk):
  """Run every validation rule against a cleaned chunk.

  Checks, in order: no missing business ids, the numeric column rules
  (validate_numerical_col), the date column rules (validate_datetime), and
  finally that no cell anywhere in the chunk is missing.

  Args:
    chunk: cleaned reviews DataFrame.

  Returns:
    True when every check passes.

  Raises:
    AssertionError: on the first violated rule.
  """
  business_id_missing = chunk['business_id'].isnull().any()
  assert not business_id_missing, "AssertionError: 'Business id' should not have missing values"

  validate_numerical_col(chunk)
  validate_datetime(chunk)

  anything_missing = chunk.isnull().any().any()
  assert not anything_missing, "AssertionError: Chunk must not contain any missing values"
  return True

In [9]:
def clean_text(inp_text):
  """Normalise the raw text of a review.

  Steps:
    1. Lower-case the text.
    2. Keep only letters, digits, whitespace and the characters ! ? . " ;
       every other character (commas, apostrophes, ...) is removed and the
       kept pieces are concatenated directly, e.g. "don't" -> "dont".
    3. Replace what is left that is not alphanumeric, whitespace or a period
       (i.e. ! ? ") with a period.

  Args:
    inp_text: review text as a str.

  Returns:
    The cleaned, lower-cased text.
  """
  # lower case
  inp_text = inp_text.lower()
  # Extract runs of allowed characters. The class uses "\s" (the original had a
  # bare "s"), so newlines and tabs are kept instead of being deleted, which
  # used to glue the last word of one line onto the first word of the next.
  regex_pattern = r'[a-zA-Z0-9\s!?."]+'
  matched_substrings = re.findall(regex_pattern, inp_text)
  cleaned_text = ''.join(matched_substrings)
  # replace every character that is not alphanumeric, whitespace or a period with a period
  cleaned_text = re.sub(r'[^a-zA-Z0-9\s.]', '.', cleaned_text)

  return cleaned_text

  

In [10]:
def clean_stars(stars_column):
  """Fill missing star ratings and cast the column to integers.

  One rating is drawn at random from 3, 4 and 5 and used for every missing
  entry of the column (a single draw per call, not one per row).

  Args:
    stars_column: Series of star ratings, possibly containing NaN.

  Returns:
    The column with gaps filled, cast with astype('int').
  """
  # Data-driven candidates that were considered instead of the fixed list:
  # floor of the mean, the mode, and the median of the column.
  fallback_ratings = [3, 4, 5]
  fill_value = np.random.choice(fallback_ratings)
  return stars_column.fillna(fill_value).astype('int')

def clean_useful(useful_column):
  """Fill missing 'useful' vote counts with the column's mode and cast to int.

  Args:
    useful_column: Series of vote counts, possibly containing NaN.

  Returns:
    The column with every gap filled, cast with astype('int'). If the column
    has no non-missing value at all, gaps are filled with 0.
  """
  modes = useful_column.mode()
  # mode() returns a Series (several entries on ties, empty when everything is
  # NaN). Passing that Series to fillna aligns on index labels, so only rows
  # whose label happens to match get filled and astype('int') then fails on the
  # remaining NaN. Use the first mode as a scalar instead.
  fill_value = modes.iloc[0] if not modes.empty else 0
  return useful_column.fillna(fill_value).astype('int')

def clean_date(date_column):
  """Convert ``date_column`` to pandas datetimes using ``pd.to_datetime``.

  Args:
    date_column: Series of date values (e.g. timestamp strings).

  Returns:
    The converted Series.
  """
  return pd.to_datetime(date_column)

In [11]:
def clean_data(chunk):
  """Clean one chunk of reviews and return it as a new DataFrame.

  Rows without a business id are dropped; 'stars', 'useful', 'date' and 'text'
  are passed through clean_stars, clean_useful, clean_date and clean_text.

  Args:
    chunk: DataFrame with 'business_id', 'stars', 'useful', 'text' and 'date'
      columns. It is not modified.

  Returns:
    A new DataFrame with the same columns, cleaned.
  """
  # The previous `chunk.loc[:, 'business_id'] = chunk['business_id'].dropna()`
  # re-aligned the shortened Series on the index and so removed nothing; drop
  # the offending rows explicitly.
  chunk = chunk.dropna(subset=['business_id'])
  # assign() replaces each column wholesale on a new frame: the cleaned dtypes
  # are kept (a `.loc[:, col] = ...` write can retain the old column dtype) and
  # the caller's frame is left untouched.
  return chunk.assign(
    stars=clean_stars(chunk['stars']),
    useful=clean_useful(chunk['useful']),
    date=clean_date(chunk['date']),
    text=chunk['text'].apply(clean_text),
  )

1. ~~Spell correction (did not add any value and was very time-consuming)~~
2. ~~(Sentence, Sentiment) map using Stanza.~~
3. ~~Flatten (sentence, sentiment) map into individual rows.~~
4. Annotate each sentence with a one-word "business area" using OpenAI or other libraries available on GitHub.
   1. Note: OpenAI is paid but has given the best results so far, better than manual labelling, semi-supervised labelling, and Stanza.

In [12]:
# Run the cleaning pipeline on the restaurant reviews.
cleaned_data = clean_data(filtered_df)

In [13]:
# Sanity-check the cleaned frame: raises AssertionError on the first violated
# rule, otherwise evaluates to True.
validate_data(cleaned_data)


True

In [14]:
# Retrieve the sentiment of each sentence in a review
def get_line_sentiment(review, record_count):
  """Map every sentence of ``review`` to the sentiment Stanza assigns to it.

  The review is run through the module-level ``nlp`` pipeline. Sentences with
  identical text share one dictionary key, so the last occurrence wins.

  Args:
    review: text of a single review.
    record_count: two-item list ``[processed, total]``; ``processed`` is
      incremented in place and both numbers are written to the log.

  Returns:
    dict of sentence text -> sentiment value as reported by Stanza
    (presumably 0 = negative, 1 = neutral, 2 = positive; confirm against the
    Stanza sentiment documentation for the installed model).
  """
  doc = nlp(review)
  sentiment_map = {sentence.text: sentence.sentiment for sentence in doc.sentences}
  # Progress goes to the log file rather than to the cell output.
  record_count[0] += 1
  logging.info(f'get_line_sentiment: {record_count[0]}/{record_count[1]}.')
  return sentiment_map

In [15]:
# Progress tracker shared with get_line_sentiment: [processed so far, total].
batch_size = len(cleaned_data)
record_count = [0, batch_size]

# One {sentence: sentiment} dict per review, stored in a new column.
sentiment_dicts = cleaned_data['text'].apply(lambda review: get_line_sentiment(review, record_count))
cleaned_data = cleaned_data.assign(sentiment_dict=sentiment_dicts)


In [16]:
def get_sentiment_noun(sentiment_dict):
  """Return the keys of ``sentiment_dict`` (the sentence texts) as a list.

  A concrete list is returned instead of a ``dict_keys`` view: views stay tied
  to the source dict, cannot be pickled, and are not among the list-likes that
  ``DataFrame.explode`` documents as input.

  Args:
    sentiment_dict: mapping of sentence text -> sentiment.

  Returns:
    list of keys in insertion order.
  """
  return list(sentiment_dict.keys())

def get_sentiment_value(sentiment_dict):
  """Return the values of ``sentiment_dict`` (the sentiments) as a list.

  A concrete list is returned instead of a ``dict_values`` view: views stay
  tied to the source dict, cannot be pickled, and are not among the list-likes
  that ``DataFrame.explode`` documents as input.

  Args:
    sentiment_dict: mapping of sentence text -> sentiment.

  Returns:
    list of values in insertion order (aligned with the dict's keys).
  """
  return list(sentiment_dict.values())

def clean_dataframe(df):
  """Expand each review row into one row per sentence.

  The 'sentiment_dict' column (sentence text -> sentiment) is split into a
  'statement' column (the keys) and a 'sentiment' column (the values); both are
  exploded together so each sentence gets its own row, and 'sentiment_dict' is
  dropped. Rows coming from the same review keep that review's index label.

  Args:
    df: DataFrame with a 'sentiment_dict' column. It is not modified.

  Returns:
    A new DataFrame with 'statement' and 'sentiment' columns and without
    'sentiment_dict'.
  """
  # assign() adds the columns on a new frame, so the caller's DataFrame does
  # not silently gain 'statement'/'sentiment' columns as a side effect.
  per_sentence = df.assign(
    statement=df['sentiment_dict'].apply(get_sentiment_noun),
    sentiment=df['sentiment_dict'].apply(get_sentiment_value),
  )

  # Both columns come from the same dict, so their lengths match row by row,
  # which a multi-column explode requires.
  per_sentence = per_sentence.explode(['statement', 'sentiment'])

  return per_sentence.drop('sentiment_dict', axis=1)

In [17]:
# Flatten the per-review sentiment dicts into one row per sentence.
# (Disabled defensive copy, kept for reference: cleaned_data = cleaned_data.copy())
cleaned_data = clean_dataframe(cleaned_data)


In [19]:
# Preview the per-sentence rows; the index label repeats because explode()
# keeps each review's original label for all of its sentences.
cleaned_data.head()

Unnamed: 0,business_id,stars,useful,text,date,statement,sentiment
0,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,if you decide to eat here just be aware it is ...,2018-07-07 22:09:11,if you decide to eat here just be aware it is ...,1
0,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,if you decide to eat here just be aware it is ...,2018-07-07 22:09:11,we have tried it multiple times because i want...,1
0,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,if you decide to eat here just be aware it is ...,2018-07-07 22:09:11,i have been to its other locations in nj and n...,2
0,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,if you decide to eat here just be aware it is ...,2018-07-07 22:09:11,the food is good but it takes a very long time...,1
0,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,if you decide to eat here just be aware it is ...,2018-07-07 22:09:11,the waitstaff is very young but usually pleasant.,2
