In [None]:
import spacy
import pandas as pd
from collections import Counter 
nlp = spacy.load('en_core_web_sm')

df = pd.read_csv('yelp.csv')

# TODO remove later: sampling
# Seeded so the sampled subset (and every result derived from it) is identical
# on each Restart & Run All; capped at len(df) so a file with fewer than 500
# rows does not raise.
df = df.sample(n=min(500, len(df)), random_state=0)

### Preprocessing

In [None]:
# preprocessing
# TODO come back and touch up: simple processing for now
def preprocess(text):
    """Lowercase a review and return its lemmatized, alphabetic, non-stop-word tokens.

    Args:
        text: Raw review text. ``None`` or NaN (what pandas yields for an empty
            CSV cell) is treated as an empty review.

    Returns:
        list[str]: ``token.lemma_`` for each token of ``nlp(text.lower())`` that
        is alphabetic and whose text is not in ``nlp.Defaults.stop_words``.
    """
    # A missing cell arrives as None or float NaN, neither of which has
    # .lower(); NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    doc = nlp(text.lower())
    stop_words = nlp.Defaults.stop_words  # looked up once instead of per token
    return [
        token.lemma_
        for token in doc
        if token.is_alpha and token.text not in stop_words
    ]

# Replace each review in the 'text' column with the token list preprocess() returns.
tokens_per_review = df['text'].apply(preprocess)
df['text'] = tokens_per_review
print(df['text'])

4712    [place, albertson, definitely, high, end, like...
7943    [go, place, time, night, big, plus, place, par...
3836                                     [legit, brewpub]
6825    [find, place, accident, blink, miss, well, fry...
6928    [place, near, yelp, say, chandler, ray, road, ...
                              ...                        
8256    [place, hole, wall, right, phx, car, rental, c...
8842    [burger, tasteless, weird, texture, fry, decen...
5945    [go, happy, hour, long, time, friend, want, tr...
8342    [visit, high, review, think, little, rate, goo...
6023    [eat, time, food, good, time, eat, find, bug, ...
Name: text, Length: 500, dtype: object


In [59]:
# Load the processed reviews CSV.
df = pd.read_csv('yelp_processed_v3.csv')

# turn tokens back into arrays
import ast


def _tokens_from_cell(cell):
    """Parse a stringified token list; return any non-string cell unchanged."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


df['text'] = df['text'].apply(_tokens_from_cell)
print(df['text'].head())

type(df['text'][0])

0    [place, albertson, definitely, high, end, like...
1    [go, place, time, night, big, plus, place, par...
2                                     [legit, brewpub]
3    [find, place, accident, blink, miss, well, fry...
4    [place, near, yelp, say, chandler, ray, road, ...
Name: text, dtype: object


list

In [20]:
# Persist the current frame; with the default settings pandas also writes the
# index as the first, unnamed column.
df.to_csv(path_or_buf='yelp_processed_v3.csv')

# 2: Aspect Extraction
After preprocessing, the next goal is to extract the most important topics, to find points on which we can give feedback. Below are three methods of aspect extraction. 

#### Initial Results (1000 rows, dataset not split into different business types)

##### SpaCy: 

place, food, time, order, service

##### LDA: 

Topic 0: good, place, food, order, like, time, great, chicken, love, try

Topic 1: great, place, like, good, food, time, look, service, try, nice

Topic 2: good, like, time, great, food, place, come, order, service, restaurant

Topic 3: good, place, like, food, time, service, price, come, way, great

Topic 4: like, place, great, burger, food, good, drink, want, love, think

##### TF-IDF: 

**Without sentiments:** food, time, service, come, go

**With only nouns:** food, place, time, service, order


TODO: try LDA without sentiments

### SpaCy Aspect Extraction
This code extracts the aspects (nouns and adjective-noun pairs) from each review, then finds the most common aspects in the entire dataset.

In [None]:
from collections import Counter 

# TODO categorize by type of building
def extract_aspects(text):
    """Collect candidate aspects from one tokenized review.

    Args:
        text: Iterable of token strings; joined with spaces and parsed by ``nlp``.

    Returns:
        list[str]: The text of every NOUN token, followed by
        "<adjective> <noun>" strings for each ADJ token whose head is a NOUN.
    """
    parsed = nlp(" ".join(text))
    nouns = []
    adj_noun_pairs = []
    for tok in parsed:
        if tok.pos_ == "NOUN":
            nouns.append(tok.text)  # extract nouns
        elif tok.pos_ == "ADJ" and tok.head.pos_ == "NOUN":
            adj_noun_pairs.append(f"{tok.text} {tok.head.text}")  # extract important pairs
    return nouns + adj_noun_pairs

# One list of aspects per review.
df['aspects'] = df['text'].apply(extract_aspects)

# count common aspects
all_aspects = []
for review_aspects in df['aspects']:
    all_aspects.extend(review_aspects)
aspect_counts = Counter(all_aspects)
print(aspect_counts.most_common(20))

[('place', 663), ('food', 545), ('time', 451), ('order', 355), ('service', 302), ('love', 216), ('restaurant', 210), ('price', 196), ('thing', 180), ('people', 161), ('day', 157), ('staff', 151), ('night', 148), ('experience', 143), ('table', 143), ('burger', 140), ('way', 139), ('friend', 139), ('lot', 138), ('hour', 137)]


### LDA
LDA assigns probability distributions over topics to each document, and produces the top related words for each topic. 


In [35]:
# trying LDA 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts over the space-joined tokens of each review.
vectorizer = CountVectorizer(stop_words = 'english')
documents = [" ".join(words) for words in df['text']]
X = vectorizer.fit_transform(documents)

lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(X)

# get top words per topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # Indices of the 10 largest topic weights, largest first.
    top_indices = topic.argsort()[::-1][:10]
    top_words = feature_names[top_indices].tolist()
    print(f"Topic {topic_idx}: {', '.join(top_words)}")



Topic 0: good, place, food, order, like, time, great, chicken, love, try
Topic 1: great, place, like, good, food, time, look, service, try, nice
Topic 2: good, like, time, great, food, place, come, order, service, restaurant
Topic 3: good, place, like, food, time, service, price, come, way, great
Topic 4: like, place, great, burger, food, good, drink, want, love, think


### TF-IDF
Compute TF-IDF scores for each word, average them across all reviews, then select the words with the highest average scores as aspects.

In [115]:
df = pd.read_csv('yelp_processed_v3.csv')

# turn tokens back into arrays
import ast
df['text'] = df['text'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Join each token list into one space-separated string per review.
# The previous guard, `type(df['text'][0] != 'text')`, took the type of a
# comparison result, which is always truthy, so it never guarded anything.
# Checking every cell joins only real token sequences and leaves any other
# value (e.g. a missing cell) untouched instead of raising.
df['text'] = df['text'].apply(lambda x: ' '.join(x) if isinstance(x, (list, tuple)) else x)

##### Processing without sentiment words:

In [None]:
# remove sentiment words (as possible) so the data doesn't get skewed 
sentiment_words = [
    'place', 'great', 'good', 'excellent', 'bad', 'terrible', 'like',
    'nice', 'love', 'hate', 'amazing', 'wonderful', 'fantastic',
]


def remove_sentiment(text, sentiment_words):
    """Drop the words of ``text`` whose lowercase form is in ``sentiment_words``.

    Args:
        text: String that is split on whitespace.
        sentiment_words: Collection of lowercase words to remove.

    Returns:
        str: The surviving words, in their original order and casing, joined
        by single spaces.
    """
    kept = []
    for word in text.split():
        if word.lower() in sentiment_words:
            continue
        kept.append(word)
    return " ".join(kept)

# Strip the listed words from every review, then preview the first ten rows.
df['text'] = df['text'].apply(remove_sentiment, args=(sentiment_words,))
print(df['text'].head(10))

0    albertson definitely high end food barrel clea...
1    go time night big plus parking check sure rest...
2                                        legit brewpub
3    find accident blink miss well fry catfish tota...
4      near yelp say chandler ray road industrial area
5    girls want splurge go hair salon time money cl...
6    big organic kind guy care hormone free bpa fre...
7    go get phoenix go month mom favorite eat under...
8    ok stingray high point trendy hip explode youn...
9    food northern chinese similar chinese muslim f...
Name: text, dtype: object


##### Processing with only nouns:

In [116]:
# alternate processing, filtered for nouns 

def filter_nouns(text):
    """Return the NOUN-tagged tokens of ``text`` as one space-separated string.

    Args:
        text: String passed to ``nlp`` for part-of-speech tagging.

    Returns:
        str: The text of every token tagged ``NOUN``, in document order.
    """
    noun_texts = []
    for tok in nlp(text):
        if tok.pos_ == 'NOUN':
            noun_texts.append(tok.text)
    return " ".join(noun_texts)

# Keep only the nouns of each review.
df['text'] = df['text'].apply(filter_nouns)

##### TF-IDF Vectorizer:

In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the review strings: one row per review, one column per word.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'])
feature_names = vectorizer.get_feature_names_out()

# get average tfidf scores
tfidf_scores = tfidf_matrix.mean(axis=0).A1  # column means flattened to 1-D
word_scores = {word: score for word, score in zip(feature_names, tfidf_scores)}

# top 20 words
ranked_words = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)
aspects = ranked_words[:20]
print(aspects)

[('food', 0.06452233170841527), ('place', 0.05299159837590725), ('time', 0.03751332767274223), ('service', 0.03545812582771143), ('order', 0.026482994183309364), ('restaurant', 0.0231678198924396), ('love', 0.0220315312477425), ('price', 0.02188395824593481), ('staff', 0.02152927352995993), ('bar', 0.01960026585892079), ('burger', 0.01847028556501228), ('hour', 0.017986373129966933), ('day', 0.017277618719814026), ('drink', 0.01694587674196936), ('thing', 0.016554321190996587), ('experience', 0.0164796254543083), ('night', 0.016339683684711954), ('people', 0.016071050222510867), ('meal', 0.015739915332787225), ('taste', 0.01565029664736228)]


## 3: ABSA (Aspect-Based Sentiment Analysis)

The next step is to analyze each review in relation to each of these aspects. 

1. Extract aspect-related content from each review. 

2. Compute the sentiment for each aspect for each review (positive, negative, neutral). If there is no data in a review about an aspect, it is classified as Neutral. 


TODO: expand the number of aspects; only 3 are used for now, for efficiency.

In [118]:
# reload the processed (tokenized) dataset, discarding the TF-IDF filtering above
df = pd.read_csv('yelp_processed_v3.csv')

# turn tokens back into arrays
import ast
df['text'] = df['text'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Join each token list into one space-separated string per review.
# The previous guard, `type(df['text'][0] != 'text')`, took the type of a
# comparison result, which is always truthy, so it never guarded anything.
# Checking every cell joins only real token sequences and leaves any other
# value (e.g. a missing cell) untouched instead of raising.
df['text'] = df['text'].apply(lambda x: ' '.join(x) if isinstance(x, (list, tuple)) else x)

### Assign sentiment to each aspect

### Train ABSA model

In [None]:
# word embeddings for review text

# train classification model (try different ones: logreg for tfidf, rnns, bert)
# multi-label classification model 

