In [2]:
# Step 1: Install Necessary Libraries (for Colab Only)
!pip install spacy pandas scikit-learn nltk
!python -m spacy download en_core_web_sm

# Step 2: Import Libraries
import io

import nltk
import pandas as pd
import spacy
from nltk import ngrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Ensure necessary NLTK resources are downloaded.
# Recent NLTK releases (3.9+) make word_tokenize look up 'punkt_tab' instead of
# 'punkt'; the install step above is unpinned, so request both. Asking for a
# resource an older NLTK does not know about only logs an error.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

# Load SpaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Step 3: Upload Files (upload austin_yelp_reviews.csv and austin_restaurants.csv here)
from google.colab import files
uploaded = files.upload()

# Colab renames an upload when a file of that name already exists (for example
# "austin_yelp_reviews (1).csv"), so reading the fixed path could silently load
# a stale copy. Read the bytes returned by this upload instead, matching the
# key by prefix so both the original and the renamed file name are accepted.
reviews_key = next(
    (name for name in uploaded
     if name.startswith('austin_yelp_reviews') and name.endswith('.csv')),
    None,
)
if reviews_key is not None:
    yelp_data = pd.read_csv(io.BytesIO(uploaded[reviews_key]))
else:
    # Nothing matching was uploaded in this run; fall back to a copy on disk.
    yelp_data = pd.read_csv('austin_yelp_reviews.csv')

# Step 4: Data Preparation
# Use the 'Review' column for analysis, combining it with other fields if necessary.
# Missing reviews are read as NaN (a float), which breaks tokenization and the
# vectorizers further down, so replace them with empty strings.
yelp_data['combined_text'] = yelp_data['Review'].fillna('').astype(str)

# Define the bigrams and trigrams keywords from the earlier analysis
bigrams = ['tasting menu', 'michelin star', 'dining experience', 'wine pairing', 'fine dining']
trigrams = ['course tasting menu', 'fine dining experience', 'prix fixe menu', 'wine pairing experience', 'earned michelin star']

# Tokenize the text and find bigrams and trigrams
yelp_data['tokens'] = yelp_data['combined_text'].apply(nltk.word_tokenize)

# Create bigrams and trigrams in the text
yelp_data['bigrams'] = yelp_data['tokens'].apply(lambda tokens: list(ngrams(tokens, 2)))
yelp_data['trigrams'] = yelp_data['tokens'].apply(lambda tokens: list(ngrams(tokens, 3)))

# Function to check for matching bigrams or trigrams in posts
def contains_phrases(n_grams, phrases):
    """Return the phrases that occur in any of the given n-grams, ignoring case.

    Args:
        n_grams: Iterable of token tuples (e.g. the output of ``nltk.ngrams``).
        phrases: Iterable of space-separated phrases to look for.

    Returns:
        list: The matching phrases, spelled and ordered as in ``phrases``.
    """
    # Review text is capitalised freely ("Michelin star") while the keyword
    # lists are lower case, so compare in lower case. A set drops repeated
    # n-grams so each distinct one is scanned only once per phrase.
    joined = {' '.join(ngram).lower() for ngram in n_grams}
    # Substring containment is kept on purpose: it lets an inflected token
    # such as "tasting menus" still match "tasting menu".
    return [
        phrase for phrase in phrases
        if any(phrase.lower() in candidate for candidate in joined)
    ]

# Record, per review, which keyword bigrams / trigrams occur in its n-grams
yelp_data['matching_bigrams'] = yelp_data['bigrams'].map(
    lambda grams: contains_phrases(grams, bigrams)
)
yelp_data['matching_trigrams'] = yelp_data['trigrams'].map(
    lambda grams: contains_phrases(grams, trigrams)
)

# Step 5: TF-IDF Analysis
# Fit the vectorizer (English stop words removed) on the review text and keep
# the vocabulary so matrix column indices can be mapped back to terms.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(yelp_data['combined_text'])
feature_names = tfidf_vectorizer.get_feature_names_out()

# Get the highest scoring TF-IDF terms for each post
def get_top_tfidf_features(row, features, top_n=5):
    """Return the ``top_n`` highest-scoring terms of one row, best first.

    Args:
        row: 1-D NumPy array of scores, one entry per feature.
        features: Sequence of feature names indexed like ``row``.
        top_n: Maximum number of terms to return.

    Returns:
        list: ``(feature, score)`` tuples in descending score order. Terms
        whose score is zero are omitted, so fewer than ``top_n`` tuples may be
        returned.
    """
    # row.argsort()[-0:] would select *every* feature, so handle this up front.
    if top_n <= 0:
        return []
    ranked_ids = row.argsort()[::-1][:top_n]
    # A zero score means the term does not occur in the document; reporting it
    # as a "top" term would be misleading. float() keeps NumPy scalar reprs
    # out of the stringified lists when the frame is written to CSV.
    return [(features[i], float(row[i])) for i in ranked_ids if row[i] > 0]

# Densify one row at a time: calling .toarray() on the whole matrix allocates
# n_reviews x vocabulary_size floats at once.
yelp_data['top_tfidf'] = [
    get_top_tfidf_features(tfidf_matrix[row_idx].toarray().ravel(), feature_names)
    for row_idx in range(tfidf_matrix.shape[0])
]

# Step 6: Sentiment Analysis
# Initialize sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to get sentiment score
def get_sentiment(text):
    """Return the 'compound' polarity score the module-level ``sia`` analyzer gives ``text``."""
    return sia.polarity_scores(text)['compound']

# Score every review's combined text and store the result per row
yelp_data['sentiment'] = yelp_data['combined_text'].map(get_sentiment)

# Step 7: Named Entity Recognition using SpaCy
def extract_restaurant_names_spacy(text):
    """Return the text of every entity in ``text`` labelled ORG or GPE.

    The text is run through the module-level ``nlp`` pipeline; entities are
    returned in the order the pipeline reports them.
    """
    wanted_labels = ['ORG', 'GPE']
    names = []
    for entity in nlp(text).ents:
        if entity.label_ in wanted_labels:
            names.append(entity.text)
    return names

# Apply the SpaCy-based NER extraction to the combined text
yelp_data['restaurant_names_spacy'] = yelp_data['combined_text'].apply(extract_restaurant_names_spacy)

# Step 8: Matching with Known Restaurants
# Load your list of known Austin restaurants (from austin_restaurants.csv).
# Colab renames an upload when a file of that name already exists, so prefer
# the bytes returned by this run's upload over the fixed path on disk, which
# may be an older copy. The key is matched by prefix so both the original and
# a renamed file name ("austin_restaurants (1).csv") are accepted.
restaurants_key = next(
    (name for name in uploaded
     if name.startswith('austin_restaurants') and name.endswith('.csv')),
    None,
)
if restaurants_key is not None:
    austin_restaurants = pd.read_csv(io.BytesIO(uploaded[restaurants_key]))
else:
    # Nothing matching was uploaded in this run; fall back to a copy on disk.
    austin_restaurants = pd.read_csv('austin_restaurants.csv')

# Blank cells are read as NaN and can never equal a detected name; drop them.
known_austin_restaurants = austin_restaurants['Restaurant Name'].dropna().tolist()

def match_known_restaurants(detected_names, known_restaurants):
    """Return the detected names that appear in the list of known restaurants.

    Comparison ignores letter case and surrounding whitespace, so a detected
    "uchi" matches a known "Uchi".

    Args:
        detected_names: Iterable of candidate names.
        known_restaurants: Iterable of known restaurant names; entries that
            are not strings (e.g. NaN from a blank CSV cell) are ignored.

    Returns:
        list: The matching entries of ``detected_names``, unchanged and in
        their original order.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # each detected name against the list is a linear scan.
    known_keys = {
        name.strip().casefold()
        for name in known_restaurants
        if isinstance(name, str)
    }
    return [
        name for name in detected_names
        if isinstance(name, str) and name.strip().casefold() in known_keys
    ]

# Keep, per review, only the detected names that are known Austin restaurants
yelp_data['matched_known_restaurants'] = yelp_data['restaurant_names_spacy'].map(
    lambda detected: match_known_restaurants(detected, known_austin_restaurants)
)

# Step 9: Topic Modeling (LDA)
# LDA is fitted on raw term counts (English stop words removed)
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(yelp_data['combined_text'])

# Five topics; the fixed random_state makes the fit repeatable
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topics = lda_model.fit_transform(count_matrix)

# Get top words per topic
def get_top_words(model, feature_names, n_top_words=10):
    """Map each topic label to its highest-weighted feature names.

    Args:
        model: Object exposing ``components_``, an iterable with one weight
            array per topic.
        feature_names: Sequence of names indexed like each weight array.
        n_top_words: Number of names to keep per topic.

    Returns:
        dict: ``'Topic 1'``, ``'Topic 2'``, ... mapped to lists of names in
        descending weight order.
    """
    return {
        f'Topic {position}': [
            feature_names[idx] for idx in weights.argsort()[::-1][:n_top_words]
        ]
        for position, weights in enumerate(model.components_, start=1)
    }

# Map the LDA components back to vocabulary terms
count_feature_names = count_vectorizer.get_feature_names_out()
topic_words = get_top_words(lda_model, count_feature_names)

# Step 10: Save the Results to CSV
# Only the identifying column and the derived analysis columns are exported
output_columns = [
    'Restaurant Name',
    'matching_bigrams',
    'matching_trigrams',
    'top_tfidf',
    'sentiment',
    'restaurant_names_spacy',
    'matched_known_restaurants',
]
yelp_data[output_columns].to_csv('yelp_analysis_spacy.csv', index=False)

# Step 11: Download the results (for Colab only)
from google.colab import files
files.download('yelp_analysis_spacy.csv')

# Step 12: Display Topics from LDA
# One line per topic: "<label>: word1, word2, ..."
for topic, words in topic_words.items():
    print(f"{topic}: {', '.join(words)}")


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Saving austin_yelp_reviews.csv to austin_yelp_reviews (1).csv
Saving austin_restaurants.csv to austin_restaurants.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Topic 1: pizza, happy, good, great, hour, time, got, ordered, service, bar
Topic 2: food, good, just, like, place, service, time, order, really, great
Topic 3: experience, menu, food, service, dishes, sushi, restaurant, amazing, dish, dining
Topic 4: good, sauce, delicious, flavor, ordered, tacos, food, taco, like, dish
Topic 5: food, great, place, good, service, austin, delicious, tacos, brunch, spot
