<a href="https://colab.research.google.com/github/rashmic20/Phase5_Project/blob/main/Vader%20Sentiment%20Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Read the cleaned review dataset from disk
df_filtered = pd.read_csv(filepath_or_buffer='filtered_reviews.csv')


# Print per-column non-null counts and dtypes to spot missing values
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54611 entries, 0 to 54610
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   review_id      54611 non-null  object 
 1   user_id        54611 non-null  object 
 2   business_id    54611 non-null  object 
 3   stars_x        54611 non-null  int64  
 4   date           54611 non-null  object 
 5   name           54611 non-null  object 
 6   city           54611 non-null  object 
 7   state          54611 non-null  object 
 8   postal_code    54611 non-null  int64  
 9   latitude       54611 non-null  float64
 10  longitude      54611 non-null  float64
 11  stars_y        54611 non-null  float64
 12  review_count   54611 non-null  int64  
 13  categories     54611 non-null  object 
 14  cleaned_text   54611 non-null  object 
 15  review_length  54611 non-null  int64  
dtypes: float64(3), int64(4), object(9)
memory usage: 6.7+ MB


In [2]:
import nltk
from gensim.utils import simple_preprocess
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords
import spacy

# Load necessary resources: the NLTK English stopword list and the spaCy
# pipeline (its 'parser' and 'ner' components are disabled at load time).
nltk.download('stopwords')
stop_words = stopwords.words('english')
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Tokenization and stopword removal in a single pass over the reviews.
# Membership is tested against a frozenset so each lookup is O(1) rather than
# a scan of the stopword list; `stop_words` itself is left as a list so any
# other cell that relies on it keeps working.
_stop_word_set = frozenset(stop_words)
texts = [
    [word for word in simple_preprocess(doc) if word not in _stop_word_set]
    for doc in df_filtered['cleaned_text']
]

# Fit the phrase detectors: bigrams on the token lists, then a second model
# on the bigram-transformed stream for trigrams
bigram = Phrases(sentences=texts, min_count=5, threshold=100)
trigram = Phrases(sentences=bigram[texts], threshold=100)

# Wrap each trained model in a Phraser for applying it to documents
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# NOTE(review): only bigram_mod is applied here; trigram_mod is not used in
# the cells visible in this notebook.
texts_bigrams = [bigram_mod[tokens] for tokens in texts]

# Lemmatization
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp_model=None):
    """Lemmatize tokenized documents, keeping only tokens with allowed POS tags.

    Args:
        texts: Iterable of documents, each an iterable of string tokens. The
            tokens of a document are joined with single spaces before being
            passed to the pipeline.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is an immutable tuple so it cannot be altered between calls.
        nlp_model: Pipeline object exposing ``pipe(texts)`` that yields one
            iterable of tokens (with ``lemma_`` and ``pos_``) per input text.
            Defaults to the notebook-level ``nlp`` object.

    Returns:
        list[list[str]]: One list of lemmas per input document, in input order.
    """
    if nlp_model is None:
        # Fall back to the pipeline loaded earlier in the notebook.
        nlp_model = nlp

    # Stream the documents through the pipeline in batches rather than
    # invoking it once per document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp_model.pipe(joined_docs)
    ]

# Lemmatize the bigram-merged documents using the default POS filter
texts_lemmatized = lemmatization(texts=texts_bigrams)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Build the token/id mapping and the bag-of-words corpus that LDA consumes
id2word = Dictionary(documents=texts_lemmatized)
corpus = list(map(id2word.doc2bow, texts_lemmatized))

# Train a 4-topic LDA model (fixed random_state for repeatable runs)
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    random_state=100,
    update_every=1,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

# Score the model with c_v coherence against the lemmatized texts
coherence_model = CoherenceModel(
    model=lda_model,
    texts=texts_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(f'Coherence Score: {coherence_score}')


Coherence Score: 0.5015917155704872


In [5]:
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Function to extract topics and perform sentiment analysis
def get_sentiment_and_topics(ldamodel, corpus, texts, analyzer=None):
    """Pair each document's dominant topic with its VADER compound sentiment.

    Args:
        ldamodel: Topic model; ``ldamodel[corpus]`` must yield one result per
            document, either a list of ``(topic_id, probability)`` pairs or a
            tuple whose first element is such a list (the shape produced when
            the model is built with ``per_word_topics=True``).
        corpus: Documents in the representation ``ldamodel`` expects.
        texts: Review strings aligned *positionally* with ``corpus``. Any
            iterable works, including a pandas Series with a non-default index.
        analyzer: Object exposing ``polarity_scores(text)`` that returns a
            mapping with a ``'compound'`` key. Defaults to a new
            ``SentimentIntensityAnalyzer``.

    Returns:
        list[tuple]: ``(dominant_topic, compound_sentiment)`` for every
        document, in corpus order.

    Raises:
        IndexError: If ``texts`` has fewer items than ``corpus``.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # Materialise to a list so texts[i] is a positional lookup. Indexing a
    # pandas Series with an integer is label-based and would misalign (or
    # raise) once the frame has been filtered or re-indexed.
    texts = list(texts)
    topic_sentiments = []

    for i, row in enumerate(ldamodel[corpus]):
        # With per-word topics enabled the row is a tuple whose first element
        # is the topic distribution; otherwise the row is the distribution.
        topic_dist = row[0] if isinstance(row, tuple) else row
        # max() returns the first highest-probability pair, matching what a
        # stable descending sort followed by [0] would select.
        dominant_topic = max(topic_dist, key=lambda pair: pair[1])[0]
        sentiment = analyzer.polarity_scores(texts[i])['compound']
        topic_sentiments.append((dominant_topic, sentiment))

    return topic_sentiments

# Store one (dominant_topic, sentiment) tuple per review in a new column
df_filtered['Topic_Sentiment'] = get_sentiment_and_topics(
    ldamodel=lda_model,
    corpus=corpus,
    texts=df_filtered['cleaned_text'],
)


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [6]:
# Expand the (topic, sentiment) tuples into two separate columns, keeping the
# frame's own index so rows stay aligned
df_filtered[['Dominant_Topic', 'Sentiment_Score']] = pd.DataFrame(
    data=df_filtered['Topic_Sentiment'].tolist(),
    index=df_filtered.index,
)

# Write the enriched frame to CSV (without the index) for import into Tableau
df_filtered.to_csv(path_or_buf='topics_and_sentiments_for_tableau.csv', index=False)
