# Performing Sentiment Analysis and Topic Modelling on Collected Google Play Store Reviews Data

- Collect Data 
- Ingest Data 
- Preprocess Data
- Approach - Using Scikit Learn and Vader Sentiment Package
- Conclusion and Export

## Collect Data

# Import packages
from google_play_scraper import reviews_all
import pickle

In [2]:
def scrape(APP_ID, lang="en"):
    """Download all Google Play reviews for an app and pickle them to disk.

    The reviews are written to ``{APP_ID}-{lang}-list.pkl`` in the current
    working directory.

    Args:
        APP_ID: Google Play package name, e.g. ``"com.cgt.bharatgas"``.
        lang: Language code forwarded to ``reviews_all`` and embedded in the
            output file name. The default ``"en"`` yields the same
            ``-en-list.pkl`` file name this function always targeted.
    """
    result = reviews_all(APP_ID, lang=lang)

    # `lang` used to be an undefined name here, so the call raised NameError
    # before anything was written to disk.
    print(len(result), lang)
    with open(f"{APP_ID}-{lang}-list.pkl", "wb") as pklfile:
        pickle.dump(result, pklfile)
        
# scrape("com.drivetrackplusrefuel")
# scrape("com.cgt.bharatgas")
# scrape("cx.indianoil.in")

## Ingest data
The following pickle files contain data collected with the google-play-scraper Python package, which uses NodeJS to scrape data from the Google Play Store's batchexecute API.
The data is then filtered and preprocessed for sentiment analysis and topic modelling.

In [None]:
# Import packages
from datetime import datetime, timedelta
import pandas as pd
import pickle

In [4]:
# Load the pickled review lists (one file per app) into DataFrames.
# NOTE: pickle files can execute code when loaded; only read files you created.
bpcl_df, iocl_df, hpcl_df = (
    pd.DataFrame(pd.read_pickle(pickle_path))
    for pickle_path in (
        "com.cgt.bharatgas-en-list.pkl",
        "cx.indianoil.in-en-list.pkl",
        "com.drivetrackplusrefuel-en-list.pkl",
    )
)

In [None]:
# Tag every row with its app name, then stack the three frames into one
bpcl_df["AppName"] = "BPCL"
hpcl_df["AppName"] = "HPCL"
iocl_df["AppName"] = "IOCL"
merged_df = pd.concat((bpcl_df, iocl_df, hpcl_df))
merged_df.shape[0]

In [None]:
# Print a summary of the merged frame (columns, non-null counts, dtypes)
merged_df.info()

In [None]:
# Keep only rows whose "at" value falls within the last 5 years
# (approximated as 5 * 365 days before the current local time)
cutoff = datetime.now() - timedelta(days=365 * 5)
recent_df = merged_df.loc[merged_df["at"] >= cutoff]
len(recent_df)

In [None]:
# Remove columns that are not needed for the analysis
unused_columns = ["userImage", "reviewCreatedVersion", 'replyContent', 'repliedAt', 'appVersion']
filtered_df = recent_df.drop(columns=unused_columns)
filtered_df.columns

## Preprocess Data
Using the NLTK package to tokenise review text, remove stopwords and lemmatise the remaining words.

In [9]:
# Import packages
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the 'punkt_tab' data in word_tokenize instead of
# the pickled 'punkt' models; fetch both so tokenisation works on either.
nltk.download('punkt_tab')
nltk.download('wordnet')

In [11]:
# Lazily-filled cache so the stop-word list is read and the lemmatizer is
# built once, not once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["lemmatizer"]


def preprocess_review(review):
    """Normalise one review into a space-joined string of lemmatized tokens.

    The text is lower-cased, tokenised with ``word_tokenize``, stripped of
    English stop words and lemmatized with ``WordNetLemmatizer``.

    Args:
        review: Raw review text. Non-string values (e.g. ``None`` or NaN for
            a missing review body) are treated as empty text.

    Returns:
        The processed tokens joined by single spaces; ``""`` when ``review``
        is not a string.
    """
    # Missing review bodies would otherwise crash on `.lower()`.
    if not isinstance(review, str):
        return ""

    stop_words, lemmatizer = _get_preprocess_resources()
    tokens = word_tokenize(review.lower())
    return " ".join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

## Approach - Using Scikit Learn and Vader Sentiment Package

In [12]:
# Work on a copy so the columns added below do not modify filtered_df
filtered_df_1 = filtered_df.copy()

In [13]:
# Import packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [14]:
# Perform sentiment analysis: label each review from its VADER compound score
def _sentiment_label(compound):
    """Map a compound score to 'positive' (> 0), 'negative' (< 0) or 'neutral'."""
    if compound > 0:
        return 'positive'
    if compound < 0:
        return 'negative'
    return 'neutral'


analyzer = SentimentIntensityAnalyzer()
sentiments = [
    _sentiment_label(analyzer.polarity_scores(text)['compound'])
    for text in filtered_df_1['content']
]

filtered_df_1['sentiment'] = sentiments

In [15]:
# Clean every review with preprocess_review before vectorising
filtered_df_1['preprocessed_text'] = filtered_df_1['content'].apply(preprocess_review)

# Build the document-term count matrix used as LDA input
vectorizer = CountVectorizer(
    max_features=1000,
    max_df=0.95,
    min_df=2,
    stop_words='english',
)
document_term_matrix = vectorizer.fit_transform(filtered_df_1['preprocessed_text'])

# Fit a 10-topic LDA model (fixed seed for reproducible topics)
lda = LatentDirichletAllocation(n_components=10, random_state=42).fit(document_term_matrix)

# Label each review with its highest-weighted topic
topics = lda.transform(document_term_matrix)
filtered_df_1['topic'] = topics.argmax(axis=1)

In [None]:
# Show the ten highest-weighted words of every topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, word_weights in enumerate(lda.components_):
    top_indices = word_weights.argsort()[::-1][:10]
    top_words = [feature_names[i] for i in top_indices]
    print(f"Topic #{topic_idx}:")
    print(", ".join(top_words))
    print()


## Conclusion and Export

In [20]:
# Export the labelled reviews, plus a text key listing the top ten words of
# each topic number used in the "topic" column.
filtered_df_1.to_excel("Topic_Sentiment.xlsx")
# Write as UTF-8 explicitly: the words come from user-written reviews and may
# contain characters the platform's default encoding cannot represent.
with open("Topic Reference.txt", "w", encoding="utf-8") as file:
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        file.write(f"Topic #{topic_idx}:\n")
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
        file.write(", ".join(top_words) + "\n")

In [None]:
# Preview the first rows, including the added sentiment, preprocessed_text and topic columns
filtered_df_1.head()