In [1]:
# utilities
import re
import numpy as np
import pandas as pd
# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import class_weight
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing import sequence


print("TensorFlow version:", tf.__version__)
print('Done')

2023-05-18 22:31:02.261950: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.11.0
Done


In [2]:
# Data Preparation
csv_data = pd.read_csv('extended_googleplaystore_user_reviews.csv')
csv_data = csv_data[csv_data['Translated_Review'].notna()]
p=csv_data['original_Sentiment_Polarity'].max()
q=csv_data['original_Sentiment_Polarity'].min()
text_with_max_length = csv_data['Translated_Review'][csv_data['Translated_Review'].str.len().idxmax()]

print(p,q,len(csv_data))
print('MAXIMUN LENGTH REVIEW---->',text_with_max_length)
csv_data.head()


1.0 -1.0 37427
MAXIMUN LENGTH REVIEW----> »TOO BRIGHT!… NIGHT MODE, PLEASE. HOW MANY REQUESTS DO Y'ALL NEED BEFORE YOU IMPLEMENT A DAMN DARK THEME?!!« Also, I can't help feel like lacking lot features make desktop version wonderful... For instance, ★Interface Personalization★. (I mean, call crazy I consider blindingly hideous white bar impossible-to-see-in-sunlight, thin grey font... aesthetically pleasing. It's actually rather ineffective. It's utilitarian depressing horrible.) Please explain personalize bejesus Chrome which, trapped home desktops, inconveniently stationary thus essentially useless (unless plan hella typing)... Yet comes single personal object carried virtually everyday, everyone, everywhere... An object become profoundly integral who/what ARE modern humans. An object, fosters us sense necessity, roughly 94% Americans purportedly "cannot live without it".... When we're considering single profoundly imperative aspect modern existence, smartphone, expectation us content

Unnamed: 0,App,Translated_Review,sentences_count,characters_count,spaces_count,count_words,duplicates_count,chars_excl_spaces_count,emoji_count,whole_numbers_count,...,spelling_quality,spelling_quality_summarised,ease_of_reading_score,ease_of_reading_quality,ease_of_reading_summarised,grammar_check_score,grammar_check,original_Sentiment,original_Sentiment_Polarity,original_Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,2.0,122.0,20.0,22.0,6.0,102.0,0.0,1.0,...,Bad,Bad,86.2,Easy,Easy,5.0,5 issues,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,1.0,47.0,6.0,7.0,0.0,41.0,0.0,0.0,...,Very good,Good,38.99,Difficult,Difficult,0.0,No issues,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,1.0,42.0,5.0,6.0,0.0,37.0,0.0,0.0,...,Very good,Good,48.47,Difficult,Difficult,0.0,No issues,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,1.0,12.0,2.0,3.0,0.0,10.0,0.0,0.0,...,Very good,Good,119.19,Very Easy,Easy,1.0,1 issue,Positive,1.0,0.3
5,10 Best Foods for You,Best way,1.0,8.0,1.0,2.0,0.0,7.0,0.0,0.0,...,Very good,Good,120.21,Very Easy,Easy,0.0,No issues,Positive,1.0,0.3


In [3]:
class SentimentSatisfaction:
    def __init__(self,satisfaction_index):
        self.satisfaction_index = satisfaction_index
    def get_sentiment_satisfaction(self):
        if self.satisfaction_index <= -0.6:
            return "very_negative"
        elif self.satisfaction_index <= -0.2:
            return "negetive"
        elif self.satisfaction_index <= 0.2:
            return "neutral"
        elif self.satisfaction_index <= 0.6:
            return "positive"
        else:
            return "very_positive"

satisfaction_class = ["very_negative","negative","neutral","positive","very_positive"]
csv_data['result'] = csv_data['original_Sentiment_Polarity'].apply(lambda x:SentimentSatisfaction(x).get_sentiment_satisfaction())

In [4]:
data = csv_data.copy()
data = csv_data[['Translated_Review','result','original_Sentiment_Polarity']]
# Shuffle Row Order
# data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,Translated_Review,result,original_Sentiment_Polarity
0,I like eat delicious food. That's I'm cooking ...,very_positive,1.0
1,This help eating healthy exercise regular basis,positive,0.25
3,Works great especially going grocery store,positive,0.4
4,Best idea us,very_positive,1.0
5,Best way,very_positive,1.0


In [5]:
#-------------------------------------------------------------------------------------------------------------
# Cleaning and Wrangling of Data
#-------------------------------------------------------------------------------------------------------------

# Remove URLs and mentions from text
data.loc[:,'Translated_Review'] = data['Translated_Review'].apply(lambda x: re.sub(r'http\S+', '', x))
data.loc[:,'Translated_Review'] = data['Translated_Review'].apply(lambda x: re.sub(r'@\S+', '', x))



# Remove non-alphabetic characters and convert to lowercase
data.loc[:,'Translated_Review'] = data['Translated_Review'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x.lower()))

# Tokenize text
data.loc[:,'Translated_Review'] = data['Translated_Review'].apply(lambda x: nltk.word_tokenize(x))
data.loc[:,'Translated_Review'] = data['Translated_Review'].apply(lambda x: [value for value in x if not re.match(r'^-?\d+\.?\d*$', value)])


# Remove stopwords
stop_words = stopwords.words('english')
data.loc[:,'Translated_Review'] = data['Translated_Review'].apply(lambda x: [word for word in x if word not in stop_words])

# Lemmatize text
lemmatizer = WordNetLemmatizer()
data.loc[:,'Translated_Review'] = data['Translated_Review'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Join tokens back into strings
data.loc[:,'Translated_Review'] = data['Translated_Review'].apply(lambda x: ' '.join(x))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,Translated_Review,result,original_Sentiment_Polarity
0,like eat delicious food cooking food case best...,very_positive,1.0
1,help eating healthy exercise regular basis,positive,0.25
3,work great especially going grocery store,positive,0.4
4,best idea u,very_positive,1.0
5,best way,very_positive,1.0


In [8]:
# For Machine Learning
m_data = data.copy()
review_data = m_data['Translated_Review']
result_data = m_data['result']

In [11]:
#-------------------------------------------------------------------------------------------------------------
# Preprocessing of Data 
#-------------------------------------------------------------------------------------------------------------

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit and transform the data using the vectorizer object
c_vectorized_data = vectorizer.fit_transform(review_data)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(c_vectorized_data, result_data, test_size=0.2,random_state=42)


In [14]:
# Create a Tf-IDF Vectorizer object
t_vectorized_data = TfidfVectorizer(max_features=10000, stop_words='english')
t_vectorized_data.fit(review_data)

# Fit and transform the data using the vectorizer object
t_vectorized_data = vectorizer.transform(review_data)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(t_vectorized_data, result_data, test_size=0.2,random_state=42)


In [15]:
# Logistic Regression and Count Vectorizer
# Train a machine learning model on the training set
l_model = LogisticRegression()

# For HyperParameter Tuning
param_grid={
    'warm_start': [True], 
    'solver': ['sag'], 
    'penalty': ['l2'], 
    'max_iter': [200], 
    'C': [206.913808111479]
}

# Perform random search with cross-validation
random_search = GridSearchCV(l_model, param_grid=param_grid, cv=5)
random_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Use the best model to make predictions
y_pred = best_model.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




Accuracy: 0.8402351055303233




In [31]:
# For TFIDF and Support Vector Machine
# Train a machine learning model on the training set
t_l_model = SVC()

t_l_model.fit(X_train, y_train)
y_pred = t_l_model.predict(X_test)

# Use the best model to make predictions
y_pred = t_l_model.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8088431739246593


In [8]:
# For Deep Learning
d_data = data.copy()
d_data = d_data[['Translated_Review','result']]
texts = d_data['Translated_Review']
max_length = texts.str.len().max()
print('This is maximum length review',max_length)

# consider only the top 10000 words
max_words = 20000 

# Tokenize and encode
vectorizer = TextVectorization(
    max_tokens=10000,  # maximum vocabulary size
    output_mode='int',  # output integer-encoded sequences
    output_sequence_length=100  # fixed sequence length
)
vectorizer.adapt(texts)
encoded_reviews = vectorizer(texts)

def getUniqueValue(x):
    if x=='very_positive':
        return 5
    if x=='positive':
        return 4
    if x=='neutral':
        return 3
    if x=='negative':
        return 2
    if x=='very_negative':
        return 1

# 2241 (is maximum length review)
# result = np.asarray(d_data['result'])
result = d_data['result'].apply(lambda x:getUniqueValue(x))
tensor_data =  encoded_reviews.numpy()
print("Padded Data: ", tensor_data)
print("Shape of data tensor: ", encoded_reviews.shape)
print("Shape of result tensor: ", result.shape)



This is maximum length review 2241
Padded Data:  [[   4  701 3142 ...    0    0    0]
 [  61 1223 1186 ...    0    0    0]
 [  11    8  344 ...    0    0    0]
 ...
 [1092    7  358 ...    0    0    0]
 [1413  477   82 ...    0    0    0]
 [ 200    7 2132 ...    0    0    0]]
Shape of data tensor:  (37427, 100)
Shape of result tensor:  (37427,)


In [None]:
max_words = 20000 
max_length = 100

b_model = Sequential()
b_model.add(Embedding(max_words, 128, input_length=max_length))
b_model.add(Bidirectional(LSTM(64)))
b_model.add(Dropout(0.5))
b_model.add(Dense(1, activation='sigmoid'))

# Split padded data into Training and Testing Data
x_train, x_test, Y_train, Y_test = train_test_split(tensor_data, result, test_size=0.2,random_state=42)
# Split the training data further into training and validation sets
x_train, x_val, Y_train, y_val = train_test_split(x_train, Y_train, test_size=0.2, random_state=42)

# x_train, Y_train will be used for training the BiLSTM model
# x_val, y_val will be used for validation during training
# x_test, Y_test will be used for final evaluation of the trained model

b_model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
batch_size = 128
print('Train .....')
b_model.fit(x_train, Y_train, batch_size=32, epochs=10, validation_data=(x_val, y_val))

Train .....
Epoch 1/10

In [None]:
# For Polarity Score Estimation
p_data = data.copy()