In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from sklearn.ensemble import VotingClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
# Load the review dataset.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0. Passing only
# `error_bad_lines=False` meant "skip malformed rows but warn about them", which is
# exactly what `on_bad_lines='warn'` does on current pandas.
csv_data = pd.read_csv('extended_googleplaystore_user_reviews.csv', on_bad_lines='warn')
# Keep only the rows that actually carry review text.
csv_data = csv_data[csv_data['Translated_Review'].notna()]
# Model input: the review text column.
X = csv_data['Translated_Review']



  csv_data = pd.read_csv('extended_googleplaystore_user_reviews.csv', error_bad_lines=False)


In [None]:
class SentimentSatisfaction:
    """Bucket a numeric satisfaction index into one of five ordinal class ids."""

    def __init__(self, satisfaction_index):
        # Raw score that get_sentiment_satisfaction() will classify.
        self.satisfaction_index = satisfaction_index

    def get_sentiment_satisfaction(self):
        """Return the class id (0-4) for ``self.satisfaction_index``.

        Upper bounds are inclusive and checked in ascending order:
        ``<= -0.6`` -> 0, ``<= -0.2`` -> 1, ``<= 0.2`` -> 2, ``<= 0.6`` -> 3.
        Any value for which none of those comparisons holds yields 4.
        """
        score = self.satisfaction_index
        for class_id, upper_bound in enumerate((-0.6, -0.2, 0.2, 0.6)):
            if score <= upper_bound:
                return class_id
        return 4

# Names for the five integer class ids returned by SentimentSatisfaction.
satisfaction_class = dict(
    very_negative=0,
    negative=1,
    neutral=2,
    positive=3,
    very_positive=4,
)


def _polarity_to_class(polarity):
    """Convert a single polarity value into its satisfaction class id."""
    return SentimentSatisfaction(polarity).get_sentiment_satisfaction()


# Derive the integer target label of every row from its polarity column.
csv_data['result'] = csv_data['original_Sentiment_Polarity'].apply(_polarity_to_class)

NameError: ignored

In [None]:
# Target labels, followed by an 80/20 train/test partition with a fixed seed.
y = csv_data['result']
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Vectorize the text data: the TF-IDF vocabulary and weights are learned from the
# training reviews only; the test reviews are projected with that fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vectorized, X_test_vectorized = (
    vectorizer.fit_transform(X_train),  # evaluated first, so the fit precedes the transform below
    vectorizer.transform(X_test),
)

In [None]:
# Logistic regression baseline on the TF-IDF features.
# max_iter is raised from the default of 100: this notebook's recorded output contains
# an lbfgs "TOTAL NO. of ITERATIONS REACHED LIMIT" warning when an estimator with these
# settings is refit, i.e. the default budget is not enough for the solver to converge.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_vectorized, y_train)
lr_preds = lr_model.predict(X_test_vectorized)
print('Logistic Regression Accuracy:', accuracy_score(y_test, lr_preds))


In [None]:
# Support vector machine on the TF-IDF features.
# probability=True makes SVC calibrate class probabilities with an internal,
# randomly shuffled cross-validation. Seeding it (same seed as the train/test split)
# keeps the probabilities later consumed by soft voting reproducible across runs;
# the hard predictions printed below do not depend on it.
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train_vectorized, y_train)
svm_preds = svm_model.predict(X_test_vectorized)
print('SVM Accuracy:', accuracy_score(y_test, svm_preds))

SVM Accuracy: 0.7972972972972973


In [None]:
# BiLSTM model

# --- Text -> fixed-length integer sequences -------------------------------
max_len = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)  # vocabulary comes from the training reviews only
vocab_size = len(tokenizer.word_index) + 1

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Both splits are padded at the end of each sequence to the same length.
pad_options = dict(maxlen=max_len, padding='post')
X_train_padded = pad_sequences(X_train_sequences, **pad_options)
X_test_padded = pad_sequences(X_test_sequences, **pad_options)

# --- Network: embedding -> bidirectional LSTM -> dropout -> 5-way softmax ---
bi_model = Sequential([
    Embedding(vocab_size, 100, input_length=max_len),
    Bidirectional(LSTM(128)),
    Dropout(0.5),
    Dense(5, activation='softmax'),
])
bi_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# The test split is passed as validation data for per-epoch monitoring.
bi_model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_test_padded, y_test),
    epochs=10,
    batch_size=32,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fed525f2ec0>

In [None]:
# Score the test reviews with the BiLSTM and keep, for each review, the index
# of the highest-scoring output column as the predicted class.
bi_preds_prob = bi_model.predict(X_test_padded)
bi_preds = bi_preds_prob.argmax(axis=1)
bilstm_accuracy = accuracy_score(y_test, bi_preds)
print('BiLSTM Accuracy:', bilstm_accuracy)

BiLSTM Accuracy: 0.8179650238473768


In [None]:
# Voting classifier 1: soft vote over logistic regression and the SVM,
# trained and evaluated on the TF-IDF features.
base_estimators = [('lr', lr_model), ('svm', svm_model)]
voting_clf1 = VotingClassifier(estimators=base_estimators, voting='soft')
voting_clf1.fit(X_train_vectorized, y_train)

voting_preds1 = voting_clf1.predict(X_test_vectorized)
voting1_accuracy = accuracy_score(y_test, voting_preds1)
print('Voting Classifier 1 Accuracy:', voting1_accuracy)

Voting Classifier 1 Accuracy: 0.8215421303656598


In [None]:
import nltk

# The VADER lexicon has to be available locally before the analyzer is created.
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()


def _vader_class(review):
    """Map one review to a satisfaction class id via VADER's compound score."""
    compound = analyzer.polarity_scores(review)['compound']
    return SentimentSatisfaction(compound).get_sentiment_satisfaction()


# Lexicon-only baseline: no training, just bucket each test review's compound score.
vader_preds = [_vader_class(review) for review in X_test]
print('VADER Accuracy:', accuracy_score(y_test,vader_preds))

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


VADER Accuracy: 0.33465818759936405


In [None]:
# from keras.wrappers.scikit_learn import KerasClassifier
!pip install scikeras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional
from scikeras.wrappers import KerasClassifier

# defineing a function to create the BiLSTM model
def create_bi_model():
    bi_model = Sequential()
    bi_model.add(Embedding(vocab_size, 100, input_length=max_len))
    bi_model.add(Bidirectional(LSTM(128)))
    bi_model.add(Dense(5, activation='softmax'))
    bi_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    bi_model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=10, batch_size=32)
    return bi_model

# creating a KerasClassifier with the BiLSTM model function
bi_classifier = KerasClassifier(build_fn=create_bi_model)

# Creating the second VotingClassifier with bi_classifier and voting_clf1
voting_clf2 = VotingClassifier(estimators=[('bi_lstm', bi_classifier), ('voting_clf1', voting_clf1)], voting='soft')
voting_clf2.fit(X_train_padded, y_train)
voting_preds2 = voting_clf2.predict(X_test_padded)
print('Voting Classifier 2 Accuracy:', accuracy_score(y_test, voting_preds2))




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Voting Classifier 2 Accuracy: 0.8338632750397457


In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from nltk.sentiment import SentimentIntensityAnalyzer

# custom classifier for VADER sentiment analysis
class VaderClassifier(BaseEstimator, ClassifierMixin):
    """Rule-based binary classifier driven by VADER's compound score.

    Nothing is learned from data: a text is labelled 1 when its compound score
    reaches ``threshold`` and 0 otherwise.
    """

    def __init__(self, threshold=0.1):
        # Minimum compound score (inclusive) for a text to be labelled 1.
        self.threshold = threshold
        # Analyzer instance used by predict().
        self.sid = SentimentIntensityAnalyzer()

    def fit(self, X, y):
        """Do nothing and return ``self``; both arguments are ignored."""
        return self

    def predict(self, X):
        """Return a list with one 0/1 label per item of ``X``.

        Each item is passed to the analyzer's ``polarity_scores``; the label is 1
        when the resulting ``'compound'`` value is >= ``self.threshold``.
        """
        cutoff = self.threshold
        score_of = self.sid.polarity_scores
        return [int(score_of(text)['compound'] >= cutoff) for text in X]

# Instantiate the rule-based VADER estimator.
vader_clf = VaderClassifier(threshold=0.1)

# Final ensemble: soft vote between the VADER estimator and voting_clf2.
# NOTE(review): the recorded output of this cell is a ValueError. Two things visible
# here look like contributors and should be confirmed: VaderClassifier defines no
# predict_proba although voting='soft' is requested, and it is handed padded token ids
# while its predict() passes each item to a text sentiment analyzer.
final_estimators = [('vader', vader_clf), ('voting_clf2', voting_clf2)]
final_voting_clf = VotingClassifier(estimators=final_estimators, voting='soft')
final_voting_clf.fit(X_train_padded, y_train)

final_preds = final_voting_clf.predict(X_test_padded)
final_accuracy = accuracy_score(y_test, final_preds)
print('Final Voting Classifier Accuracy:', final_accuracy)

ValueError: ignored