In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

#for sentiment analysis
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from wordcloud import WordCloud


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE,SVMSMOTE,ADASYN

import joblib

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model

from gensim.models import KeyedVectors

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Raw Google Play reviews of the Robinhood app.
REVIEWS_CSV = '../input/robinhood-app-reviews-on-google-play-store/Robinhood_GooglePlay_Reviews_enUS.csv'
df = pd.read_csv(REVIEWS_CSV)

In [None]:
# Drop rows whose review text ('content') is missing; the text analysis
# further down works on this column.
# NOTE: inplace=True mutates `df` itself rather than producing a new frame.
df.dropna(subset=['content'], inplace = True)
df.head()

In [None]:
# Summary statistics for every column, non-numeric ones included.
df.describe(include='all')

A few quick things to note here:
* **userName**: a lot of reviewers do not show their names and only appear as "A Google User"
* **content** : not all review contents are unique, and from here we can speculate that many of them are probably one-(or few-) word reviews, thus the duplicates
* **score**: there are a lot of 1s. In fact 50% of all scores are <= 1 (or just 1 in this case because there's no 0 score)
* **replyContent**: replies from the company are largely canned, which results in a rather low number of unique replies, as we can see from the table

# Distributions of Scores

In [None]:
# Histogram of review scores using five bins.
sns.set_context('paper')
ax = sns.displot(df['score'], kde=False, color='seagreen', bins=5)
# Resize the underlying figure: (width, height) in inches.
ax.fig.set_size_inches(15, 10)

From the histogram, we can confirm that indeed most scores are 1. A score of 5 is the second most popular, and 3 (a neutral score) is the least likely. This means this app gets very polarizing reviews: it is either "loved" (score of 5) or "hated" (score of 1).

# Datetime Distribution of when 1 ☆ happened

In [None]:
# Turn the string timestamps in 'at' into a real datetime column.
# pd.to_datetime parses the whole column at once instead of calling
# datetime.strptime row by row; the explicit format keeps parsing strict.
df['datetime64'] = pd.to_datetime(df['at'], format='%Y-%m-%d %H:%M:%S')

In [None]:
# Keep only the 1-star reviews.
score_1 = df[df['score'].eq(1)]

In [None]:
# Number of 1-star reviews per calendar year, shown as a pie chart.
one_star_per_year = score_1.groupby([score_1["datetime64"].dt.year]).count()
one_star_per_year.plot(kind="pie", y='datetime64', figsize=(10, 10), colormap='Spectral')
plt.title("'When did 1☆ happen?'", fontsize=18)

It is immediately evident from this pie chart that most 1☆ reviews are from 2021. We can continue by restricting the time range to specific months and days of 2021.

In [None]:
# Daily count of 1-star reviews strictly between 2021-01-01 and 2021-04-30.
window_start = pd.to_datetime('2021-01-01')
window_end = pd.to_datetime('2021-04-30')
in_window = score_1['datetime64'].gt(window_start) & score_1['datetime64'].lt(window_end)
score_1_window = score_1[in_window]

# Group by (month, day) of the filtered reviews and count rows per day.
daily_counts = score_1_window.groupby(
    [score_1_window["datetime64"].dt.month, score_1_window["datetime64"].dt.day]
).count()

ax = daily_counts.plot(kind="bar", y="datetime64", figsize=(20, 10), colormap='Spectral')
plt.xlabel('Date', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title("'When did 1☆ happen?'", fontsize=18)

This agrees perfectly with the fact that Robinhood started [restricting GME transactions](https://www.bloomberg.com/news/articles/2021-01-28/robinhood-clients-report-trading-restrictions-on-gamestop-amc) on Jan 28th, which caused a major upset and prompted users to leave bad reviews and boycott the app. Most 1☆ reviews of 2021 come from the week of Jan 28th as we see almost no reviews on other dates. It is also noteworthy that the total counts of 1☆ reviews on Jan 28th and Jan 29th (\~70k) make up more than half of the all-time total of 1☆ reviews (\~135k, which can be confirmed using describe() on the **score_1** dataframe). 

In [None]:
# All-time number of 1-star reviews.
# Counting the column directly avoids describe(..., datetime_is_numeric=True):
# that keyword was removed in pandas 2.0 and raises a TypeError there.
score_1['score'].count()

On the other hand, if we explore the 5☆ reviews we see a very balanced distribution across the years (except for 2015, when the app first came out).

In [None]:
# Number of 5-star reviews per calendar year, shown as a pie chart.
five_star = df[df['score'] == 5]
five_star_per_year = five_star.groupby([five_star["datetime64"].dt.year]).count()
five_star_per_year.plot(kind="pie", y='datetime64', figsize=(10, 10), colormap='Spectral')
plt.title("'When did 5☆ happen?'", fontsize=18)

# Sentiment Analysis: Building the dataset and quick look with Wordcloud

First let's assign labels for reviews based on the score:

* Positive (label 1): a score of 4 or 5
* Negative (label 0): a score of 1 or 2
* Neutral (label 2): a score of 3

In [None]:
def assign_label(x):
    """Map a star score to a sentiment label.

    Returns 1 (positive) when the score is 4 or higher, 0 (negative) when it
    is 2 or lower, and 2 (neutral) for anything else.
    """
    if x >= 4:
        return 1
    return 0 if x <= 2 else 2
    
# Sentiment label for every review, derived element-wise from its score.
df['label'] = df['score'].map(assign_label)

In [None]:
# Labels as a plain Python list, in the same row order as `df`.
Y = df['label'].tolist()

Now we can focus on the NLP part of the sentiment analysis. Below we will do multiple modifications to make the reviews more "readable" by machine learning: 
1. make everything lowercase
2. remove bad symbols (unwanted characters)
3. expand contractions (for example: don't -> do not)
4. remove punctuation
5. remove stopwords

In [None]:
# Read the JSON of English contractions as a Series and convert it to a dict
# mapping each contraction to its expansion.
contractions = pd.read_json('../input/english-contractions/contractions.json', typ='series').to_dict()

In [None]:
# Lower-cased lookup so that matching can be case-insensitive. clean_reviews()
# lowercases the text before expanding, so any capitalised key in the JSON
# (e.g. "I'm", if present) would otherwise never match.
_contraction_lookup = {key.lower(): value for key, value in contractions.items()}

# One alternation over all contractions. Keys are escaped so that regex
# metacharacters in them (such as '.') are matched literally, and sorted
# longest-first so a longer contraction is preferred over a shorter one that
# is its prefix (e.g. "he'd've" over "he'd", if both are present).
c_re = re.compile(
    '(%s)' % '|'.join(
        re.escape(key) for key in sorted(_contraction_lookup, key=len, reverse=True)
    ),
    re.IGNORECASE,
)

def expandContractions(text, c_re=c_re):
    """Replace every contraction found in `text` with its expanded form.

    Args:
        text: the string to process.
        c_re: compiled pattern used to find contractions; defaults to the
            pattern built from the `contractions` dict.

    Returns:
        `text` with each match of `c_re` replaced by its expansion.

    Raises:
        KeyError: if `c_re` matches something that is not a known contraction
            (only possible with a custom pattern).
    """
    def replace(match):
        return _contraction_lookup[match.group(0).lower()]
    return c_re.sub(replace, text)

In [None]:
# Every character that is not a digit, lowercase letter, space, '#', '+', '_'
# or apostrophe gets replaced by a space. The apostrophe is deliberately kept
# at this stage: stripping it here would turn "don't" into "don t", and the
# contraction expansion that follows could never match anything.
BAD_SYMBOLS_RE = re.compile("[^0-9a-z #+_']")

def clean_reviews(reviews):
    """Normalize raw review texts for vectorization.

    Each review is lowercased, stripped of unwanted symbols, has its
    contractions expanded, loses its remaining punctuation, and finally has
    English stop words removed.

    Args:
        reviews: iterable of review texts (each item is passed through `str`).

    Returns:
        A list of cleaned review strings, in the same order as the input.
    """
    # Loop-invariant: build the stop-word set once, not once per review.
    stop_words = set(stopwords.words('english'))

    cleaned_reviews = []
    for review in reviews:
        review = str(review).lower()
        review = BAD_SYMBOLS_RE.sub(' ', review)

        # Expand contractions. An expansion may contain capitals (e.g. "I am"),
        # so lowercase again to keep the later stop-word filtering effective.
        review = expandContractions(review).lower()

        # Remove punctuation (including any leftover apostrophes) and collapse
        # runs of whitespace into single spaces.
        review = ' '.join(re.sub("([^0-9A-Za-z \t])", " ", review).split())

        # Drop stop words.
        word_tokens = nltk.word_tokenize(review)
        review = ' '.join(w for w in word_tokens if w not in stop_words)

        cleaned_reviews.append(review)

    return cleaned_reviews

In [None]:
# Cleaned text of every review, in the same row order as `df`.
X = clean_reviews(df['content'].tolist())

In [None]:
# Store the cleaned text next to the raw review (assigned by position).
df['text'] = X

With WordCloud we can see what words/phrases come up the most for different sentiments:

In [None]:
# Word cloud built from the positive reviews only (label 1).
textt = " ".join(review for review, label in zip(X, Y) if label == 1)
wordcloud = WordCloud(width=512, height=512, collocations=True, colormap="Greens").generate(textt)
plt.figure(figsize=(10, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

For positive reviews we see a lot of "good" words: great, user friendly, nice, easy, fun, etc. The most notable is "easy use", which imho is a true statement of how streamlined and simple the app is to trade. 

In [None]:
# Word cloud built from the negative reviews only (label 0).
textt = " ".join(review for review, label in zip(X, Y) if label == 0)
wordcloud = WordCloud(width=512, height=512, collocations=True, colormap="Reds").generate(textt)
plt.figure(figsize=(10, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Bad reviews are filled with words such as: trash, criminal, lose money, poor, corrupt. But most significant are: **market manipulation** and **hedge fund**. These are very consistent with why users boycotted the app: because they believed it was controlled by hedge funds and manipulated the supposedly "free" market for their own good.

In [None]:
# Word cloud built from the neutral reviews only (label 2).
textt = " ".join(review for review, label in zip(X, Y) if label == 2)
wordcloud = WordCloud(width=512, height=512, collocations=True, colormap="Oranges").generate(textt)
plt.figure(figsize=(10, 10), facecolor='k')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

And finally neutral reviews have mostly very generic, non-sentimental words. 

# Sentiment Analysis: Positive vs. Negative with Naive Bayes

In [None]:
# Vocabulary cap handed to the vectorizers: keep only the 10000 most frequent terms.
MAX_FEATURES = 10000
# Length, in whitespace-separated words, of the longest cleaned review.
MAX_SEQ_LENGTH = max(map(lambda review_text: len(review_text.split()), df['text']))

In [None]:
# Two-class subset: leave out every review with a score of 3 (neutral).
df_1 = df[df['score'].ne(3)]

In [None]:
#tokenize and vectorize reviews
corpus = df_1['text'].values.astype('U') 
tfidf = TfidfVectorizer(max_features = MAX_FEATURES, ngram_range = (1, 2))   
tdidf_tensor = tfidf.fit_transform(corpus)

Note that when we vectorize the corpus we consider both unigrams and bigrams (two consecutive words), and keep only the 10000 most frequent terms (unigrams and bigrams combined) in the corpus.

In [None]:
# Hold out 30% of the positive/negative reviews for testing.
# random_state pins the shuffle so the reported metrics are reproducible from
# run to run; stratify keeps the class ratio identical in both parts.
binary_labels = df_1['label'].values
X_train, X_test, Y_train, Y_test = train_test_split(
    tdidf_tensor,
    binary_labels,
    test_size=0.3,
    random_state=42,
    stratify=binary_labels,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, fitted on the training split.
nb = MultinomialNB()
nb.fit(X_train, Y_train)

In [None]:
# Predict labels for the held-out test split with the Naive Bayes model.
predictions = nb.predict(X_test)

In [None]:
# Per-class precision, recall and F1 (3 decimal places) on the test split.
print(classification_report(Y_test, predictions, digits=3))

Using a very simple model we reach an accuracy of nearly 95% and acceptable F1 scores for both classes. Results for 0 (negative) are understandably better because we have twice as many samples compared to 1 (positive). The next step is to incorporate neutral reviews into the classification. We will have two challenges to overcome:  
1. neutral reviews can be ambiguous and hard to recognize
2. there are very few neutral reviews to train

# Sentiment Analysis: 3 classes - Baseline model with tf-idf and linear SVM

In [None]:
#tokenize and vectorize reviews
corpus = df['text'].values.astype('U') 
tfidf = TfidfVectorizer(max_features = MAX_FEATURES, ngram_range = (1, 2))  
tdidf_tensor = tfidf.fit_transform(corpus)

In [None]:
# Hold out 30% of all reviews for testing.
# random_state pins the shuffle so the reported metrics are reproducible from
# run to run; stratify keeps the share of each class (including the rare
# neutral one) identical in the train and test parts.
all_labels = df['label'].values
X_train, X_test, Y_train, Y_test = train_test_split(
    tdidf_tensor,
    all_labels,
    test_size=0.3,
    random_state=42,
    stratify=all_labels,
)

In [None]:
# Baseline: linear-kernel SVM with a one-vs-one ('ovo') decision function,
# fitted on the unbalanced three-class training split.
baseline_model = SVC(kernel = 'linear', decision_function_shape = 'ovo') #for multi-class classification
baseline_model.fit(X_train, Y_train)

In [None]:
# Predict labels for the held-out test split with the baseline SVM.
predictions = baseline_model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 (3 decimal places) on the test split.
print(classification_report(Y_test, predictions, digits=3))

At first glance, 89% accuracy sounds awesome for this baseline model. However, because it does not predict anything to be "neutral", we have reason to suspect that this only reflects the underlying class distribution, i.e. because there is such a small number of neutral reviews for training, the model will just predict everything to be either "positive" or "negative", hence the high accuracy.

Here we have a case of a significant imbalance in the training set leading to a classification model that is heavily biased towards the more common classes. 

# Baseline model using balanced class_weight parameter

In [None]:
# Same linear SVM as the baseline, but with class_weight='balanced'.
# Fit on the training split, predict the test split and print the report.
baseline_model_2 = SVC(kernel = 'linear', class_weight = 'balanced', decision_function_shape = 'ovo') #setting the class weight to be balanced
baseline_model_2.fit(X_train, Y_train)
predictions = baseline_model_2.predict(X_test)
print(classification_report(Y_test, predictions, digits=3))

This classifier does recognize some neutral reviews, but at the cost of lowering F1 scores for the other two classes and the overall accuracy. It's easy to see that it misclassifies a lot of positive and negative reviews as neutral, resulting in a rather abysmal precision score of ~10% for the neutral class. This calls for a better way to treat the imbalance. 

# Oversampling with SMOTE (SVM-SMOTE variant)

In [None]:
# Oversample the training split only, using the SVM-SMOTE variant; the test
# split is left untouched.
# random_state makes the synthetic samples, and therefore the scores of the
# model trained on them, reproducible across runs.
smote = SVMSMOTE(random_state=42)
X_train_smote, Y_train_smote = smote.fit_resample(X_train, Y_train)

In [None]:
# Linear one-vs-one SVM fitted on the SVM-SMOTE-resampled training data.
svm_smote = SVC(kernel = 'linear', decision_function_shape = 'ovo')
svm_smote.fit(X_train_smote, Y_train_smote)

In [None]:
# Evaluate on the original (not resampled) test split.
predictions = svm_smote.predict(X_test)
print(classification_report(Y_test, predictions, digits=3))

# Oversampling with ADASYN

In [None]:
# Oversample the training split only, using ADASYN; the test split is left
# untouched.
# random_state makes the synthetic samples, and therefore the scores of the
# model trained on them, reproducible across runs.
adasyn = ADASYN(random_state=42)
X_train_adasyn, Y_train_adasyn = adasyn.fit_resample(X_train, Y_train)

In [None]:
# Linear one-vs-one SVM fitted on the ADASYN-resampled training data.
svm_adasyn = SVC(kernel = 'linear', decision_function_shape = 'ovo')
svm_adasyn.fit(X_train_adasyn, Y_train_adasyn)

In [None]:
# Evaluate on the original (not resampled) test split.
predictions = svm_adasyn.predict(X_test)
print(classification_report(Y_test, predictions, digits=3))