# Installation and imports

In [None]:
import pandas as pd
import numpy as np
import warnings
import re

import nltk

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 
from nltk.corpus import wordnet
from nltk.util import ngrams

from textblob import TextBlob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# To import CSV to colab
from google.colab import files
import io

In [None]:
# Download nltk packages
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Install VADER sentiment
!pip install vaderSentiment

warnings.filterwarnings('ignore')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 7.2 MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


# Task 2: Preprocessing App Reviews

###### There are 5 CSVs containing App data. We will pre-process them below

In [None]:
# Upload the following CSVs to Google Colab:
# at.linuxtage.companion.csv, com.jtechme.jumpgo.csv, com.qubling.sidekick.csv, nya.miku.wishmaster.csv, org.ligi.ajsha.csv
# `uploaded` is later indexed by file name, and each value is wrapped in io.BytesIO before being parsed.
uploaded = files.upload()

Saving at.linuxtage.companion.csv to at.linuxtage.companion.csv
Saving com.jtechme.jumpgo.csv to com.jtechme.jumpgo.csv
Saving com.qubling.sidekick.csv to com.qubling.sidekick.csv
Saving nya.miku.wishmaster.csv to nya.miku.wishmaster.csv
Saving org.ligi.ajsha.csv to org.ligi.ajsha.csv


For convenience, we will refer to the specific files/apps as follows:

* shell = org.ligi.ajsha.csv
* chan = nya.miku.wishmaster.csv
* cpan = com.qubling.sidekick.csv
* scheduler = at.linuxtage.companion.csv
* browser = com.jtechme.jumpgo.csv

In [None]:
# Parse every uploaded CSV into its own DataFrame, in the order
# shell, chan, cpan, scheduler, browser
shell_df, chan_df, cpan_df, scheduler_df, browser_df = (
    pd.read_csv(io.BytesIO(uploaded[csv_name]))
    for csv_name in (
        'org.ligi.ajsha.csv',
        'nya.miku.wishmaster.csv',
        'com.qubling.sidekick.csv',
        'at.linuxtage.companion.csv',
        'com.jtechme.jumpgo.csv',
    )
)

In [None]:
# Maps each digit character to its English word followed by one space
number_word_map = {
    str(digit): word + " "
    for digit, word in enumerate((
        "zero", "one", "two", "three", "four",
        "five", "six", "seven", "eight", "nine",
    ))
}


def numberToWord(description):
    """
      Spells out every digit character found in `description`.

      Each digit is swapped for its entry in `number_word_map` (the word plus
      a trailing space); every other character is kept as it is.
      For example, '2' --> 'two '
    """
    return "".join(number_word_map.get(char, char) for char in description)

def pre_process_reviews(df):
    '''
        Applies pre-processing steps to the `Review` column in the given `df`

        All cleaning is done on a separate copy of the column, so `df` itself
        is never modified (safe to call on slices of another DataFrame).

        Steps, in order: fill missing reviews with "", strip punctuation and
        special characters, drop non-ASCII characters (e.g. emojis), spell out
        digits, tokenize, lowercase, remove English stop words and lemmatize
        every remaining token as a verb.

        Parameters:
          df: a DataFrame containing app data; only its `Review` column is read

        Returns:
          a single flat list of pre-processed tokens covering all reviews
    '''

    # Missing reviews become empty strings; astype(str) guards against
    # non-text cells so the string operations below cannot fail on them
    reviews = df['Review'].fillna("").astype(str)

    # Remove punctuation and special characters
    reviews = reviews.str.replace(pat=r'[^\w\s]+', repl="", regex=True)

    # Remove emojis (any character that cannot be encoded as ASCII is dropped)
    reviews = reviews.apply(
        lambda text: text.encode('ascii', 'ignore').decode('ascii'))

    # Turn digits into words
    reviews = reviews.apply(numberToWord)

    # All reviews are joined into one string and tokenized together,
    # which also discards the surrounding white space
    tokens = word_tokenize(" ".join(reviews.to_list()))

    english_stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Lowercase, drop stop words, then lemmatize what is left
    return [
        lemmatizer.lemmatize(token, pos="v")
        for token in (raw_token.lower() for raw_token in tokens)
        if token not in english_stop_words
    ]

In [None]:
# Run the pre-processing pipeline on every app, in the order
# shell, chan, cpan, scheduler, browser
(
    cleaned_reviews_shell,
    cleaned_reviews_chan,
    cleaned_reviews_cpan,
    cleaned_reviews_scheduler,
    cleaned_reviews_browser,
) = (
    pre_process_reviews(app_df)
    for app_df in (shell_df, chan_df, cpan_df, scheduler_df, browser_df)
)

# Sample cleaned data: the first 15 tokens of each app, one list per line
print(
    cleaned_reviews_shell[:15],
    cleaned_reviews_chan[:15],
    cleaned_reviews_cpan[:15],
    cleaned_reviews_scheduler[:15],
    cleaned_reviews_browser[:15],
    sep="\n",
)

['please', 'add', 'advance', 'java', 'feature', 'would', 'helpful', 'decent', 'itd', 'useful', 'gradle', 'support', 'upgrade', 'java', 'version']
['think', 'glowies', 'delete', 'phone', 'do', 'change', 'name', 'good', 'app', 'although', 'wont', 'let', 'post', 'vpn', 'whereas']
['useful', 'info', 'detail', 'place', 'case', 'difficult', 'find', 'need', 'find', 'useful', 'show', 'unnecessary', 'ad', 'first', 'stop']
['keep', 'say', 'problem', 'task', 'event', 'host', 'sign', 'session', 'single', 'event', 'add', 'work', 'work', 'slowly', 'slow']
['simple', 'easy', 'use', 'security', 'options', 'minimalist', 'browsers', 'seem', 'lack', 'would', 'nice', 'whitelist', 'sit', 'example', 'cookies']


### Task 2 Bonus:
Yes, the stop words have an impact on the analysis of the reviews. The risk of using a non-customized list of stop words is that the generic English list may not be sufficient, since it is not domain-specific. Some words that are specific to the mobile-app domain carry no information in the reviews and should therefore be treated as stop words, but a generic list keeps them as if they were important words.

# Task 3: Sentiment Analysis

In [None]:
# Look for missing values in the shell app data and, where present, swap them
# for neutral defaults: "" for the review text and 0 for the rating
does_null_reviews_exist = shell_df['Review'].isna().values.any()
does_null_rating_exist = shell_df['Rating'].isna().values.any()

if does_null_rating_exist:
    shell_df['Rating'] = shell_df['Rating'].fillna(0)

if does_null_reviews_exist:
    shell_df['Review'] = shell_df['Review'].fillna("")

In [None]:
def sentiment_analyis_with_text_blob(reviews, package_id):
    """
        Scores every review with TextBlob and tabulates the polarities

        Parameters:
          reviews: a list of review texts
          package_id: the label stored in the package-name column of every row

        Returns:
          a DataFrame with one row per review holding the package label,
          the review text and the TextBlob polarity of that text
    """
    return pd.DataFrame({
        "App’s package name": package_id,
        "Review": reviews,
        "Polarity": [TextBlob(text).sentiment.polarity for text in reviews],
    })
    

def sentiment_analysis_with_VADER(reviews, package_id):
    """
        Scores every review with VADER and tabulates the results

        Parameters:
          reviews: a list of review texts
          package_id: the label stored in the package-name column of every row

        Returns:
          a DataFrame with one row per review holding the package label,
          the review text and the VADER score dictionary rendered as a string
    """
    vader = SentimentIntensityAnalyzer()
    score_strings = [str(vader.polarity_scores(text)) for text in reviews]

    return pd.DataFrame({
        "App’s package name": package_id,
        "Review": reviews,
        "Polarity": score_strings,
    })

In [None]:
# Apply TextBlob and VADER sentiment analysis to 5 apps.
# Every app is scored on its OWN reviews (taken from its own DataFrame);
# missing reviews are treated as empty text so both analyzers always get strings.
text_blob_shell_sentiment_analysis = sentiment_analyis_with_text_blob(
    shell_df['Review'].fillna("").to_list(), "org.ligi.ajsha.csv")
vader_shell_sentiment_analysis = sentiment_analysis_with_VADER(
    shell_df['Review'].fillna("").to_list(), "org.ligi.ajsha.csv")

text_blob_chan_sentiment_analysis = sentiment_analyis_with_text_blob(
    chan_df['Review'].fillna("").to_list(), "nya.miku.wishmaster.csv")
vader_chan_sentiment_analysis = sentiment_analysis_with_VADER(
    chan_df['Review'].fillna("").to_list(), "nya.miku.wishmaster.csv")

text_blob_cpan_sentiment_analysis = sentiment_analyis_with_text_blob(
    cpan_df['Review'].fillna("").to_list(), "com.qubling.sidekick.csv")
vader_cpan_sentiment_analysis = sentiment_analysis_with_VADER(
    cpan_df['Review'].fillna("").to_list(), "com.qubling.sidekick.csv")

text_blob_scheduler_sentiment_analysis = sentiment_analyis_with_text_blob(
    scheduler_df['Review'].fillna("").to_list(), "at.linuxtage.companion.csv")
vader_scheduler_sentiment_analysis = sentiment_analysis_with_VADER(
    scheduler_df['Review'].fillna("").to_list(), "at.linuxtage.companion.csv")

text_blob_browser_sentiment_analysis = sentiment_analyis_with_text_blob(
    browser_df['Review'].fillna("").to_list(), "com.jtechme.jumpgo.csv")
vader_browser_sentiment_analysis = sentiment_analysis_with_VADER(
    browser_df['Review'].fillna("").to_list(), "com.jtechme.jumpgo.csv")

In [None]:
# Stack the per-app TextBlob results row-wise into a single DataFrame
# (each app keeps its own row labels)
text_blob_sentiment_analysis_data = pd.concat(
    objs=[
        text_blob_shell_sentiment_analysis,
        text_blob_chan_sentiment_analysis,
        text_blob_cpan_sentiment_analysis,
        text_blob_scheduler_sentiment_analysis,
        text_blob_browser_sentiment_analysis,
    ],
    axis=0,
    ignore_index=False,
)

In [None]:
# Stack the per-app VADER results row-wise into a single DataFrame
# (each app keeps its own row labels)
vader_sentiment_analysis_data = pd.concat(
    objs=[
        vader_shell_sentiment_analysis,
        vader_chan_sentiment_analysis,
        vader_cpan_sentiment_analysis,
        vader_scheduler_sentiment_analysis,
        vader_browser_sentiment_analysis,
    ],
    axis=0,
    ignore_index=False,
)

In [None]:
# Display the combined TextBlob sentiment results
text_blob_sentiment_analysis_data

Unnamed: 0,App’s package name,Review,Polarity
0,org.ligi.ajsha.csv,Please add some advance java features in it. I...,0.000000
1,org.ligi.ajsha.csv,"It's decent, but it'd be useful to have Gradle...",0.155556
2,org.ligi.ajsha.csv,Nice,0.600000
3,org.ligi.ajsha.csv,Good app but lots of add :),0.600000
4,org.ligi.ajsha.csv,"This app is very useful and handy, so I offer ...",0.495000
...,...,...,...
6844,com.jtechme.jumpgo.csv,"Very comfortable app, good design and good per...",0.640000
6845,com.jtechme.jumpgo.csv,Love it,0.500000
6846,com.jtechme.jumpgo.csv,"sorry for my previous rates, actually it was c...",0.020833
6847,com.jtechme.jumpgo.csv,Nice app,0.600000


In [None]:
# Display the combined VADER sentiment results
vader_sentiment_analysis_data

Unnamed: 0,App’s package name,Review,Polarity
0,org.ligi.ajsha.csv,Please add some advance java features in it. I...,"{'neg': 0.0, 'neu': 0.662, 'pos': 0.338, 'comp..."
1,org.ligi.ajsha.csv,"It's decent, but it'd be useful to have Gradle...","{'neg': 0.0, 'neu': 0.797, 'pos': 0.203, 'comp..."
2,org.ligi.ajsha.csv,Nice,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound..."
3,org.ligi.ajsha.csv,Good app but lots of add :),"{'neg': 0.0, 'neu': 0.457, 'pos': 0.543, 'comp..."
4,org.ligi.ajsha.csv,"This app is very useful and handy, so I offer ...","{'neg': 0.0, 'neu': 0.722, 'pos': 0.278, 'comp..."
...,...,...,...
6844,com.jtechme.jumpgo.csv,"Very comfortable app, good design and good per...","{'neg': 0.0, 'neu': 0.341, 'pos': 0.659, 'comp..."
6845,com.jtechme.jumpgo.csv,Love it,"{'neg': 0.0, 'neu': 0.192, 'pos': 0.808, 'comp..."
6846,com.jtechme.jumpgo.csv,"sorry for my previous rates, actually it was c...","{'neg': 0.055, 'neu': 0.809, 'pos': 0.136, 'co..."
6847,com.jtechme.jumpgo.csv,Nice app,"{'neg': 0.0, 'neu': 0.263, 'pos': 0.737, 'comp..."


The sentiment analysis retrieved using TextBlob only gives a polarity and subjectivity number. The polarity, as per the documentation, is a value in the range of -1.0 and 1.0. Thus, it is hard to discern what proportion of happiness/neutrality/sadness is present in the text. 

However, the sentiment analysis retrieved using VADER gives fixed proportions of positivity, neutrality and negativity. Thus, it is easier to analyze and understand the review. Furthermore, it also provides a compound score value, which is commonly used in sentiment analysis by the researchers.

`VADER` is the better option for reviews of the apps since it is more informative as it provides proportions of positivity, neutrality and negativity present in the text.

# Task 4: Topic Modelling with LDA

Task 4: part II

In [None]:
# We will topic model using the largest data set. The numbers printed below
# are the lengths of the cleaned token lists (shell, chan, cpan, scheduler,
# browser), one per line.
print(
    len(cleaned_reviews_shell),
    len(cleaned_reviews_chan),
    len(cleaned_reviews_cpan),
    len(cleaned_reviews_scheduler),
    len(cleaned_reviews_browser),
    sep="\n",
)

# Browser app's data has the most data so we will use it for LDA

55577
10013
123
57748
98597


In [None]:
def topic_modelling(cleaned_app_reviews, number_of_topics, words_per_topic=7, random_state=None):
    """
        This function returns the topics modelled given a list of clean app reviews, `cleaned_app_reviews`,
        and an integer, `number_of_topics`.

        Parameters:
          cleaned_app_reviews: a list of pre-processed tokens
          number_of_topics: how many LDA topics to fit
          words_per_topic: how many of the highest-weighted words describe each topic (default 7)
          random_state: optional seed passed to the LDA model; pass an integer
                        to get the same topics on every run (default None)

        Returns:
          a list with one string per topic, each holding that topic's top words
          separated by spaces, highest weight first
    """
    # NOTE(review): all tokens are joined into ONE document, so TF-IDF and LDA
    # are fitted on a corpus of size one. Kept as is because the flat token
    # list carries no review boundaries -- consider passing one string per review.
    preprocessed_corpus = [" ".join(cleaned_app_reviews)]

    # Tf-idf model
    tfidf_creator = TfidfVectorizer(min_df=0.2)
    tfidf = tfidf_creator.fit_transform(preprocessed_corpus)

    # Tf-idf LDA model (only the fitted topic-word weights are needed)
    lda_tfidf_creator = LatentDirichletAllocation(
        learning_method='online',
        n_components=number_of_topics,
        random_state=random_state,
    )
    lda_tfidf_creator.fit(tfidf)

    # Look the vocabulary up once. `get_feature_names` no longer exists in
    # recent scikit-learn releases, so prefer its replacement when available.
    if hasattr(tfidf_creator, "get_feature_names_out"):
        feature_names = tfidf_creator.get_feature_names_out()
    else:
        feature_names = tfidf_creator.get_feature_names()

    topics_found = []
    for topic in lda_tfidf_creator.components_:
        # argsort is ascending, so walk it backwards to get the top words first
        top_word_indices = topic.argsort()[:-(words_per_topic + 1):-1]
        topics_found.append(" ".join(feature_names[i] for i in top_word_indices))

    return topics_found

def print_topics(topics_list):
  """
    Print a header line followed by every topic in `topics_list`,
    one per line and numbered from 1
  """
  print("-----Topics found-----")
  for position, topic_words in enumerate(topics_list, start=1):
    print("Topic #{}: ".format(position) + topic_words)

Topic Modelling Browser App Reviews

In [None]:
# Fit 10 topics on the cleaned browser data set and list them;
# `assigned_app_topics` is reused later when building the similarity matrix
assigned_app_topics = topic_modelling(cleaned_reviews_browser, number_of_topics=10)
print_topics(assigned_app_topics)

-----Topics found-----
Topic #1: widget happily inteded honest oo intrusion hmm
Topic #2: safer divice pizza appearance pal thoroughly gosh
Topic #3: workit tx usara valley dolanjski rom identify
Topic #4: browser app use search good one love
Topic #5: kroll frequently exilant my_ enhance whats thos
Topic #6: everyday awoesome need beforebut image rid ultranationalists
Topic #7: dyor unfunctional fasteasy cue pink oreo charge
Topic #8: kikis nevigate local unedited fonts deletedof scrollbar
Topic #9: cn amaizingfast thathoping jus compatible oru disruptive
Topic #10: yasssss noticeable buttonsheet til reality superfasta monster


Task 4: Part III



**In case the topics change, below are the topics for which the questions have been answered**

* Topic #1: widget happily inteded honest oo intrusion hmm
* Topic #2: safer divice pizza appearance pal thoroughly gosh
* Topic #3: workit tx usara valley dolanjski rom identify
* Topic #4: browser app use search good one love
* Topic #5: kroll frequently exilant my_ enhance whats thos
* Topic #6: everyday awoesome need beforebut image rid ultranationalists
* Topic #7: dyor unfunctional fasteasy cue pink oreo charge
* Topic #8: kikis nevigate local unedited fonts deletedof scrollbar
* Topic #9: cn amaizingfast thathoping jus compatible oru disruptive
* Topic #10: yasssss noticeable buttonsheet til reality superfasta monster

**How do the topics relate to the features of the app?**

* Topic #2 relates to the app features in that the users seem to agree that the browser is safe, thus conforming to the app's privacy feature

* Topic #4 relates to the app features in that the users seem to love the 'search suggestion' feature of the app

* Topic #7, Topic #9 and Topic #10 all agree that the browser is very fast, thus conforming to the app's high-speed feature.

#### Grouping similar and competitor app separately

In [None]:
def get_competitor_or_similar_app_df(browser_df, package_id):
  """
    Select the rows of the app data frame `browser_df` that belong to the
    package name `package_id`

    Parameters:
      browser_df: a DataFrame with a "Package name" column
      package_id: the package name whose rows are wanted

    Returns:
      a new DataFrame (a copy, so later edits never touch `browser_df`) with
      only the rows whose "Package name" equals `package_id`, in their original
      order and with their original row labels; empty when nothing matches
  """
  # A boolean mask picks exactly the matching rows: it works for any index,
  # tolerates rows of other packages in between, and copes with zero matches
  belongs_to_app = browser_df["Package name"] == package_id

  return browser_df[belongs_to_app].copy()

# Split the browser data set into the similar app and the three competitor
# apps, looked up by package name in that order
similar_app_df, competitor_app_1_df, competitor_app_2_df, competitor_app_3_df = (
    get_competitor_or_similar_app_df(browser_df, package_name)
    for package_name in (
        "acr.browser.barebones",
        "com.duckduckgo.mobile.android",
        "mark.via.gp",
        "com.kiwibrowser.browser",
    )
)

In [None]:
# Clean and tokenize the reviews of the similar app and of each competitor app
(
    similar_app_reviews,
    competitor_app_1_reviews,
    competitor_app_2_reviews,
    competitor_app_3_reviews,
) = (
    pre_process_reviews(app_df)
    for app_df in (
        similar_app_df,
        competitor_app_1_df,
        competitor_app_2_df,
        competitor_app_3_df,
    )
)

**Task 4: part IV**

Topic Modelling for Similar App

In [None]:
# Fit 5 topics on the similar app's cleaned reviews and list them
similar_app_topics = topic_modelling(similar_app_reviews, number_of_topics=5)
print_topics(similar_app_topics)

-----Topics found-----
Topic #1: please unfixed stumble low kullanmaya slow twitter
Topic #2: machine wo storage optimizations match pls ability
Topic #3: browser fast use app like one get
Topic #4: worst news soooo najbolji january minimalism iphone
Topic #5: hopefully atm intrusive principles hassle tight guy


Topic Modelling for Competitor App 1

In [None]:
# Fit 5 topics on competitor app 1's cleaned reviews and list them
competitor_app_1_topics = topic_modelling(competitor_app_1_reviews, number_of_topics=5)
print_topics(competitor_app_1_topics)

-----Topics found-----
Topic #1: app search use google good like privacy
Topic #2: suppress bookmarks though decision especially gabriel fup
Topic #3: brownshirts redirections moderation alternatives downrank properly de
Topic #4: ludzi stall guide notification tire key throw
Topic #5: conlusions kick please correct memes gaberdusa larger


Topic Modelling for Competitor App 2

In [None]:
# Fit 5 topics on competitor app 2's cleaned reviews and list them
competitor_app_2_topics = topic_modelling(competitor_app_2_reviews, number_of_topics=5)
print_topics(competitor_app_2_topics)

-----Topics found-----
Topic #1: browser app best one good fast add
Topic #2: exsoundcloud next didnt exist install consistently rotate
Topic #3: issue seek join secrets soulbrowser surf il
Topic #4: function huawei yo browserbut postiphone explorar hind
Topic #5: necesitar tabplease sight describe disturbancesyou elsefire electronic


Topic Modelling for Competitor App 3

In [None]:
# Fit 5 topics on competitor app 3's cleaned reviews and list them
competitor_app_3_topics = topic_modelling(competitor_app_3_reviews, number_of_topics=5)
print_topics(competitor_app_3_topics)

-----Topics found-----
Topic #1: strange vital menu supposedly like installbut mozila
Topic #2: browser app good use best work chrome
Topic #3: eg ch track kendall range ii shortcut
Topic #4: terzos bookmarking sis dega bits helpul simply
Topic #5: saport freak business vectr reallly ammounts truncate


Task 4: Part V

The apps are compared using a cosine similarity matrix; only the cells of the matrix with similarity > 0 are shown.

In [None]:
# Similarity matrix
# One corpus entry per topic string: the assigned app's topics come first,
# followed by the similar app's and then competitor apps 1 to 3
corpus = [
    *assigned_app_topics,
    *similar_app_topics,
    *competitor_app_1_topics,
    *competitor_app_2_topics,
    *competitor_app_3_topics,
]

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)

# Multiply the TF-IDF matrix by its transpose to score every pair of topics
pairwise_similarity = tfidf @ tfidf.T

# Displays similarities > 0 between the assigned app and similar + competitor apps
print(pairwise_similarity)

  (0, 0)	1.0000000000000002
  (1, 1)	1.0000000000000002
  (2, 2)	1.0000000000000002
  (3, 26)	0.4570983819033667
  (3, 15)	0.49897596095774066
  (3, 20)	0.4818657025307194
  (3, 12)	0.48846028446147605
  (3, 3)	1.0000000000000002
  (4, 4)	1.0000000000000002
  (5, 5)	1.0000000000000002
  (6, 6)	1.0000000000000002
  (7, 7)	1.0000000000000002
  (8, 8)	1.0000000000000002
  (9, 9)	1.0000000000000002
  (10, 19)	0.11699396642956693
  (10, 10)	1.0
  (11, 11)	1.0000000000000002
  (12, 26)	0.33578841643186885
  (12, 3)	0.48846028446147605
  (12, 20)	0.5236901807837273
  (12, 25)	0.11783362820308785
  (12, 15)	0.3516958317905625
  (12, 12)	1.0
  (13, 13)	1.0000000000000002
  (14, 14)	1.0000000000000002
  :	:
  (16, 16)	1.0000000000000002
  (17, 17)	1.0000000000000002
  (18, 18)	1.0000000000000002
  (19, 10)	0.11699396642956693
  (19, 19)	1.0
  (20, 15)	0.2140453306928336
  (20, 3)	0.4818657025307194
  (20, 12)	0.5236901807837273
  (20, 26)	0.4919140841068451
  (20, 20)	0.9999999999999998
  (21, 2

# Task 5: Recommendation

https://github.com/tuyafeng/Via/issues/1069