In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import webbrowser
import os

In [None]:
# Download the VADER lexicon used by SentimentIntensityAnalyzer (imported above).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

## Dataset Loading

In [None]:
#Dataset Loading
# Both CSV files are read from the current working directory.

#Play Store Data: app metadata (App, Category, Rating, Reviews, Size, Installs, ...)
playstore_df= pd.read_csv('play_store.csv')

#User Reviews Data: review text plus pre-computed sentiment columns, keyed by App
user_reviews_df= pd.read_csv('user_reviews.csv')

In [None]:
# Preview the first rows of the Play Store table.
playstore_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [None]:
# Preview the first rows of the user reviews table.
user_reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


## Data Cleaning

In [None]:
#missing values
# Number of missing entries per column in the Play Store table.
playstore_df.isnull().sum()


Unnamed: 0,0
App,0
Category,0
Rating,1474
Reviews,0
Size,0
Installs,0
Type,1
Price,0
Content Rating,1
Genres,0


In [None]:
# Number of missing entries per column in the user reviews table.
user_reviews_df.isnull().sum()

Unnamed: 0,0
App,0
Translated_Review,26868
Sentiment,26863
Sentiment_Polarity,26863
Sentiment_Subjectivity,26863


In [None]:
#Data Cleaning
# Rows without a rating are dropped. .copy() yields an independent frame so the
# column assignments below never operate on a slice of the raw data.
playstore_df=playstore_df.dropna(subset=['Rating']).copy()
# Fill the remaining gaps with each column's most frequent value. The result is
# assigned back instead of using the chained `df[col].fillna(..., inplace=True)`
# form, which pandas warns about and which stops working in pandas 3.0.
for column in playstore_df.columns:
    column_modes=playstore_df[column].mode()
    if not column_modes.empty:  # an all-NaN column has no mode to fill with
        playstore_df[column]=playstore_df[column].fillna(column_modes.iloc[0])
playstore_df=playstore_df.drop_duplicates()
# Keep only rows whose rating is at most 5.
playstore_df=playstore_df[playstore_df['Rating']<=5].copy()
# Reviews without text cannot be scored later, so they are removed.
user_reviews_df=user_reviews_df.dropna(subset=['Translated_Review'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  playstore_df[column].fillna(playstore_df[column].mode()[0],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playstore_df[column].fillna(playstore_df[column].mode()[0],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playstore_df.drop_duplicates(inplace=True)


In [None]:
# Inspect column dtypes; Reviews, Installs and Price are converted to numeric
# types in the Data Transformation section below.
playstore_df.dtypes

Unnamed: 0,0
App,object
Category,object
Rating,float64
Reviews,object
Size,object
Installs,object
Type,object
Price,object
Content Rating,object
Genres,object


## Data Transformation

In [None]:
#Data Transformation
# regex=False: '+' and '$' are regex metacharacters, so they are matched literally
# regardless of the pandas version's default for `regex`.
# astype(str) first so the cell can be re-run once the columns are already numeric.
playstore_df['Installs']=(
    playstore_df['Installs'].astype(str)
    .str.replace(',','',regex=False)
    .str.replace('+','',regex=False)
    .astype(int)
)
playstore_df['Price']=(
    playstore_df['Price'].astype(str)
    .str.replace('$','',regex=False)
    .astype(float)
)
playstore_df['Reviews']=playstore_df['Reviews'].astype(int)

## Merged Dataset

In [None]:
#Combined dataset
# Inner join on App: keeps only apps present in both tables, producing one row
# per (app row, review) pair.
merge_df=pd.merge(playstore_df,user_reviews_df,on='App',how='inner')

#Save the merged dataset
merge_df.to_csv('dataset.csv',index=False)

# Preview goes last: only a cell's final expression is rendered, so a head()
# call in the middle of the cell was silently discarded.
merge_df.head()

In [None]:
# Print the value frequencies of every column of the merged frame, one column at a time.
for column_name in merge_df.columns:
    column_counts = merge_df.value_counts(column_name)
    print(column_counts)

App
Helix Jump                                            1638
Bowmasters                                            1560
8 Ball Pool                                           1533
Angry Birds Classic                                   1365
Candy Crush Saga                                      1200
                                                      ... 
Apartment Decorating Ideas                               1
Calculator - unit converter                              1
CBS News                                                 1
Best Ovulation Tracker Fertility Calendar App Glow       1
HD Camera                                                1
Name: count, Length: 816, dtype: int64
Category
GAME                   17270
FAMILY                  5436
HEALTH_AND_FITNESS      2848
DATING                  2533
TRAVEL_AND_LOCAL        2419
PRODUCTIVITY            2340
PHOTOGRAPHY             2334
SPORTS                  2289
FINANCE                 1836
COMMUNICATION           1810
TOOLS    

In [None]:
# Check the column dtypes of the merged frame.
merge_df.dtypes

Unnamed: 0,0
App,object
Category,object
Rating,float64
Reviews,int64
Size,object
Installs,int64
Type,object
Price,float64
Content Rating,object
Genres,object


## Data Transformation (continued)

In [None]:
def convert_size(size):
  """Convert a Play Store size label to megabytes.

  Args:
    size: Size label such as ``'19M'`` or ``'512k'``. Values that are not
      strings (e.g. floats from an already converted column, or NaN) are
      passed through as floats when possible.

  Returns:
    The size in megabytes as a float, or ``np.nan`` for missing values and
    for labels without an ``M``/``k`` unit (e.g. ``'Varies with device'``).
  """
  # `'M' in size` raises TypeError on floats/NaN, which made re-running the
  # conversion cell (or feeding a column with missing sizes) crash.
  if not isinstance(size, str):
    try:
      return float(size)
    except (TypeError, ValueError):
      return np.nan
  if 'M' in size:
    return float(size.replace('M',''))
  elif 'k' in size:
    # kilobytes -> megabytes
    return float(size.replace('k',''))/1024
  else:
    return np.nan

In [None]:
# Replace the textual Size labels with megabytes (float); labels that
# convert_size cannot parse become NaN.
merge_df['Size']=merge_df['Size'].apply(convert_size)

In [None]:
#Log Installs & Log Reviews
# log1p (log(1 + x)) is used so that zero counts map to 0 instead of -inf.
for source_col, log_col in (('Installs','Log_Installs'),('Reviews','Log_Reviews')):
    merge_df[log_col]=np.log1p(merge_df[source_col])

In [None]:
# Preview the merged frame with the converted Size and the new log columns.
merge_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Log_Installs,Log_Reviews
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.25,1.0,13.122365,6.875232
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725,0.833333,13.122365,6.875232
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.0,0.0,13.122365,6.875232
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.5,0.6,13.122365,6.875232
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I hate,Negative,-0.8,0.9,13.122365,6.875232


In [None]:
#Add Rating Group column
def rating_group(rating):
    """Map a numeric rating to a coarse quality label.

    Lower bounds are checked from highest to lowest and the first one the
    rating reaches decides the label:

    * ``>= 4`` -> ``'Top Rated'``
    * ``>= 3`` -> ``'Above Average'``
    * ``>= 2`` -> ``'Average'``
    * anything else -> ``'Below Average'``
    """
    lower_bounds = ((4, 'Top Rated'), (3, 'Above Average'), (2, 'Average'))
    for lower_bound, label in lower_bounds:
        if rating >= lower_bound:
            return label
    return 'Below Average'

# Label every row with the bucket returned by rating_group for its Rating.
merge_df['Rating_Group'] = merge_df['Rating'].apply(rating_group)

In [None]:
#Revenue column
# Row-wise Price * Installs.
# NOTE(review): merge_df holds one row per review (inner merge on App), so the
# same app's value repeats on every one of its review rows — summing this
# column over rows counts each app multiple times.
merge_df['Revenue']=merge_df['Price']*merge_df['Installs']

## Sentiment Analysis

> NLP



In [None]:
# Score every review text with VADER and keep only the compound value.
sia = SentimentIntensityAnalyzer()
merge_df['Sentiment_Score'] = [
    sia.polarity_scores(review_text)['compound']
    for review_text in merge_df['Translated_Review']
]

# polarity_scores returns four values: positive, negative, neutral and compound.
# compound ranges from -1 (very negative) to +1 (very positive).

In [None]:
# Preview the merged frame with Rating_Group, Revenue and Sentiment_Score added.
merge_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,...,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Log_Installs,Log_Reviews,Rating_Group,Revenue,Sentiment_Score
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.25,1.0,13.122365,6.875232,Above Average,0.0,-0.25
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,4.0.3 and up,It bad >:(,Negative,-0.725,0.833333,13.122365,6.875232,Above Average,0.0,-0.802
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,4.0.3 and up,like,Neutral,0.0,0.0,13.122365,6.875232,Above Average,0.0,0.3612
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,4.0.3 and up,I love colors inspyering,Positive,0.5,0.6,13.122365,6.875232,Above Average,0.0,0.6369
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,4.0.3 and up,I hate,Negative,-0.8,0.9,13.122365,6.875232,Above Average,0.0,-0.5719


In [None]:
#Extract year
# Parse the textual dates once; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
last_updated = pd.to_datetime(merge_df['Last Updated'], errors='coerce')
merge_df['Last Updated'] = last_updated
merge_df['Year'] = last_updated.dt.year

merge_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,...,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Log_Installs,Log_Reviews,Rating_Group,Revenue,Sentiment_Score,Year
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,A kid's excessive ads. The types ads allowed a...,Negative,-0.25,1.0,13.122365,6.875232,Above Average,0.0,-0.25,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,It bad >:(,Negative,-0.725,0.833333,13.122365,6.875232,Above Average,0.0,-0.802,2018
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,like,Neutral,0.0,0.0,13.122365,6.875232,Above Average,0.0,0.3612,2018
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,I love colors inspyering,Positive,0.5,0.6,13.122365,6.875232,Above Average,0.0,0.6369,2018
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,...,I hate,Negative,-0.8,0.9,13.122365,6.875232,Above Average,0.0,-0.5719,2018


In [None]:
# Check the parsed 'Last Updated' values.
merge_df['Last Updated'].head()

Unnamed: 0,Last Updated
0,2018-01-15
1,2018-01-15
2,2018-01-15
3,2018-01-15
4,2018-01-15


## Plotly

> Interactive Visualization

In [None]:
# Directory that receives every exported HTML file; created if it is missing.
html_files_path = "./"
output_dir_exists = os.path.exists(html_files_path)
if not output_dir_exists:
    os.makedirs(html_files_path)

In [None]:
# Accumulates the HTML markup of every plot container; save_plot_as_html appends
# to it and the dashboard template is later filled with its content.
plot_containers=""

In [None]:
#Save each Plotly figure to HTML file
def save_plot_as_html(fig,filename,insight):
  """Export `fig` to its own HTML file and add it to the dashboard markup.

  Args:
    fig: Plotly figure to export.
    filename: File name, joined onto `html_files_path`, of the standalone
      export. It is also used as the container id and passed to the
      container's `openPlot(...)` click handler.
    insight: Caption shown in the container's `.insights` overlay.

  Side effects:
    Appends a `.plot-container` block to the global `plot_containers` string
    and writes the figure to `html_files_path/filename`.
  """
  import html  # local import: only needed to escape the caption text

  global plot_containers
  filepath=os.path.join(html_files_path,filename)
  # Inline plotly.js only with the first container. Later figures on the same
  # page reuse that copy; inlining the full library once per plot multiplied
  # the dashboard size by the number of figures.
  plotlyjs_mode='inline' if not plot_containers else False
  html_content = pio.to_html(fig, full_html=False, include_plotlyjs=plotlyjs_mode)
  # Escape &, < and > so a caption can never break the surrounding markup.
  safe_insight=html.escape(insight, quote=False)
  plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{safe_insight}</div>
    </div>
    """
  # The standalone file is opened on its own, so it keeps its own plotly.js copy.
  fig.write_html(filepath, full_html=False, include_plotlyjs='inline')

> Figures

In [None]:
# Shared look-and-feel settings for the figures and the dashboard CSS.
plot_width, plot_height = 400, 300   # figure size in pixels
plot_bg_color = 'black'              # plot and paper background
text_color = 'white'                 # font colour
title_font = dict(size=16)           # figure title font
axis_font = dict(size=12)            # axis title font

In [None]:
#Figure 1- Category
# Ten most frequent categories in the merged frame.
# NOTE(review): merge_df has one row per review (inner merge on App), so these
# are review-row counts per category, not distinct apps — confirm this is intended.
category_counts=merge_df['Category'].value_counts().nlargest(10)
fig1=px.bar(
    x=category_counts.index,
    y=category_counts.values,
    labels={'x':'Category','y':'Count'},
    title='Top Categories on Play Store',
    color=category_counts.index,
    color_discrete_sequence=px.colors.sequential.Plasma,
    # Shared constants keep the figure in sync with the dashboard CSS, which
    # sizes each container from plot_width/plot_height.
    width=plot_width,
    height=plot_height
)
fig1.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)
save_plot_as_html(fig1,"Category Graph 1.html","The top categories on Play Store are dominated by tools, entertainment, and productivity apps.")


In [None]:
#Figure 2- Type
# Share of each Type value among the merged rows.
# NOTE(review): rows are per review, so apps with many reviews weigh more — confirm intent.
type_cnts=merge_df['Type'].value_counts()
fig2=px.pie(
    names=type_cnts.index,
    values=type_cnts.values,
    title='App Type Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    # Shared constants keep the figure in sync with the dashboard CSS.
    width=plot_width,
    height=plot_height
)
fig2.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10,r=10,t=30,b=10)
)
save_plot_as_html(fig2,"App Type Graph 2.html","The majority of apps on Play Store are free.")

In [None]:
#Figure 3- Rating
# Histogram of the Rating column over all merged rows.
fig3=px.histogram(
    merge_df,
    x='Rating',
    nbins=20,
    title='Rating Distribution',
    color_discrete_sequence=['#636EFA'],
    # Shared constants keep the figure in sync with the dashboard CSS.
    width=plot_width,
    height=plot_height
)
fig3.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)
save_plot_as_html(fig3,"Rating Graph 3.html",
                  "Ratings are skewed towards higher values, suggesting that most apps are rated favorably by users.")

In [None]:
#Figure 4- Sentiment_Score
# Raw frequency of each distinct score (not plotted; kept so the name stays defined).
sentiment_counts=merge_df['Sentiment_Score'].value_counts()
# Sentiment_Score is a continuous compound value between -1 and +1, so the
# distribution is drawn as a binned histogram. A bar per distinct float gave
# an unreadable chart, and the numeric `color` made the discrete palette ignored.
fig4=px.histogram(
    merge_df,
    x='Sentiment_Score',
    nbins=20,
    labels={'Sentiment_Score':'Sentiment Score'},
    title='Sentiment Distribution',
    # single mid-tone from the same palette; readable on the black background
    color_discrete_sequence=[px.colors.sequential.RdPu[5]],
    width=plot_width,
    height=plot_height
)
fig4.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title='Count',title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)
save_plot_as_html(fig4,"Sentiment Graph 4.html",
                  "Sentiments in reviews show a mix of positive and negative feedback, with a slight lean towards positive sentiments.")


In [None]:
#Figure 5- Installs
# merge_df repeats each app once per review, so installs are summed over one
# row per (App, Category); otherwise every app's installs would be multiplied
# by its number of reviews.
installs_category=(
    merge_df.drop_duplicates(subset=['App','Category'])
    .groupby('Category')['Installs'].sum()
    .nlargest(10)
)
fig5=px.bar(
    # Horizontal bars: the numeric value goes on x and the category on y,
    # matching the axis labels below (the two were swapped before).
    x=installs_category.values,
    y=installs_category.index,
    orientation='h',
    labels={'x':'Installs','y':'Category'},
    title='Installs by Category',
    color=installs_category.index,
    color_discrete_sequence=px.colors.sequential.Blues,
    width=plot_width,
    height=plot_height
)
fig5.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)
save_plot_as_html(fig5,"Installs Graph 5.html",
                  "The categories with the most installs are social and communication apps, reflecting their broad appeal and daily usage.")

In [None]:
#Figure 6- Updates Per Year Plot
# Count merged rows per "Last Updated" year, in chronological order.
update_years = merge_df['Last Updated'].dt.year
updates_per_year = update_years.value_counts().sort_index()

fig6 = px.line(
    x=updates_per_year.index,
    y=updates_per_year.values,
    title='Number of Updates Over the Years',
    labels=dict(x='Year', y='Number of Updates'),
    color_discrete_sequence=['#AB63FA'],
    width=plot_width,
    height=plot_height,
)

# Dark theme shared by all dashboard figures.
fig6_layout = dict(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis={'title_font': axis_font},
    yaxis={'title_font': axis_font},
    margin={'l': 10, 'r': 10, 't': 30, 'b': 10},
)
fig6.update_layout(**fig6_layout)

save_plot_as_html(
    fig6,
    "Updates Graph 6.html",
    "Updates have been increasing over the years, showing that developers are actively maintaining and improving their apps.",
)

In [None]:
#Figure 7- Revenue
# Revenue (Price * Installs) is a per-app figure, but merge_df repeats each app
# once per review. Summing over one row per (App, Category) avoids multiplying
# every app's revenue by its number of reviews.
revenue_by_category=(
    merge_df.drop_duplicates(subset=['App','Category'])
    .groupby('Category')['Revenue'].sum()
    .nlargest(10)
)
fig7=px.bar(
    x=revenue_by_category.index,
    y=revenue_by_category.values,
    labels={'x':'Category','y':'Revenue'},
    title='Revenue by Category',
    color=revenue_by_category.index,
    color_discrete_sequence=px.colors.sequential.Greens,
    # Shared constants keep the figure in sync with the dashboard CSS.
    width=plot_width,
    height=plot_height
)
fig7.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)
save_plot_as_html(fig7,"Revenue Graph 7.html","Categories such as Business and Productivity lead in revenue generation, indicating their monetization potential.")

In [None]:
#Figure 8- Genres
# A Genres cell can hold several genres separated by ';'. Each is split out
# and counted on its own; the ten most frequent are plotted.
genre_counts=merge_df['Genres'].str.split(';',expand=True).stack().value_counts().nlargest(10)
fig8=px.bar(
    x=genre_counts.index,
    y=genre_counts.values,
    labels={'x':'Genre','y':'Count'},
    title='Top Genres',
    # Colour by the genres being plotted. This previously used
    # installs_category.index (Figure 5's categories), a copy-paste slip that
    # put unrelated category names in the legend.
    color=genre_counts.index,
    color_discrete_sequence=px.colors.sequential.OrRd,
    width=plot_width,
    height=plot_height
)
fig8.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)
save_plot_as_html(fig8,"Genre Graph 8.html","Action and Casual genres are the most common, reflecting users' preference for engaging and easy-to-play game.")

In [None]:
#Figure 9
# merge_df repeats each app's (Last Updated, Rating, Type) once per review, and
# those duplicates are drawn exactly on top of each other. Plotting only the
# distinct points gives the same picture with far fewer markers in the export.
update_points=merge_df[['Last Updated','Rating','Type']].drop_duplicates()
fig9=px.scatter(
    update_points,
    x='Last Updated',
    y='Rating',
    color='Type',
    title='Impact of Last Update on Rating',
    color_discrete_sequence=px.colors.qualitative.Vivid,
    # Shared constants keep the figure in sync with the dashboard CSS.
    width=plot_width,
    height=plot_height
)
fig9.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)
save_plot_as_html(fig9,"Update Graph 9.html","The Scatter Plot shows a weak correlation between the last update and ratings, suggesting that more frequent updates don't always result in better ratings.")

In [None]:
#Figure 10
# Box plot of Rating for each Type value.
# NOTE(review): rows are per review, so apps with many reviews weigh more in
# the quartiles — confirm this is intended.
fig10=px.box(
    merge_df,
    x='Type',
    y='Rating',
    color='Type',
    title='Rating for Paid vs Free Apps',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    # Shared constants keep the figure in sync with the dashboard CSS.
    width=plot_width,
    height=plot_height
)
fig10.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    xaxis=dict(title_font=axis_font),
    yaxis=dict(title_font=axis_font),
    margin=dict(l=10,r=10,t=30,b=10)
)
save_plot_as_html(fig10,"Paid Free Graph 10.html","Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for.")

In [None]:
#Splitting of different plots
# Split the accumulated container markup on every closing </div> tag.
# NOTE(review): the dashboard below is built from `plot_containers` directly;
# this list does not appear to be needed for it — confirm whether it is leftover.
plot_containers_split=plot_containers.split('</div>')

In [None]:
# Take the second-to-last fragment of the split markup and re-append the tag
# that split() removed; fall back to the full markup when nothing was split.
# NOTE(review): `final_plot` is not referenced by the dashboard code below — confirm use.
final_plot = (
    plot_containers_split[-2]+'</div>'
    if len(plot_containers_split) > 1
    else plot_containers
)

## Web-based Dashboard

In [None]:
# Page template for the dashboard. It is filled with str.format, so literal
# CSS/JS braces are doubled ({{ }}) and only {plots}, {plot_width} and
# {plot_height} are placeholders.
# Fixes relative to the earlier template: the viewport meta tag had a broken
# quote and `initial-scale-1.0`; `justify_content` is not a CSS property; and
# openPlot(), which every plot container calls on click, was never defined.
dashboard_html= """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title> Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444;
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
        </style>
        <script>
            // Click handler of each plot container: opens the standalone
            // export, saved next to this page, in a new tab.
            function openPlot(filename) {{
                window.open(encodeURI(filename), '_blank');
            }}
        </script>
    </head>
    <body>
        <div class= "header">
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
            <h1>Google Play Store Reviews Analytics</h1>
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
        </div>
        <div class="container">
            {plots}
        </div>
    </body>
    </html>
"""

In [None]:
# Fill the template: {plots} receives the accumulated plot containers, and
# {plot_width}/{plot_height} size each container in the CSS.
final_html=dashboard_html.format(plots=plot_containers,plot_width=plot_width,plot_height=plot_height)

In [None]:
# Write the rendered dashboard next to the individual plot exports.
dashboard_path = os.path.join(html_files_path, "dashboard.html")
with open(dashboard_path, mode="w", encoding="utf-8") as dashboard_file:
    dashboard_file.write(final_html)

In [None]:
# Open the generated dashboard in the default browser. The call returns False
# when no browser could be launched (e.g. on a remote or headless kernel).
webbrowser.open('file://'+os.path.realpath(dashboard_path))

False