In [1]:
import os
import webbrowser
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [2]:
# Fetch the VADER lexicon that SentimentIntensityAnalyzer needs.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

**Loading Data**

In [3]:
# Read both raw CSV exports; the paths are relative to the notebook's working directory.
apps_df, reviews_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Play Store Data.csv', 'User Reviews.csv')
)

In [4]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [5]:
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


**Data Cleaning**

In [6]:
# Keep only apps that actually have a rating.
apps_df = apps_df.dropna(subset=['Rating'])

# Fill the remaining gaps with each column's most frequent value.
# The fill values are collected first and applied with one DataFrame.fillna
# call: `apps_df[column].fillna(..., inplace=True)` is chained in-place
# assignment on a column selection, which pandas warns about and which leaves
# the frame untouched when Copy-on-Write is enabled.
fill_values = {}
for column in apps_df.columns:
    column_mode = apps_df[column].mode()
    if not column_mode.empty:  # an all-NaN column has no mode to fill with
        fill_values[column] = column_mode[0]
apps_df = apps_df.fillna(value=fill_values)

# Remove rows that are identical in every column.
apps_df = apps_df.drop_duplicates()

# Ratings above 5 are invalid; `.copy()` makes the result an independent frame
# so later column assignments do not trigger SettingWithCopyWarning.
apps_df = apps_df[apps_df['Rating'] <= 5].copy()

# A review row without text cannot be scored for sentiment.
reviews_df = reviews_df.dropna(subset=['Translated_Review'])

**Data Transformation**

In [7]:

# Strip the thousands separators / trailing '+' from Installs and the '$' from
# Price, then convert to numbers.
# * regex=False: '+' and '$' are regex metacharacters, so they must be treated
#   as literal text explicitly rather than relying on the pandas default.
# * astype(str) first: lets the cell be re-run after the columns are already
#   numeric (the .str accessor would otherwise raise on int/float columns).
apps_df['Installs'] = (
    apps_df['Installs']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .astype(int)
)
apps_df['Price'] = (
    apps_df['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
)

  apps_df['Installs']=apps_df['Installs'].str.replace(',','').str.replace('+','').astype(int)
  apps_df['Price']=apps_df['Price'].str.replace('$','').astype(float)


In [8]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs            int32
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [9]:
# Inner-join every review onto its app's metadata (only apps present in both frames survive).
merged_df = apps_df.merge(reviews_df, how='inner', on='App')

In [10]:
merged_df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,A kid's excessive ads. The types ads allowed a...,Negative,-0.250000,1.000000
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,It bad >:(,Negative,-0.725000,0.833333
2,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,like,Neutral,0.000000,0.000000
3,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I love colors inspyering,Positive,0.500000,0.600000
4,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,I hate,Negative,-0.800000,0.900000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59119,A+ Gallery - Photos & Videos,PHOTOGRAPHY,4.5,223941,Varies with device,10000000,Free,0.0,Everyone,Photography,"August 6, 2018",Varies with device,Varies with device,Overall great app. Best gallery seen far,Positive,0.475000,0.512500
59120,A+ Gallery - Photos & Videos,PHOTOGRAPHY,4.5,223941,Varies with device,10000000,Free,0.0,Everyone,Photography,"August 6, 2018",Varies with device,Varies with device,"Recommended, 100% love it, keep good work dev ...",Positive,0.566667,0.733333
59121,A+ Gallery - Photos & Videos,PHOTOGRAPHY,4.5,223941,Varies with device,10000000,Free,0.0,Everyone,Photography,"August 6, 2018",Varies with device,Varies with device,Too much ads,Positive,0.200000,0.200000
59122,A+ Gallery - Photos & Videos,PHOTOGRAPHY,4.5,223941,Varies with device,10000000,Free,0.0,Everyone,Photography,"August 6, 2018",Varies with device,Varies with device,Just allow time ...,Neutral,0.000000,0.000000


In [11]:
def convert_size(size):
    """Convert a Play Store size label to a size in megabytes.

    Parameters
    ----------
    size : str or number
        A label such as ``'19M'`` or ``'512k'``, or a value that is already
        numeric.

    Returns
    -------
    float
        Megabytes for ``'<n>M'``; kilobytes divided by 1024 for ``'<n>k'``;
        the value itself (as float) when it is already numeric, so applying
        the function twice to the same column does not erase it; ``np.nan``
        for anything else (e.g. ``'Varies with device'``, ``None``).
    """
    # Already converted (or originally numeric): pass through unchanged.
    if isinstance(size, (int, float, np.integer, np.floating)) and not isinstance(size, bool):
        return float(size)
    if not isinstance(size, str):
        return np.nan

    label = size.strip()
    try:
        if label.endswith('M'):
            return float(label[:-1])
        if label.endswith('k'):
            return float(label[:-1]) / 1024
    except ValueError:
        # The numeric part is malformed; treat it as an unknown size below.
        pass
    return np.nan

In [12]:
# Replace the textual size labels with numeric megabyte values, element by element.
apps_df['Size'] = apps_df['Size'].map(convert_size)

In [13]:
# Natural-log versions of the two heavy-tailed count columns.
# Reviews is parsed to a number first (unparseable entries become NaN).
apps_df['Reviews'] = pd.to_numeric(apps_df['Reviews'], errors='coerce')
for count_column in ('Installs', 'Reviews'):
    apps_df[f'Log_{count_column}'] = np.log(apps_df[count_column])

In [14]:
apps_df.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size              float64
Installs            int32
Type               object
Price             float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
Log_Installs      float64
Log_Reviews       float64
dtype: object

In [15]:
#categorise rating

def rating_group(rating):
    """Return the descriptive tier label for a numeric app rating.

    Thresholds are checked from highest to lowest; a rating that meets none
    of them (including NaN, for which every comparison is false) is labelled
    'Below Average'.
    """
    tiers = (
        (4, 'Top rated app'),
        (3, 'Above average'),
        (2, 'Average'),
    )
    for minimum, label in tiers:
        if rating >= minimum:
            return label
    return 'Below Average'
# Label every app with its rating tier.
apps_df['Rating_Group'] = apps_df['Rating'].map(rating_group)

In [16]:
# Revenue metric: listed price multiplied by the install count.
apps_df['Revenue'] = apps_df['Price'].mul(apps_df['Installs'])

In [17]:
#extracting year column
# Step 1: parse the text dates; errors='coerce' turns unparseable values into NaT instead of raising.
apps_df['Last Updated']=pd.to_datetime(apps_df['Last Updated'],errors='coerce')

In [18]:
# Step 2: keep just the calendar year of the parsed 'Last Updated' timestamp.
apps_df['Year']=apps_df['Last Updated'].dt.year

In [19]:
# Force Size to a numeric dtype (non-numeric leftovers become NaN).
# NOTE(review): the earlier dtypes output already lists Size as float64, so this looks like a safety no-op.
apps_df['Size'] = pd.to_numeric(apps_df['Size'], errors='coerce')

In [20]:
apps_df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Log_Installs,Log_Reviews,Rating_Group,Revenue,Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,Free,0.0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,9.21034,5.068904,Top rated app,0.0,2018
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,Free,0.0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,13.122363,6.874198,Above average,0.0,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,Free,0.0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,15.424948,11.379508,Top rated app,0.0,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,Free,0.0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,17.727534,12.281384,Top rated app,0.0,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,Free,0.0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,11.512925,6.874198,Top rated app,0.0,2018


**Sentiment Analysis**

In [21]:
# One shared analyzer instance, reused by every scoring cell below.
sia = SentimentIntensityAnalyzer()

#Polarity Scores in SIA
#Positive, Negative, Neutral and Compound: close to -1 - Very negative ; close to +1 - Very positive

In [22]:
#sample review 1 - positive
# Sanity check: clearly positive wording should give a compound score near +1.
review = "This app is amazing! I love the new features."
sentiment_score= sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.42, 'pos': 0.58, 'compound': 0.8516}


In [23]:
#sample review 2 - negative
# Sanity check: clearly negative wording should give a compound score near -1.
review = "This app is very bad! I hate the new features."
sentiment_score= sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.535, 'neu': 0.465, 'pos': 0.0, 'compound': -0.8427}


In [24]:
#sample review 3 - average
# Sanity check: lukewarm wording should land between the two extremes above.
review = "This app is okay"
sentiment_score= sia.polarity_scores(review)
print(sentiment_score)

{'neg': 0.0, 'neu': 0.612, 'pos': 0.388, 'compound': 0.2263}


In [27]:
def _compound_score(review_text):
    """Return the VADER compound score for one review, coercing the value to str first."""
    return sia.polarity_scores(str(review_text))['compound']

# Score every translated review with the shared analyzer.
reviews_df['Sentiment_Score'] = reviews_df['Translated_Review'].apply(_compound_score)

In [28]:
reviews_df.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_Score
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,0.9531
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,0.6597
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,0.6249
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,0.6369
5,10 Best Foods for You,Best way,Positive,1.0,0.3,0.6369


**Data Visualization**

In [29]:
#creating a directory for the html files
html_files_path="./"
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(html_files_path, exist_ok=True)

In [30]:
# Accumulates the HTML of every saved plot; save_plot_as_html appends to it.
# Re-run this cell before re-running the figure cells, otherwise plots are appended twice.
plot_containers=""

In [31]:
# Save each Plotly figure to an HTML file
def save_plot_as_html(fig, filename, insight):
    """Register a figure on the dashboard and write it to its own HTML file.

    Parameters
    ----------
    fig : plotly figure
        The figure to render.
    filename : str
        File name (joined onto ``html_files_path``) for the standalone page.
        It is also used as the container's ``id`` and as the argument of the
        ``openPlot`` click handler.
    insight : str
        Text placed in the container's ``insights`` div. It is inserted
        as-is, so it must already be safe HTML.

    Side effects
    ------------
    Appends one ``plot-container`` block to the module-level
    ``plot_containers`` string and writes ``filename`` to disk.
    """
    global plot_containers
    filepath = os.path.join(html_files_path, filename)
    # Fragment (<div> only) for embedding inside the dashboard page.
    html_content = pio.to_html(fig, full_html=False, include_plotlyjs='inline')
    # Append the plot and its insight to plot_containers
    plot_containers += f"""
    <div class="plot-container" id="{filename}" onclick="openPlot('{filename}')">
        <div class="plot">{html_content}</div>
        <div class="insights">{insight}</div>
    </div>
    """
    # The standalone file is opened directly in a browser tab by openPlot(),
    # so it is written as a complete HTML document (with <html> and a charset
    # declaration) rather than as a bare <div> fragment.
    fig.write_html(filepath, full_html=True, include_plotlyjs='inline')

#defining plots
# Shared look-and-feel settings: container size in pixels, colours, font sizes.
plot_width, plot_height = 400, 300
plot_bg_color, text_color = 'black', 'white'
title_font = dict(size=16)
axis_font = dict(size=12)

In [32]:
#figure 1 - stacked bar chart

#filters to be applied -
#1.apps with more than 1,000 reviews
apps_filtered = apps_df[apps_df['Reviews'] > 1000]

#2.group by the top 5 categories.
top_categories = apps_filtered.groupby('Category')['Reviews'].sum().nlargest(5).index
apps_filtered = apps_filtered[apps_filtered['Category'].isin(top_categories)]

# Attach each review to its app's category and rating.
# apps_df was only de-duplicated on whole rows, so an app name may still occur
# on several rows; merging against those would count every review once per
# repeated row. Keep one row per app name before joining.
# NOTE(review): the first listing wins when an app appears more than once -
# confirm that is the intended tie-break.
app_ratings = apps_filtered[['App', 'Category', 'Rating']].drop_duplicates(subset='App')
merged_df = reviews_df.merge(app_ratings, on='App')

#3.segmented by rating groups
def categorize_rating(rating):
    """Bucket a numeric rating into one of the three star-group labels.

    Anything below 3 (and NaN, for which both comparisons are false) falls
    into the '1-2 Stars' bucket.
    """
    if rating >= 4:
        return '4-5 Stars'
    elif rating >= 3:
        return '3-4 Stars'
    else:
        return '1-2 Stars'

merged_df['Rating Group'] = merged_df['Rating'].apply(categorize_rating)

sentiment_counts = merged_df.groupby(['Category', 'Rating Group', 'Sentiment']).size().reset_index(name='Count')

# Pin the x-axis order so the groups read low -> high in every facet, even
# when a category has no reviews in one of the groups.
rating_group_order = ['1-2 Stars', '3-4 Stars', '4-5 Stars']

fig1 = px.bar(
    sentiment_counts,
    x='Rating Group',
    y='Count',
    color='Sentiment',
    barmode='stack',
    facet_col='Category',
    category_orders={'Rating Group': rating_group_order},
    title='Sentiment Distribution by Rating Group (Top 5 Categories)',
    labels={'Count': 'Number of Reviews', 'Rating Group': 'Star Rating'},
    color_discrete_map={'Positive': 'green', 'Neutral': 'gray', 'Negative': 'red'}
)

# Use the shared style constants instead of repeating their literal values.
fig1.update_layout(
    plot_bgcolor=plot_bg_color,
    paper_bgcolor=plot_bg_color,
    font_color=text_color,
    title_font=title_font,
    margin=dict(l=10,r=10,t=30,b=10)
)
# A faceted figure has one x/y axis per facet; `xaxis=`/`yaxis=` in
# update_layout would only style the first one, so apply the font to all.
fig1.update_xaxes(title_font=axis_font)
fig1.update_yaxes(title_font=axis_font)

# NOTE(review): this insight talks about paid vs free ratings, which this
# sentiment chart does not show - it looks copied from another figure.
# Replace it with a finding that matches the plot.
save_plot_as_html(fig1,"Sentiment Distribution 1.html","Paid apps generally have higher ratings compared to free apps, suggesting that users expect higher quality from apps they pay for")


In [33]:
#figure 2 - dual axis chart

# Reduce 'Android Ver' to its leading "major.minor" number (NaN when no such number is present).
apps_df['Android Ver'] = apps_df['Android Ver'].astype(str).str.extract(r'(\d+\.\d+)')[0].astype(float)

#filters to be applied -

#1.applying required filters:
passes_filters = (
    apps_df['Installs'].ge(10000)
    # Free apps are allowed even if revenue < 10,000
    & (apps_df['Revenue'].ge(10000) | apps_df['Type'].eq('Free'))
    & apps_df['Android Ver'].gt(4.0)
    & apps_df['Size'].gt(15)
    & apps_df['Content Rating'].eq('Everyone')
    & apps_df['App'].str.len().le(30)
)
filtered_apps = apps_df[passes_filters]

#2.top 3 categories based on the number of filtered apps
top_3_categories = filtered_apps['Category'].value_counts().nlargest(3).index
filtered_apps = filtered_apps[filtered_apps['Category'].isin(top_3_categories)]

#3.calculating installs and revenue separately for Free and Paid apps
# Every (category, type) pair gets a row; pairs with no apps are filled with 0.
categories = top_3_categories.tolist()
types = ['Free', 'Paid']
full_index = pd.MultiIndex.from_product([categories, types], names=['Category', 'Type'])

category_summary = (
    filtered_apps
    .groupby(['Category', 'Type'])[['Installs', 'Revenue']]
    .mean()
    .reindex(full_index)
    .fillna(0)
    .reset_index()
)

free_rows = category_summary[category_summary['Type'] == 'Free']
paid_rows = category_summary[category_summary['Type'] == 'Paid']

fig2 = go.Figure()

# Bars on the left axis: average installs, Free and Paid side by side
free_installs_bar = go.Bar(
    x=free_rows['Category'],
    y=free_rows['Installs'],
    name='Avg Installs (Free)',
    marker={'color': 'lightblue', 'opacity': 0.7},
    yaxis='y1',
)
paid_installs_bar = go.Bar(
    x=paid_rows['Category'],
    y=paid_rows['Installs'],
    name='Avg Installs (Paid)',
    marker={'color': 'orange', 'opacity': 0.9},
    yaxis='y1',
)
# Line on the right axis: average revenue of paid apps
paid_revenue_line = go.Scatter(
    x=paid_rows['Category'],
    y=paid_rows['Revenue'],
    name='Avg Revenue (Paid)',
    mode='lines+markers',
    line={'color': 'red', 'width': 3},
    marker={'size': 10},
    yaxis='y2',
)
fig2.add_trace(free_installs_bar)
fig2.add_trace(paid_installs_bar)
fig2.add_trace(paid_revenue_line)

installs_axis = {
    'title': 'Average Installs',
    'side': 'left',
    'showgrid': True,
    'rangemode': 'tozero',
}
revenue_axis = {
    'title': 'Average Revenue (Log Scale)',
    'side': 'right',
    'overlaying': 'y',
    'showgrid': True,
    'tickmode': 'sync',
    'rangemode': 'tozero',
    'type': 'log',
}
fig2.update_layout(
    title='Avg Installs & Revenue for Free vs Paid Apps (Top 3 Categories)',
    xaxis={'title': 'Category', 'categoryorder': 'category ascending'},
    yaxis=installs_axis,
    yaxis2=revenue_axis,
    barmode='group',
    legend={'title': 'Metric', 'bgcolor': 'rgba(255, 255, 255, 0.7)'},
    plot_bgcolor='black',
    paper_bgcolor='black',
    font={'size': 14, 'color': 'white'},
)

save_plot_as_html(fig2,"Installs and Revenue Comparison 2.html","The revenue generated is high in the 'Family' category apps as the most number apps are paid, whereas there are more number of installs in the 'Gaming'")


In [34]:
#figure 3 - bubble chart

#filters to be applied

#1.required categories
allowed_categories = ['GAME', 'BEAUTY', 'BUSINESS', 'COMICS', 'COMMUNICATION', 'DATING', 'ENTERTAINMENT', 'SOCIAL', 'EVENTS']

#2.apply filters
# One row per (app, review) so the per-review subjectivity filter can be applied.
bubble_chart_filtered = apps_df.merge(reviews_df[['App', 'Sentiment_Subjectivity', 'Translated_Review']], on='App')
bubble_chart_filtered = bubble_chart_filtered[
    (bubble_chart_filtered['Rating'] > 3.5) &
    (bubble_chart_filtered['Category'].isin(allowed_categories)) &
    (bubble_chart_filtered['Reviews'] > 500) &
    (bubble_chart_filtered['Sentiment_Subjectivity'] > 0.5) &
    (bubble_chart_filtered['Installs'] > 50000) &
    (bubble_chart_filtered['Size'] > 15)
]

# Every surviving review of an app carries the same Size / Rating / Installs /
# Category, so plotting all rows would stack identical bubbles on top of each
# other and bloat the HTML. Plot each distinct bubble once.
bubble_points = bubble_chart_filtered.drop_duplicates(
    subset=['App', 'Category', 'Size', 'Rating', 'Installs']
)

# Create bubble chart
fig3 = px.scatter(
    bubble_points,
    x='Size',
    y='Rating',
    size='Installs',
    color='Category',
    title='App Size vs. Average Rating (Bubble Size = Installs)',
    labels={'Size': 'App Size (MB)', 'Rating': 'Average Rating'},
    hover_name='App'
)

# White text, matching figures 1 and 2: with the default font colour the
# title, axis labels and legend are barely readable on a black background.
fig3.update_layout(
    plot_bgcolor='black',
    paper_bgcolor='black',
    font=dict(size=14, color='white')
)

save_plot_as_html(fig3,"Bubble chart 3.html","Out of all the categories, Gaming apps have more number installs with a decent rating")

**Rendering into html file**

In [35]:
# Break the accumulated markup apart at every closing </div> tag.
plot_containers_split = plot_containers.split(sep='</div>')

In [36]:
# Take the second-to-last piece (with its closing tag restored) when the split
# produced more than one piece; otherwise fall back to the whole string.
# NOTE(review): final_plot is not referenced by the dashboard cells visible below.
final_plot = (
    plot_containers_split[-2] + '</div>'
    if len(plot_containers_split) > 1
    else plot_containers
)

In [37]:
# Page template for the dashboard. It is filled in with str.format(), so
# literal CSS/JS braces are doubled ({{ }}) and the only placeholders are
# {plots}, {plot_width} and {plot_height}.
# Fixed relative to the earlier version:
#   * viewport meta tag: missing opening quote and "initial-scale-1.0"
#   * "justify_content" is not a CSS property -> "justify-content"
#   * missing ';' after the .plot-container border, which invalidated both
#     the border and the following margin declaration
#   * ".plot-container: hover" (space after the colon) never matched, so the
#     insight overlay could not appear on hover
dashboard_html= """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width,initial-scale=1.0">
    <title> Google Play Store Review Analytics</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            background-color: #333;
            color: #fff;
            margin: 0;
            padding: 0;
        }}
        .header {{
            display: flex;
            align-items: center;
            justify-content: center;
            padding: 20px;
            background-color: #444
        }}
        .header img {{
            margin: 0 10px;
            height: 50px;
        }}
        .container {{
            display: flex;
            flex-wrap: wrap;
            justify-content: center;
            padding: 20px;
        }}
        .plot-container {{
            border: 2px solid #555;
            margin: 10px;
            padding: 10px;
            width: {plot_width}px;
            height: {plot_height}px;
            overflow: hidden;
            position: relative;
            cursor: pointer;
        }}
        .insights {{
            display: none;
            position: absolute;
            right: 10px;
            top: 10px;
            background-color: rgba(0,0,0,0.7);
            padding: 5px;
            border-radius: 5px;
            color: #fff;
        }}
        .plot-container:hover .insights {{
            display: block;
        }}
        </style>
        <script>
            function openPlot(filename) {{
                window.open(filename, '_blank');
                }}
        </script>
    </head>
    <body>
        <div class= "header">
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/4/4a/Logo_2013_Google.png/800px-Logo_2013_Google.png" alt="Google Logo">
            <h1>Google Play Store Reviews Analytics</h1>
            <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Google_Play_Store_badge_EN.svg/1024px-Google_Play_Store_badge_EN.svg.png" alt="Google Play Store Logo">
        </div>
        <div class="container">
            {plots}
        </div>
    </body>
    </html>
    """

In [38]:
# Fill the page template with the container size and the accumulated plot markup.
final_html = dashboard_html.format(
    plot_width=plot_width,
    plot_height=plot_height,
    plots=plot_containers,
)

In [39]:
# Location of the dashboard page, next to the per-plot HTML files.
dashboard_filename = "web page2.html"
dashboard_path = os.path.join(html_files_path, dashboard_filename)

In [40]:
# Write the rendered page to disk as UTF-8 text.
with open(dashboard_path, mode="w", encoding="utf-8") as f:
    f.write(final_html)

In [41]:
# Open the dashboard in the default browser.
# Path.as_uri() builds a well-formed file URI (forward slashes, three-slash
# prefix, percent-encoded spaces), which hand-concatenating 'file://' with an
# OS path does not - the dashboard file name itself contains a space.
webbrowser.open(Path(dashboard_path).resolve().as_uri())

True