<a href="https://colab.research.google.com/github/smileSD/Data-manipulation-and-visualization/blob/main/Google_Play_Store_App_Analytics_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

Comprehensive analysis of the Android app market by comparing thousands of apps in the Google Play store.

# About the Dataset of Google Play Store Apps & Reviews

In [None]:
import pandas as pd
import plotly.express as px

In [None]:
# Show numeric output in decimal format e.g., 2.15
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
df_apps = pd.read_csv('apps.csv')

# Data Cleaning

In [None]:
df_apps.shape

(10841, 12)

In [None]:
df_apps.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size_MBs', 'Installs', 'Type',
       'Price', 'Content_Rating', 'Genres'],
      dtype='object')

In [None]:
df_apps.sample(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Last_Updated,Android_Ver
6170,Map and Router Badge,MAPS_AND_NAVIGATION,4.4,3652,32.0,500000,Free,0,Everyone,Maps & Navigation,"July 25, 2018",4.1 and up
9241,Fun Kid Racing - Motocross,FAMILY,4.1,59768,19.0,10000000,Free,0,Everyone,Racing;Action & Adventure,"August 7, 2018",4.2 and up
5310,Room planner: Interior & Floorplan Design for ...,HOUSE_AND_HOME,4.0,4281,27.0,100000,Free,0,Everyone,House & Home,"August 5, 2018",4.0.3 and up
4796,Ghost Snap AR Horror Survival,GAME,3.8,898,33.0,100000,Free,0,Teen,Adventure,"December 21, 2016",2.3 and up
153,Ej-buy,BUSINESS,,2,4.1,5,Free,0,Everyone,Business,"August 2, 2018",4.1 and up


In [None]:
df_apps.drop(['Last_Updated', 'Android_Ver'],axis=1,inplace=True)
df_apps.head()

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
0,Ak Parti Yardım Toplama,SOCIAL,,0,8.7,0,Paid,$13.99,Teen,Social
1,Ain Arabic Kids Alif Ba ta,FAMILY,,0,33.0,0,Paid,$2.99,Everyone,Education
2,Popsicle Launcher for Android P 9.0 launcher,PERSONALIZATION,,0,5.5,0,Paid,$1.49,Everyone,Personalization
3,Command & Conquer: Rivals,FAMILY,,0,19.0,0,,0,Everyone 10+,Strategy
4,CX Network,BUSINESS,,0,10.0,0,Free,0,Everyone,Business


In [None]:
nan_rows=df_apps[df_apps.Rating.isna()]
print(nan_rows.shape)


(1474, 10)


In [None]:
df_apps_clean=df_apps.dropna()
df_apps_clean.shape

(9367, 10)

In [None]:
df_duplicate=df_apps_clean.duplicated()
print(df_duplicate.shape)
df_duplicate.head()

(9367,)


21    False
28    False
47    False
82    False
99    False
dtype: bool

In [None]:
df_apps_clean=df_apps_clean.drop_duplicates(subset=['App','Installs','Price','Type'])
df_apps_clean.shape


(8211, 10)

In [None]:
df_apps_clean[df_apps_clean.App=='Instagram']

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
10806,Instagram,SOCIAL,4.5,66577313,5.3,1000000000,Free,0,Teen,Social


# Find Highest Rated Apps


In [None]:
df_apps_clean.sort_values('Rating',ascending=False).head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
21,KBA-EZ Health Guide,MEDICAL,5.0,4,25.0,1,Free,0,Everyone,Medical
1230,Sway Medical,MEDICAL,5.0,3,22.0,100,Free,0,Everyone,Medical
1227,AJ Men's Grooming,LIFESTYLE,5.0,2,22.0,100,Free,0,Everyone,Lifestyle
1224,FK Dedinje BGD,SPORTS,5.0,36,2.6,100,Free,0,Everyone,Sports
1223,CB VIDEO VISION,PHOTOGRAPHY,5.0,13,2.6,100,Free,0,Everyone,Photography
1222,"Beacon Baptist Jupiter, FL",LIFESTYLE,5.0,14,2.6,100,Free,0,Everyone,Lifestyle
1214,BV Mobile Apps,PRODUCTIVITY,5.0,3,4.8,100,Free,0,Everyone,Productivity
1206,ADS-B Driver,TOOLS,5.0,2,6.3,100,Paid,$1.99,Everyone,Tools
1183,Railroad Radio Vancouver BC,FAMILY,5.0,4,1.7,100,Free,0,Teen,Entertainment
2544,CS & IT Interview Questions,FAMILY,5.0,43,3.3,1000,Free,0,Everyone,Education


# Find 5 Largest Apps in terms of Size (MBs)

In [None]:
df_apps_clean.sort_values('Size_MBs',ascending=False).head()

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
9942,Talking Babsy Baby: Baby Games,LIFESTYLE,4.0,140995,100.0,10000000,Free,0,Everyone,Lifestyle;Pretend Play
10687,Hungry Shark Evolution,GAME,4.5,6074334,100.0,100000000,Free,0,Teen,Arcade
9943,Miami crime simulator,GAME,4.0,254518,100.0,10000000,Free,0,Mature 17+,Action
9944,Gangster Town: Vice District,FAMILY,4.3,65146,100.0,10000000,Free,0,Mature 17+,Simulation
3144,Vi Trainer,HEALTH_AND_FITNESS,3.6,124,100.0,5000,Free,0,Everyone,Health & Fitness


# App with Most Reviews

In [None]:
ratings=df_apps_clean.Content_Rating.value_counts()
ratings

Everyone           6633
Teen                912
Mature 17+          357
Everyone 10+        305
Adults only 18+       3
Unrated               1
Name: Content_Rating, dtype: int64

# Pie Chart : Content Ratings



In [None]:
fig=px.pie(labels=ratings.index, values=ratings.values, title="Content Rtaing", names=ratings.index)
fig.update_traces(textposition="outside", textinfo="label+percent")
fig.show()

# Examine the Number of Installs

In [None]:
df_apps_clean.Installs=df_apps_clean.Installs.astype(str).str.replace(",","")
df_apps_clean.Installs=pd.to_numeric(df_apps_clean.Installs)
df_apps_clean[["App","Installs"]].groupby("Installs").count()

Unnamed: 0_level_0,App
Installs,Unnamed: 1_level_1
1,3
5,9
10,69
50,56
100,303
500,199
1000,698
5000,425
10000,988
50000,457


# Finding the Most Expensive Apps, Filter out the Junk


In [None]:
df_apps_clean.Price=df_apps_clean.Price.astype(str).str.replace("$","")
df_apps_clean.Price=pd.to_numeric(df_apps_clean.Price)
df_apps_clean.sort_values("Price",ascending=False).head(20)


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
3946,I'm Rich - Trump Edition,LIFESTYLE,3.6,275,7.3,10000,Paid,400.0,Everyone,Lifestyle
3221,I am Rich Plus,FAMILY,4.0,856,8.7,10000,Paid,399.99,Everyone,Entertainment
3145,I am rich(premium),FINANCE,3.5,472,0.94,5000,Paid,399.99,Everyone,Finance
1946,I am rich (Most expensive app),FINANCE,4.1,129,2.7,1000,Paid,399.99,Teen,Finance
2461,I AM RICH PRO PLUS,FINANCE,4.0,36,41.0,1000,Paid,399.99,Everyone,Finance
2775,I Am Rich Pro,FAMILY,4.4,201,2.7,5000,Paid,399.99,Everyone,Entertainment
1331,most expensive app (H),FAMILY,4.3,6,1.5,100,Paid,399.99,Everyone,Entertainment
3554,💎 I'm rich,LIFESTYLE,3.8,718,26.0,10000,Paid,399.99,Everyone,Lifestyle
4606,I Am Rich Premium,FINANCE,4.1,1867,4.7,50000,Paid,399.99,Everyone,Finance
3114,I am Rich,FINANCE,4.3,180,3.8,5000,Paid,399.99,Everyone,Finance


### The most expensive apps sub $250

In [None]:
df_apps_clean=df_apps_clean[df_apps_clean['Price']>250]
df_apps_clean.sort_values("Price", ascending=False).head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
3946,I'm Rich - Trump Edition,LIFESTYLE,3.6,275,7.3,10000,Paid,400.0,Everyone,Lifestyle
1331,most expensive app (H),FAMILY,4.3,6,1.5,100,Paid,399.99,Everyone,Entertainment
1946,I am rich (Most expensive app),FINANCE,4.1,129,2.7,1000,Paid,399.99,Teen,Finance
2394,I am Rich!,FINANCE,3.8,93,22.0,1000,Paid,399.99,Everyone,Finance
2461,I AM RICH PRO PLUS,FINANCE,4.0,36,41.0,1000,Paid,399.99,Everyone,Finance


### Highest Grossing Paid Apps

In [None]:
df_apps_clean['Revenue_Estimate']=df_apps_clean.Installs.mul(df_apps_clean.Price)
df_apps_clean.sort_values("Revenue_Estimate", ascending=False).head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Revenue_Estimate
5765,I am rich,LIFESTYLE,3.8,3547,1.8,100000,Paid,399.99,Everyone,Lifestyle,39999000.0
4606,I Am Rich Premium,FINANCE,4.1,1867,4.7,50000,Paid,399.99,Everyone,Finance,19999500.0
3946,I'm Rich - Trump Edition,LIFESTYLE,3.6,275,7.3,10000,Paid,400.0,Everyone,Lifestyle,4000000.0
3221,I am Rich Plus,FAMILY,4.0,856,8.7,10000,Paid,399.99,Everyone,Entertainment,3999900.0
3554,💎 I'm rich,LIFESTYLE,3.8,718,26.0,10000,Paid,399.99,Everyone,Lifestyle,3999900.0
3897,I Am Rich,FAMILY,3.6,217,4.9,10000,Paid,389.99,Everyone,Entertainment,3899900.0
3856,I am rich VIP,LIFESTYLE,3.8,411,2.6,10000,Paid,299.99,Everyone,Lifestyle,2999900.0
2775,I Am Rich Pro,FAMILY,4.4,201,2.7,5000,Paid,399.99,Everyone,Entertainment,1999950.0
3114,I am Rich,FINANCE,4.3,180,3.8,5000,Paid,399.99,Everyone,Finance,1999950.0
3145,I am rich(premium),FINANCE,3.5,472,0.94,5000,Paid,399.99,Everyone,Finance,1999950.0


In [None]:
top_category=df_apps_clean.Category.value_counts().head(10)

### Bar Chart - Highest Competition (Number of Apps)

In [None]:
bar=px.bar(x=top_category.index, y=top_category.values)
bar.show()

### Horizontal Bar Chart - Most Popular Categories (Highest Downloads)

In [None]:
category_installs=df_apps_clean.groupby('Category').agg({"Installs":pd.Series.sum})
category_installs.sort_values("Installs", ascending=True, inplace =True)
category_installs

Unnamed: 0_level_0,Installs
Category,Unnamed: 1_level_1
FAMILY,25100
FINANCE,63000
LIFESTYLE,131000


In [None]:
h_bar=px.bar(x=category_installs.Installs, y=category_installs.index)
h_bar.update_layout(xaxis_title="Number of downloads", yaxis_title="category name")
h_bar.show()

### Category Concentration - Downloads vs. Competition

In [None]:
cat_number=df_apps_clean.groupby("Category").agg({"App":pd.Series.count})
merged_df=cat_number.merge(category_installs, on="Category", how="inner")
merged_df.sort_values("App", ascending=False)

Unnamed: 0_level_0,App,Installs
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
FINANCE,6,63000
LIFESTYLE,5,131000
FAMILY,4,25100


In [None]:
scatter=px.scatter(x=merged_df.App, y=merged_df.Installs, size=merged_df.App,color=merged_df.Installs, hover_name=merged_df.Installs)
scatter.update_layout(xaxis_title="Number of Apps",
                      yaxis_title="Installs")
scatter.show()

In [None]:
stack=df_apps_clean.Genres.str.split(';', expand=True).stack()
stack.head()

1331  0    Entertainment
1946  0          Finance
2193  0        Lifestyle
2394  0          Finance
2461  0          Finance
dtype: object

In [None]:
num_genre=stack.value_counts()
num_genre

Finance          6
Lifestyle        5
Entertainment    4
dtype: int64

# Bar Charts - Competition in Genres

In [None]:
bar=px.bar(x=num_genre.index, y=num_genre.values, color=num_genre.values, color_continuous_scale='Agsunset')
bar.update_layout(xaxis_title="Genre", yaxis_title="Number of apps")
bar.show()

# Free vs. Paid Apps per Category

In [None]:
df_free_vs_paid=df_apps_clean.groupby(["Category","Type"], as_index=False).agg({"App":pd.Series.count})
df_free_vs_paid.head()

Unnamed: 0,Category,Type,App
0,FAMILY,Paid,4
1,FINANCE,Paid,6
2,LIFESTYLE,Paid,5


In [None]:
g_bar=px.bar(x=df_free_vs_paid.Category, y=df_free_vs_paid.App, color=df_free_vs_paid.Type, barmode="group")
g_bar.update_layout(xaxis_title="Category", yaxis_title="Number of Apps")
g_bar.show()

# Box Plots: Lost Downloads for Paid Apps


In [None]:
box=px.box(x=df_apps_clean.Type, y=df_apps_clean.Installs, color=df_apps_clean.Type, notched=True, points='all')
box.show()