## Import Data

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# import and read data
df = pd.read_csv("E:/Data Analyst Portofilio Data/Datasets/Google Play Store Apps/googleplaystore.csv")
df

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


## Data Preprocessing

In [3]:
# Display information about dataset 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [4]:
# Check null values
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [5]:
# Remove '+' character from Installs column to be numerical  
df['Installs'] = df['Installs'].str.replace("+","")

# Replace 'M' character to 'MB' in size column
df['Size'] = df['Size'].str.replace("M","MB")

# Replace null values by average in Rating column
df['Rating'].replace(np.nan,df['Rating'].mean(),inplace=True)

# Drop null values for the rest of columns
df.dropna(inplace=True)

  df['Installs'] = df['Installs'].str.replace("+","")


In [6]:
# Check null values
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

In [7]:
# Check duplicated values
df.duplicated().sum()

483

In [8]:
# Remove duplicated values
df.drop_duplicates(inplace=True)

In [9]:
# Remove '.' from Installs column to be numerical
df['Installs'] = df['Installs'].str.replace(',','')

# Remove '$' from Price column to be numerical
df['Price'] = df['Price'].str.replace('$','')

  df['Price'] = df['Price'].str.replace('$','')


In [10]:
# Convert data type for Installs column to int
df[['Installs']] = df[['Installs']].astype('int')

# Convert data type for Reviews column to int
df[['Reviews']] = df[['Reviews']].astype('int')

# Convert data type for Price column to float
df[['Price']] = df[['Price']].astype('float')

In [11]:
# Rename Price column 
df.rename(columns={'Price':'Price($)'},inplace=True)

In [12]:
# Check data types for columns
df.dtypes

App                object
Category           object
Rating            float64
Reviews             int32
Size               object
Installs            int32
Type               object
Price($)          float64
Content Rating     object
Genres             object
Last Updated       object
Current Ver        object
Android Ver        object
dtype: object

In [13]:
# Display first 5 rows
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price($),Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19MB,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14MB,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7MB,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25MB,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8MB,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [14]:
# Display statistical summary for numerical columns
df.describe()

Unnamed: 0,Rating,Reviews,Installs,Price($)
count,10346.0,10346.0,10346.0,10346.0
mean,4.188718,406333.8,14172660.0,1.031561
std,0.484165,2698179.0,80280900.0,16.287252
min,1.0,0.0,0.0,0.0
25%,4.1,32.0,1000.0,0.0
50%,4.2,1688.5,100000.0,0.0
75%,4.5,46598.25,1000000.0,0.0
max,5.0,78158310.0,1000000000.0,400.0


In [15]:
# Display statistical summary for object columns
df.describe(include='object')

Unnamed: 0,App,Category,Size,Type,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10346,10346,10346,10346,10346,10346,10346,10346,10346
unique,9648,33,457,2,6,119,1376,2831,33
top,ROBLOX,FAMILY,Varies with device,Free,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1939,1525,9584,8372,840,298,1301,2379


In [16]:
# Display count of Apps that have the same name
df['App'].value_counts()

ROBLOX                                           9
8 Ball Pool                                      7
Zombie Catchers                                  6
Helix Jump                                       6
Bubble Shooter                                   6
                                                ..
Popsicle Launcher for Android P 9.0 launcher     1
PixelLab - Text on pictures                      1
P Launcher for Android™ 9.0                      1
Pacify (Android P theme) - Theme for Xperia™     1
iHoroscope - 2018 Daily Horoscope & Astrology    1
Name: App, Length: 9648, dtype: int64

In [17]:
# Display count of Categories for Apps 
df['Category'].value_counts()

FAMILY                 1939
GAME                   1121
TOOLS                   841
BUSINESS                427
MEDICAL                 408
PRODUCTIVITY            407
PERSONALIZATION         386
LIFESTYLE               373
COMMUNICATION           366
FINANCE                 360
SPORTS                  351
PHOTOGRAPHY             322
HEALTH_AND_FITNESS      306
SOCIAL                  280
NEWS_AND_MAGAZINES      264
TRAVEL_AND_LOCAL        237
BOOKS_AND_REFERENCE     229
SHOPPING                224
DATING                  196
VIDEO_PLAYERS           175
MAPS_AND_NAVIGATION     137
EDUCATION               130
FOOD_AND_DRINK          124
ENTERTAINMENT           111
AUTO_AND_VEHICLES        85
LIBRARIES_AND_DEMO       84
WEATHER                  82
HOUSE_AND_HOME           80
EVENTS                   64
ART_AND_DESIGN           64
PARENTING                60
COMICS                   60
BEAUTY                   53
Name: Category, dtype: int64

In [18]:
# Display count of Types for Apps
df['Type'].value_counts()

Free    9584
Paid     762
Name: Type, dtype: int64

In [19]:
 # Display count of Generes for Apps
df['Genres'].value_counts()

Tools                                840
Entertainment                        587
Education                            526
Business                             427
Medical                              408
                                    ... 
Parenting;Brain Games                  1
Travel & Local;Action & Adventure      1
Lifestyle;Pretend Play                 1
Tools;Education                        1
Strategy;Creativity                    1
Name: Genres, Length: 119, dtype: int64

## Exploratory Data Analysis

In [20]:
df.sample(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price($),Content Rating,Genres,Last Updated,Current Ver,Android Ver
5225,AJ Academy: Amazing Animals,FAMILY,4.5,1564,35MB,50000,Free,0.0,Everyone,Educational;Education,"May 25, 2018",1.1.0,4.1 and up
7390,CI Attendance,PRODUCTIVITY,4.193338,0,172k,5,Paid,0.99,Everyone,Productivity,"August 17, 2013",1.0.0,3.1 and up
5913,BA4You,BUSINESS,2.1,87,26MB,5000,Free,0.0,Everyone,Business,"April 26, 2018",1.6.0.0,4.0 and up
6542,Sam.BN,TOOLS,4.193338,83,2.0MB,1000,Free,0.0,Everyone,Tools,"March 27, 2015",1.0.0,4.0.3 and up
2422,Visualmed,MEDICAL,4.193338,0,3.1MB,1,Paid,2.99,Everyone,Medical,"August 1, 2018",1.0,4.1 and up
10532,Qarabağ FK,SPORTS,4.193338,9,5.5MB,100,Free,0.0,Everyone,Sports,"August 1, 2018",1.14,4.2 and up
2594,"Tagged - Meet, Chat & Dating",SOCIAL,4.1,486824,Varies with device,10000000,Free,0.0,Mature 17+,Social,"June 21, 2018",Varies with device,Varies with device
1058,Capitec Remote Banking,FINANCE,4.3,20672,Varies with device,1000000,Free,0.0,Everyone,Finance,"May 8, 2018",Varies with device,Varies with device
3184,Flights,TRAVEL_AND_LOCAL,4.4,18039,3.1MB,1000000,Free,0.0,Everyone,Travel & Local,"July 26, 2018",3.6.6,4.1 and up
10522,Drift Legends,GAME,4.2,33788,27MB,1000000,Free,0.0,Everyone,Racing,"March 29, 2018",1.8.5,4.1 and up


In [21]:
# 10 Apps that have highest Installs
df[['App','Installs']].groupby('App').sum()['Installs'].nlargest(10).to_frame()

Unnamed: 0_level_0,Installs
App,Unnamed: 1_level_1
Subway Surfers,5000000000
Google Photos,4000000000
Hangouts,4000000000
Google Chrome: Fast & Secure,3000000000
Google Drive,3000000000
Google News,3000000000
Instagram,3000000000
Maps - Navigate & Explore,3000000000
Candy Crush Saga,2500000000
Temple Run 2,2500000000


In [22]:
# 10 Apps that have highest Reviews
df[['App','Reviews']].groupby('App').sum()['Reviews'].nlargest(10).to_frame()

Unnamed: 0_level_0,Reviews
App,Unnamed: 1_level_1
Instagram,199664676
Facebook,156286514
Subway Surfers,138606606
WhatsApp Messenger,138228988
Clash of Clans,134667058
Messenger – Text and Video Chat for Free,113289425
Candy Crush Saga,112134492
8 Ball Pool,99386198
Clash Royale,92530298
Snapchat,68045010


In [23]:
# 10 Apps that have highest Price
df[['App','Price($)']].groupby('App').sum()['Price($)'].nlargest(10).to_frame()

Unnamed: 0_level_0,Price($)
App,Unnamed: 1_level_1
I'm Rich - Trump Edition,400.0
I AM RICH PRO PLUS,399.99
I Am Rich Premium,399.99
I Am Rich Pro,399.99
I am Rich,399.99
I am Rich Plus,399.99
I am Rich!,399.99
I am rich,399.99
I am rich (Most expensive app),399.99
I am rich(premium),399.99


In [24]:
# 10 Categories that have highest Installs
df[['Category','Installs']].groupby('Category').sum()['Installs'].nlargest(10).to_frame()

Unnamed: 0_level_0,Installs
Category,Unnamed: 1_level_1
GAME,31544024415
COMMUNICATION,24152276251
SOCIAL,12513867902
PRODUCTIVITY,12463091369
TOOLS,11452271905
FAMILY,10041632405
PHOTOGRAPHY,9721247655
TRAVEL_AND_LOCAL,6361887146
VIDEO_PLAYERS,6222002720
NEWS_AND_MAGAZINES,5393217760


In [25]:
# 10 Category that have highest Reviews
df[['Category','Reviews']].groupby('Category').sum()['Reviews'].nlargest(10).to_frame()

Unnamed: 0_level_0,Reviews
Category,Unnamed: 1_level_1
GAME,1415536650
COMMUNICATION,601273552
SOCIAL,533576829
FAMILY,396771746
TOOLS,273181033
PHOTOGRAPHY,204297410
VIDEO_PLAYERS,110380188
PRODUCTIVITY,102554498
SHOPPING,94931162
PERSONALIZATION,75192744


In [26]:

df_free_apps = df[df['Type'] == 'Free']

df_Paid_apps = df[df['Type'] == 'Paid']

In [27]:
# 10 Free Apps that have highest Installs
df_free_apps[['App','Installs']].groupby('App').sum()['Installs'].nlargest(10).to_frame()

Unnamed: 0_level_0,Installs
App,Unnamed: 1_level_1
Subway Surfers,5000000000
Google Photos,4000000000
Hangouts,4000000000
Google Chrome: Fast & Secure,3000000000
Google Drive,3000000000
Google News,3000000000
Instagram,3000000000
Maps - Navigate & Explore,3000000000
Candy Crush Saga,2500000000
Temple Run 2,2500000000


In [28]:
# 10 Paid Apps that have highest Installs
df_Paid_apps[['App','Installs']].groupby('App').sum()['Installs'].nlargest(10).to_frame()

Unnamed: 0_level_0,Installs
App,Unnamed: 1_level_1
Minecraft,20000000
Hitman Sniper,10000000
HD Widgets,2000000
Beautiful Widgets Pro,1000000
Bloons TD 5,1000000
Card Wars - Adventure Time,1000000
Cut the Rope GOLD,1000000
DraStic DS Emulator,1000000
Facetune - For Free,1000000
Five Nights at Freddy's,1000000


In [29]:
df.to_csv('E:/Data Analyst Portofilio Data/Datasets/Google Play Store Apps/googleplaystore_transformation.csv')