## Detailed EDA using Pandas on Google Play Store

In [1]:
# import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import ydata_profiling as yd

In [2]:
# Read the raw Google Play Store CSV into a DataFrame.
df_app = pd.read_csv(
    filepath_or_buffer='../kaggle/data/play_store/googleplaystore.csv',
)

In [3]:
# Preview the first five rows of the raw data.
df_app.head(n=5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
# Print column names, non-null counts and dtypes of the raw frame.
df_app.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [5]:
# Look at 10 randomly chosen rows. The fixed seed keeps the selection
# reproducible across kernel restarts (re-run the cell to refresh the output).
df_app.sample(10, random_state=42)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
751,CppDroid - C/C++ IDE,EDUCATION,4.1,29980,Varies with device,"1,000,000+",Free,0,Everyone,Education,"August 17, 2017",Varies with device,Varies with device
7074,WISE- MOBILE PORTAL,TOOLS,,3,1.0M,500+,Free,0,Everyone,Tools,"July 30, 2015",1.0.0.4,2.2 and up
1786,Episode - Choose Your Story,GAME,4.3,1841061,Varies with device,"50,000,000+",Free,0,Teen,Simulation,"July 31, 2018",Varies with device,Varies with device
8521,DL Hughley,FAMILY,4.6,12,10M,"1,000+",Free,0,Mature 17+,Entertainment,"March 23, 2018",2.0.4,4.1 and up
2149,codeSpark Academy & The Foos,FAMILY,4.1,4522,57M,"500,000+",Free,0,Everyone,Educational;Education,"June 21, 2018",2.19.01,4.1 and up
6046,Best Browser BD social networking,COMMUNICATION,4.8,6,21M,10+,Free,0,Everyone,Communication,"July 4, 2018",2.0,4.1 and up
580,Black White Interracial Dating - Interracial M...,DATING,4.1,537,28M,"100,000+",Free,0,Mature 17+,Dating,"March 29, 2018",2.0.0,4.1 and up
9961,Light Meter - EV,PHOTOGRAPHY,4.0,26,8.5M,"1,000+",Free,0,Everyone,Photography,"August 2, 2018",4.1.1,4.4 and up
10091,EY ATL Fuel Calculator,PRODUCTIVITY,4.4,15,3.1M,500+,Free,0,Everyone,Productivity,"July 21, 2018",12,2.1 and up
7694,CP Smart Check List,PERSONALIZATION,,1,3.9M,10+,Free,0,Everyone,Personalization,"August 14, 2017",0.0.1,4.1 and up


In [6]:
# Drop the row whose 'Installs' field holds the text 'Free'; it cannot be
# converted to an integer in the Installs clean-up below.
# .copy() makes the result an independent DataFrame, so the column assignments
# in the following cells do not trigger SettingWithCopyWarning.
df_app = df_app[df_app['Installs'] != 'Free'].copy()

In [7]:
# Summary statistics of the numeric columns (at this point only Rating is numeric).
df_app.describe()

Unnamed: 0,Rating
count,9366.0
mean,4.191757
std,0.515219
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,5.0


In [8]:
# Columns that hold numeric information but were loaded as strings (object dtype):
# Size, Installs, Price, Reviews

In [9]:
# Clean the Installs column: values look like "10,000+", so strip the trailing
# '+' and the thousands separators, then cast to a 64-bit integer.
# regex=False forces literal matching: '+' is a regex metacharacter and the
# default of `regex` differs between pandas 1.x (True) and 2.x (False).
# astype(str) first keeps the cell re-runnable once Installs is already int.
df_app['Installs'] = (
    df_app['Installs']
    .astype(str)
    .str.replace('+', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(np.int64)
)
df_app['Installs']

0           10000
1          500000
2         5000000
3        50000000
4          100000
           ...   
10836        5000
10837         100
10838        1000
10839        1000
10840    10000000
Name: Installs, Length: 10840, dtype: int64

In [10]:
# Bin the Installs column into coarse popularity buckets.
# Bin edges: intervals are closed on the right, e.g. 10,000 installs -> '1k-10k'.
bin_edges = [0, 1000, 10000, 100000, np.inf]

# One label per interval between consecutive edges.
bin_labels = ['0-1k', '1k-10k', '10k-100k', '100k+']

# include_lowest=True also closes the first interval on the left, so apps with
# exactly 0 installs land in '0-1k' instead of becoming NaN.
df_app['Installs_binned'] = pd.cut(
    df_app['Installs'],
    bins=bin_edges,
    labels=bin_labels,
    include_lowest=True,
)

# Display the updated DataFrame
df_app.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Installs_binned
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,1k-10k
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,500000,Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,100k+
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,100k+
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,100k+
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,10k-100k


In [11]:
# Clean the Price column: strip the '$' sign and convert to float.
# regex=False forces literal matching: '$' is a regex end-of-string anchor and
# the default of `regex` differs between pandas 1.x (True) and 2.x (False).
# astype(str) first keeps the cell re-runnable once Price is already float.
df_app['Price'] = (
    df_app['Price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .astype(float)
)
df_app['Price']

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
10836    0.0
10837    0.0
10838    0.0
10839    0.0
10840    0.0
Name: Price, Length: 10840, dtype: float64

In [12]:
# Normalise the Size column to megabytes stored as float.
# Raw values carry a unit suffix ('M' or 'k'), e.g. "19M", or read "Varies with device".
size_text = df_app['Size'].replace('Varies with device', np.nan)  # unknown size -> NaN
df_app['Suffix'] = size_text.str[-1]  # unit letter, kept as a helper column (dropped in a later cell)
size_number = size_text.str.replace('k', '').str.replace('M', '').astype(float)  # strip unit, parse number
is_kilobytes = df_app['Suffix'] == 'k'
# Kilobyte values are divided by 1024 to express them in MB; everything else is kept as is.
df_app['Size'] = size_number.where(~is_kilobytes, size_number / 1024).round(2)



In [13]:
# Inspect the last 20 cleaned sizes; NaN marks apps whose size was 'Varies with device'.
df_app['Size'].iloc[-20:]

10821     2.50
10822     3.10
10823     2.90
10824    82.00
10825     7.70
10826      NaN
10827    13.00
10828    13.00
10829     7.40
10830     2.30
10831     9.80
10832     0.57
10833     0.60
10834     2.60
10835     9.60
10836    53.00
10837     3.60
10838     9.50
10839      NaN
10840    19.00
Name: Size, dtype: float64

In [15]:
# The helper column holding the size unit letter is no longer needed; remove it.
df_app = df_app.drop(columns='Suffix')

In [14]:
# Optional: automatic EDA report with ydata-profiling.
# Disabled; to use it, also enable the commented-out `ydata_profiling` import in the first cell.
# profile = yd.ProfileReport(df_app)
# profile.to_file(output_file="../kaggle/output/report_google_play_store.html")