# Google App Store EDA

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling as yd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import file
df = pd.read_csv("./data/04_googleplaystore.csv")

In [3]:
# Chek the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10841 non-null  object 
 9   Genres          10840 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10839 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.1+ MB


In [4]:
# Lets have a look on data
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [5]:
# Take samples from big dataset
df.sample(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
982,Comedy Central,ENTERTAINMENT,3.9,22378,19M,"1,000,000+",Free,0,Teen,Entertainment,"July 8, 2018",11.45.0,4.4 and up
7823,CS Interview Questions (TechQ),FAMILY,4.8,33,6.7M,"5,000+",Free,0,Everyone,Education,"January 27, 2017",1.5,4.1 and up


In [6]:
df.describe()

Unnamed: 0,Rating,Reviews
count,9367.0,10841.0
mean,4.191513,444111.9
std,0.515735,2927629.0
min,1.0,0.0
25%,4.0,38.0
50%,4.3,2094.0
75%,4.5,54768.0
max,5.0,78158310.0


#  write down all the numeric variables ans make them numeric?
1. size
2. install
3. price

In [7]:
# To do that let's have look a unique values
df['Size'].value_counts()

Size
Varies with device    1695
11M                    198
12M                    196
14M                    194
13M                    191
                      ... 
253k                     1
992k                     1
658k                     1
73k                      1
246k                     1
Name: count, Length: 461, dtype: int64

1. convert KBs into M
2. then remove M from all numbers
3. Handle `varies with device`

In [8]:
df['Size'].isnull().sum()

np.int64(0)

In [9]:
def convert_size(size):
    if size == 'varies with device' or pd.isnull(size):
        return np.nan
    size = size.strip().upper()
    if size.endswith('M'):
        return float(size[:-1])
    elif size.endswith('K'):
        return float(size[:-1]) / 1024
    else:
        return np.nan
    
df['Size_MB'] = df['Size'].apply(convert_size)

In [10]:
df.drop('Size', axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_MB
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8


In [12]:
df['Size_MB'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Size_MB'].fillna(0, inplace=True)


# install

In [13]:
df['Installs'].value_counts()

Installs
1,000,000+        1579
10,000,000+       1252
100,000+          1169
10,000+           1054
1,000+             908
5,000,000+         752
100+               719
500,000+           539
50,000+            479
5,000+             477
100,000,000+       409
10+                386
500+               330
50,000,000+        289
50+                205
5+                  82
500,000,000+        72
1+                  67
1,000,000,000+      58
0+                  14
0                    1
Name: count, dtype: int64

In [14]:
# unwanted characters remove kiye: ',' and '+'
df['Installs'] = df['Installs'].str.replace('[+,]', '', regex=True)

# ab string ko integer me convert kiya
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

# Following binnig method

# How to handle Price?

In [15]:
df['Price'].value_counts()

Price
0          10041
$0.99        148
$2.99        129
$1.99         73
$4.99         72
           ...  
$3.61          1
$394.99        1
$1.26          1
$1.20          1
$1.04          1
Name: count, Length: 92, dtype: int64

In [16]:
# Find missing value
df['Price'].isnull().sum()

np.int64(0)

## Remove $ sign from all columns using pandas

In [17]:
df['Type'].value_counts()

Type
Free    10040
Paid      800
Name: count, dtype: int64

In [18]:
# 'Free' ko '0' main replace krna hai
df['Price'] = df['Price'].replace('Free', '0')

# '$' symbol hatana hai
df['Price'] = df['Price'].str.replace('$', '', regex = False)

# String ko float main convert kiya
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

In [19]:
# run automatic eda using ydata_profiling 
profile = yd.ProfileReport(df)
profile.to_file(output_file='./outputs/04_ydata_googleplaystore_kaggle.html')

100%|██████████| 13/13 [00:01<00:00,  9.42it/s]1<00:00,  7.32it/s, Describe variable: Size_MB]      
Summarize dataset: 100%|██████████| 48/48 [00:06<00:00,  7.23it/s, Completed]                 
Generate report structure: 100%|██████████| 1/1 [00:05<00:00,  5.10s/it]
Render HTML: 100%|██████████| 1/1 [00:02<00:00,  2.00s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 75.51it/s]


In [20]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Size_MB
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,10000,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,500000,Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,5000000,Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8.7
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,50000000,Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up,25.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,100000,Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up,2.8


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10840 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  int64  
 4   Installs        10841 non-null  int64  
 5   Type            10840 non-null  object 
 6   Price           10841 non-null  float64
 7   Content Rating  10841 non-null  object 
 8   Genres          10840 non-null  object 
 9   Last Updated    10841 non-null  object 
 10  Current Ver     10833 non-null  object 
 11  Android Ver     10839 non-null  object 
 12  Size_MB         10841 non-null  float64
dtypes: float64(3), int64(2), object(8)
memory usage: 1.1+ MB


In [22]:
df.describe()

Unnamed: 0,Rating,Reviews,Installs,Price,Size_MB
count,9367.0,10841.0,10841.0,10841.0,10841.0
mean,4.191513,444111.9,15462910.0,1.027273,18.150386
std,0.515735,2927629.0,85025570.0,15.948971,22.170303
min,1.0,0.0,0.0,0.0,0.0
25%,4.0,38.0,1000.0,0.0,2.6
50%,4.3,2094.0,100000.0,0.0,9.2
75%,4.5,54768.0,5000000.0,0.0,26.0
max,5.0,78158310.0,1000000000.0,400.0,100.0
