# Getting Started

In [1]:
import pandas as pd
import numpy as np

In [2]:
link = 'https://raw.githubusercontent.com/AshishJangra27/Data-Analysis-with-Python-GFG/main/2.%20Dataset%20Walkthrough/googleplaystore.csv'

In [3]:
data = pd.read_csv(link)

In [4]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [5]:
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

# Data Preprocessing

In [6]:
# checking number of null values in every column

In [7]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [8]:
# thus the columns : rating, type, content rating, current version and android version have null values

In [9]:
# replacing null values with something else

In [10]:
df['Rating'] = df['Rating'].fillna(0)
df['Content Rating'] = df['Content Rating'].fillna('Unrated')
df['Current Ver'] = df['Current Ver'].fillna('Not available')
df['Android Ver'] = df['Android Ver'].fillna('Not available')
df['Type'] = df['Type'].fillna('0')
df.isnull().sum()
# dropna() can be used to drop the rows having null values

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

#### Analyzing numeric values

In [11]:
# finding average rating of apps : round figure

In [12]:
import statistics as stat
round(stat.mean(df['Rating']))

4

In [13]:
# checking how many apps have rating 5

In [14]:
len(df[df['Rating']==5])

274

#### Analyzing categorical values

In [15]:
df['Type'].unique()

array(['Free', 'Paid', '0'], dtype=object)

In [16]:
# finding total number of free and paid apps and percentage of free apps

In [17]:
free = len(df[df['Type']=='Free'])
paid = len(df[df['Type']=='Paid'])
print('free apps : ', free)
print('paid apps : ', paid)

free apps :  10039
paid apps :  800


In [18]:
print('percentage of free apps are : ', (free/(free+paid))*100)

percentage of free apps are :  92.61924531783376


In [19]:
# checking how many categories are there
len(df['Category'].unique())

34

In [20]:
# checking how many apps in art and design
len(df[df['Category']=='ART_AND_DESIGN'])

65

In [21]:
df['Content Rating'].unique()

array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+',
       'Adults only 18+', 'Unrated'], dtype=object)

In [22]:
# writing number of apps in all categories using for loop
for i in df['Category'].unique():
    n = len(df[df['Category']==i])
    print(i,' : ',n)

ART_AND_DESIGN  :  65
AUTO_AND_VEHICLES  :  85
BEAUTY  :  53
BOOKS_AND_REFERENCE  :  231
BUSINESS  :  460
COMICS  :  60
COMMUNICATION  :  387
DATING  :  234
EDUCATION  :  156
ENTERTAINMENT  :  149
EVENTS  :  64
FINANCE  :  366
FOOD_AND_DRINK  :  127
HEALTH_AND_FITNESS  :  341
HOUSE_AND_HOME  :  88
LIBRARIES_AND_DEMO  :  85
LIFESTYLE  :  382
GAME  :  1144
FAMILY  :  1972
MEDICAL  :  463
SOCIAL  :  295
SHOPPING  :  260
PHOTOGRAPHY  :  335
SPORTS  :  384
TRAVEL_AND_LOCAL  :  258
TOOLS  :  843
PERSONALIZATION  :  392
PRODUCTIVITY  :  424
PARENTING  :  60
WEATHER  :  82
VIDEO_PLAYERS  :  175
NEWS_AND_MAGAZINES  :  283
MAPS_AND_NAVIGATION  :  137
1.9  :  1


In [23]:
# doing the same for genre and content rating
for i in df['Genres'].unique():
    n = len(df[df['Genres']==i])
    print(i,' : ',n)
    
print('*'*100)
    
for i in df['Content Rating'].unique():
    n = len(df[df['Content Rating']==i])
    print(i,' : ',n)

Art & Design  :  58
Art & Design;Pretend Play  :  2
Art & Design;Creativity  :  7
Art & Design;Action & Adventure  :  2
Auto & Vehicles  :  85
Beauty  :  53
Books & Reference  :  231
Business  :  460
Comics  :  59
Comics;Creativity  :  1
Communication  :  387
Dating  :  234
Education;Education  :  50
Education  :  549
Education;Creativity  :  7
Education;Music & Video  :  5
Education;Action & Adventure  :  6
Education;Pretend Play  :  23
Education;Brain Games  :  5
Entertainment  :  623
Entertainment;Music & Video  :  27
Entertainment;Brain Games  :  8
Entertainment;Creativity  :  3
Events  :  64
Finance  :  366
Food & Drink  :  127
Health & Fitness  :  341
House & Home  :  88
Libraries & Demo  :  85
Lifestyle  :  381
Lifestyle;Pretend Play  :  1
Adventure;Action & Adventure  :  13
Arcade  :  220
Casual  :  193
Card  :  48
Casual;Pretend Play  :  31
Action  :  365
Strategy  :  107
Puzzle  :  140
Sports  :  398
Music  :  22
Word  :  29
Racing  :  98
Casual;Creativity  :  7
Casual;Action

In [24]:
# using describe function
df['Type'].describe()

count     10841
unique        3
top        Free
freq      10039
Name: Type, dtype: object

#### Handling null values using imputer

In [25]:
import pandas as pd
link = 'https://raw.githubusercontent.com/AshishJangra27/Data-Analysis-with-Python-GFG/main/2.%20Dataset%20Walkthrough/googleplaystore.csv'

In [26]:
data = pd.read_csv(link)
df = pd.DataFrame(data)
df.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


In [27]:
from sklearn.impute import SimpleImputer
import numpy as np
df['Rating'].isnull().sum()

1474

In [28]:
# Create a imputer to calculate the mean excluding the null values
# there are other stratergies apart from mean as well that you can use
A = SimpleImputer(missing_values=np.nan,strategy='mean')

In [29]:
# fit it to calculate the value
A.fit(np.array(df['Rating']).reshape(-1,1))

In [30]:
# use transform to replace the null value with the mean calculated
df['Rating'] = A.transform(np.array(df['Rating']).reshape(-1,1))

In [31]:
df['Rating'].isnull().sum()

0

In [32]:
# doing same for categorical value, but here we use mode
df['Content Rating'].isnull().sum()

1

In [33]:

print()
A = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
A.fit(np.array(df['Content Rating']).reshape(-1,1))





In [34]:
df['Content Rating'] = A.transform(np.array(df['Content Rating']).reshape(-1,1))

In [35]:
df['Content Rating'].isnull().sum()

0

In [36]:
# we dealt with null values in 1 numerical and 1 categorical column. rest null values we will drop since they are few in number

In [37]:
df = df.dropna()

# Data Analysis

In [48]:
# data analysis with multiple columns

In [49]:
# finding number of free apps in Art and Design

len(df[(df['Category'] == 'ART_AND_DESIGN') & (df['Type'] == 'Free')])


61

In [50]:
# finding number of apps in Art and Design with rating more than 4.5
len(df[(df['Category'] == 'ART_AND_DESIGN') & (df['Rating'] > 4.5)])

22

In [51]:
# finding number of free apps with rating more than 4.5 in family category
len(df[(df['Category'] == 'FAMILY') & (df['Rating'] > 4.5) & (df['Type'] > 'Free')])

50

In [56]:
# listing those apps
df[(df['Category'] == 'FAMILY') & (df['Rating'] > 4.5) & (df['Type'] > 'Free')]['App']

2151                                       Toca Life: City
2168                        Children Educational Game Full
2170                                             Hactar Go
2172                        World Racers family board game
2176                        Lanterns: The Harvest Festival
2177                          Tsuro - The Game of the Path
2189                                     Avokiddo Emotions
2202                                     Avokiddo Emotions
3993                        C4droid - C/C++ compiler & IDE
4057                                        Mind Games Pro
4260                                     Cut the Rope GOLD
4293                                                K.MOJI
4299                        Math Games for Pre-K - Grade 4
4301                Fuzzy Numbers: Pre-K Number Foundation
4316                                          Anna.K Tarot
4508                                          Q Avatar Pro
4556                                         Day R Premi

### Groupby and sort_values in pandas

In [62]:
#data in ascending order wrt rating
df.sort_values(by = 'Rating', ascending = True)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
7926,Tech CU Card Manager,FINANCE,1.0,2,7.2M,"1,000+",Free,0,Everyone,Finance,"July 25, 2017",1.0.1,4.0 and up
7806,CR Magazine,BUSINESS,1.0,1,7.8M,100+,Free,0,Everyone,Business,"July 23, 2014",2.4.2,2.3.3 and up
625,House party - live chat,DATING,1.0,1,9.2M,10+,Free,0,Mature 17+,Dating,"July 31, 2018",3.52,4.0.3 and up
10400,Familial Hypercholesterolaemia Handbook,MEDICAL,1.0,2,33M,100+,Free,0,Everyone,Medical,"July 2, 2018",2.0.1,4.1 and up
10591,Lottery Ticket Checker - Florida Results & Lotto,TOOLS,1.0,3,41M,500+,Free,0,Everyone,Tools,"December 12, 2017",1.0,4.2 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7377,CI 174 Gray Icon Pack,PERSONALIZATION,5.0,1,46M,10+,Paid,$0.99,Everyone,Personalization,"May 21, 2018",1.1,4.1 and up
5763,Tozer Devotional -Series 1,BOOKS_AND_REFERENCE,5.0,5,4.3M,"1,000+",Free,0,Everyone,Books & Reference,"October 8, 2016",1.0,2.3 and up
6816,BU Study,FAMILY,5.0,7,5.6M,10+,Free,0,Everyone,Education,"December 7, 2017",1.0,4.0.3 and up
5776,Food-Aw - Order Food Online in Aruba,FOOD_AND_DRINK,5.0,1,24M,100+,Free,0,Everyone,Food & Drink,"April 5, 2018",10,4.1 and up


In [63]:
# finding average rating of every category 
df.groupby('Category').mean()['Rating']

  df.groupby('Category').mean()['Rating']


Category
ART_AND_DESIGN         4.368438
AUTO_AND_VEHICLES      4.190824
BEAUTY                 4.260882
BOOKS_AND_REFERENCE    4.311537
BUSINESS               4.145987
COMICS                 4.156445
COMMUNICATION          4.163842
DATING                 4.007864
EDUCATION              4.387778
ENTERTAINMENT          4.126174
EVENTS                 4.363647
FAMILY                 4.192490
FINANCE                4.139108
FOOD_AND_DRINK         4.170709
GAME                   4.282506
HEALTH_AND_FITNESS     4.266296
HOUSE_AND_HOME         4.196819
LIBRARIES_AND_DEMO     4.182938
LIFESTYLE              4.112427
MAPS_AND_NAVIGATION    4.065061
MEDICAL                4.190167
NEWS_AND_MAGAZINES     4.142993
PARENTING              4.282223
PERSONALIZATION        4.306873
PHOTOGRAPHY            4.192179
PRODUCTIVITY           4.208287
SHOPPING               4.254052
SOCIAL                 4.248001
SPORTS                 4.218404
TOOLS                  4.065970
TRAVEL_AND_LOCAL       4.119716

In [68]:
# finding app with max average rating
df.groupby('Category').mean()['Rating'].sort_values(ascending=False)
# thus Education has max average rating

  df.groupby('Category').mean()['Rating'].sort_values(ascending=False)


Category
EDUCATION              4.387778
ART_AND_DESIGN         4.368438
EVENTS                 4.363647
BOOKS_AND_REFERENCE    4.311537
PERSONALIZATION        4.306873
GAME                   4.282506
PARENTING              4.282223
HEALTH_AND_FITNESS     4.266296
BEAUTY                 4.260882
SHOPPING               4.254052
SOCIAL                 4.248001
WEATHER                4.239675
SPORTS                 4.218404
PRODUCTIVITY           4.208287
HOUSE_AND_HOME         4.196819
FAMILY                 4.192490
PHOTOGRAPHY            4.192179
AUTO_AND_VEHICLES      4.190824
MEDICAL                4.190167
LIBRARIES_AND_DEMO     4.182938
FOOD_AND_DRINK         4.170709
COMMUNICATION          4.163842
COMICS                 4.156445
BUSINESS               4.145987
NEWS_AND_MAGAZINES     4.142993
FINANCE                4.139108
ENTERTAINMENT          4.126174
TRAVEL_AND_LOCAL       4.119716
LIFESTYLE              4.112427
VIDEO_PLAYERS          4.074858
TOOLS                  4.065970

In [69]:
# finding number of free apps in each category

In [73]:
df = df[df['Type']=='Free']
df.groupby('Category').count()['Type']

Category
ART_AND_DESIGN           61
AUTO_AND_VEHICLES        82
BEAUTY                   53
BOOKS_AND_REFERENCE     202
BUSINESS                446
COMICS                   60
COMMUNICATION           360
DATING                  227
EDUCATION               152
ENTERTAINMENT           147
EVENTS                   63
FAMILY                 1778
FINANCE                 349
FOOD_AND_DRINK          125
GAME                   1061
HEALTH_AND_FITNESS      325
HOUSE_AND_HOME           88
LIBRARIES_AND_DEMO       83
LIFESTYLE               363
MAPS_AND_NAVIGATION     132
MEDICAL                 354
NEWS_AND_MAGAZINES      281
PARENTING                58
PERSONALIZATION         308
PHOTOGRAPHY             313
PRODUCTIVITY            396
SHOPPING                258
SOCIAL                  292
SPORTS                  360
TOOLS                   764
TRAVEL_AND_LOCAL        246
VIDEO_PLAYERS           171
WEATHER                  74
Name: Type, dtype: int64