In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [8]:
df = pd.read_csv('googleplaystore.csv')
df.sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
8654,"TickTick: To Do List with Reminder, Day Planner",PRODUCTIVITY,4.6,25370,Varies with device,"1,000,000+",Free,0,Everyone,Productivity,"August 6, 2018",Varies with device,Varies with device
4117,Inputting Plus: Ctrl + Z/F/C/V,PRODUCTIVITY,4.2,1117,3.0M,"50,000+",Free,0,Everyone,Productivity,"October 18, 2017",1.0.6,5.0 and up
9845,Empires and Allies,FAMILY,4.3,398746,Varies with device,"10,000,000+",Free,0,Everyone 10+,Strategy,"August 6, 2018",Varies with device,4.1 and up
6846,Bu Hangi Dizi ?,GAME,,14,19M,"1,000+",Free,0,Everyone,Trivia,"May 4, 2018",3.1.7z,4.0.3 and up
6078,YAY - TBH,SOCIAL,4.1,2520,12M,"100,000+",Free,0,Teen,Social,"November 17, 2017",1.1.5,4.3 and up


##### 1. Which of the following column(s) has/have null values?

In [9]:
df.isna().sum().sort_values(ascending=False)

Rating            1474
Current Ver          8
Android Ver          3
Type                 1
Content Rating       1
App                  0
Category             0
Reviews              0
Size                 0
Installs             0
Price                0
Genres               0
Last Updated         0
dtype: int64

##### 2. Clean the `Rating` column and the other columns containing null values

In [10]:
df.loc[df['Rating'] > 5, 'Rating'] = np.nan
df['Rating'] = df['Rating'].fillna(df['Rating'].mean())
df.dropna(inplace=True)

##### 3. Clean the column `Reviews` and make it numeric

In [11]:
#Clean the column Reviews and make it numeric
df.loc[df['Reviews'].str.contains('M'), 'Reviews'] = (pd.to_numeric(
    df.loc[df['Reviews'].str.contains('M'), 'Reviews'].str.replace('M', '')) * 1_000_000).astype('str')
df['Reviews'] = pd.to_numeric(df['Reviews'])

##### 4. How many duplicated apps are there?

In [12]:
#How many duplicated apps are there?
df['App'].duplicated(keep=False).sum()

1979

###### 5. Drop duplicated apps keeping the ones with the greatest number of reviews

In [13]:
#Drop duplicated apps keeping the ones with the greatest number of reviews
df_sorted = df.sort_values(by=['App', 'Reviews'])
df_sorted.loc[
    df_sorted['App'].duplicated(keep=False) & ~df_sorted.duplicated(keep=False),
    ['App', 'Reviews']
].head(5)

Unnamed: 0,App,Reviews
5415,365Scores - Live Scores,666246
3083,365Scores - Live Scores,666521
3953,8 Ball Pool,14184910
1675,8 Ball Pool,14198297
1703,8 Ball Pool,14198602


##### 6. Format the `Category` column

In [14]:
#Format the Category column
df['Category'] = df['Category'].str.replace('_', ' ').str.capitalize()

##### 7. Clean and convert the `Installs` column to numeric type

In [15]:
#Clean and convert the Installs column to numeric type
df['Installs'] = df['Installs'].str.replace('+', '').str.replace(',', '')
df['Installs'] = pd.to_numeric(df['Installs'])

##### 8. Clean and convert the `Size` column to numeric (representing bytes)

In [16]:
#Clean and convert the Size column to numeric (representing bytes)
df['Size'] = df['Size'].replace('Varies with device', "0").astype(str)

df.loc[df['Size'].str.contains('M'), 'Size'] = (pd.to_numeric(
    df.loc[df['Size'].str.contains('M'), 'Size'].str.replace('M', '')
) * (1024 * 1024)).astype(str)

df.loc[df['Size'].str.contains('k'), 'Size'] = (pd.to_numeric(
    df.loc[df['Size'].str.contains('k'), 'Size'].str.replace('k', '')
) * 1024).astype(str)

df['Size'] = df['Size'].str.replace('+', '')
df['Size'] = df['Size'].str.replace(',', '')
df['Size'] = pd.to_numeric(df['Size'])

##### 9. Clean and convert the `Price` column to numeric

In [17]:
#Clean and convert the Price column to numeric
df.loc[df['Price'] == 'Free', 'Price'] = "0"
df['Price'] = df['Price'].str.replace('$', '').str.replace(',', '.')
df['Price'] = pd.to_numeric(df['Price'])

##### 10. Paid or free?

In [18]:
#Paid or free?
df['Distribution'] = df['Price'].apply(lambda x: 'Paid' if x > 0 else 'Free')

# Analysis

##### 11. What company has the most reviews?

In [19]:
#What company has the most reviews?
df.sort_values(by='Reviews', ascending=False)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Distribution
2544,Facebook,Social,4.100000,78158306,0.0,1000000000,Free,0.0,Teen,Social,"August 3, 2018",Varies with device,Varies with device,Free
3943,Facebook,Social,4.100000,78128208,0.0,1000000000,Free,0.0,Teen,Social,"August 3, 2018",Varies with device,Varies with device,Free
336,WhatsApp Messenger,Communication,4.400000,69119316,0.0,1000000000,Free,0.0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device,Free
381,WhatsApp Messenger,Communication,4.400000,69119316,0.0,1000000000,Free,0.0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device,Free
3904,WhatsApp Messenger,Communication,4.400000,69109672,0.0,1000000000,Free,0.0,Everyone,Communication,"August 3, 2018",Varies with device,Varies with device,Free
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473,Warfarin Dose Calculator,Medical,4.191757,0,7235174.4,50,Free,0.0,Everyone,Medical,"July 15, 2018",3.0.0,4.1 and up,Free
9913,Programi podrške EU,Communication,4.191757,0,5662310.4,100,Free,0.0,Everyone,Communication,"January 15, 2018",1.0.3,4.1 and up,Free
2471,MyWoundDoctor - Provider,Medical,4.191757,0,3670016.0,1,Free,0.0,Everyone,Medical,"July 27, 2018",1.2.1,5.0 and up,Free
2470,Lord Fairfax EMS Council,Medical,4.191757,0,31457280.0,50,Free,0.0,Everyone,Medical,"July 17, 2018",1,4.0.3 and up,Free


##### 12. Which is the category with the most most uploaded apps?

In [20]:
#Which is the category with the most most uploaded apps?
df['Category'].value_counts()

Category
Family                 1968
Game                   1144
Tools                   841
Medical                 463
Business                460
Productivity            424
Personalization         390
Communication           387
Sports                  384
Lifestyle               382
Finance                 366
Health and fitness      341
Photography             335
Social                  295
News and magazines      283
Shopping                260
Travel and local        258
Dating                  234
Books and reference     230
Video players           175
Education               156
Entertainment           149
Maps and navigation     137
Food and drink          127
House and home           88
Auto and vehicles        85
Libraries and demo       84
Weather                  82
Events                   64
Art and design           64
Parenting                60
Comics                   60
Beauty                   53
Name: count, dtype: int64

##### 13. To which category belongs the most expensive app?

In [21]:
#To which category belongs the most expensive app?
df.sort_values(by='Price', ascending=False)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Distribution
4367,I'm Rich - Trump Edition,Lifestyle,3.6,275,7654604.8,10000,Paid,400.00,Everyone,Lifestyle,"May 3, 2018",1.0.1,4.1 and up,Paid
4362,💎 I'm rich,Lifestyle,3.8,718,27262976.0,10000,Paid,399.99,Everyone,Lifestyle,"March 11, 2018",1.0.0,4.4 and up,Paid
5369,I am Rich,Finance,4.3,180,3984588.8,5000,Paid,399.99,Everyone,Finance,"March 22, 2018",1.0,4.2 and up,Paid
5364,I am rich (Most expensive app),Finance,4.1,129,2831155.2,1000,Paid,399.99,Teen,Finance,"December 6, 2017",2,4.0.3 and up,Paid
5362,I Am Rich Pro,Family,4.4,201,2831155.2,5000,Paid,399.99,Everyone,Entertainment,"May 30, 2017",1.54,1.6 and up,Paid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3651,Weather From DMI/YR,Weather,4.3,2143,0.0,100000,Free,0.00,Everyone,Weather,"July 31, 2018",Varies with device,Varies with device,Free
3652,wetter.com - Weather and Radar,Weather,4.2,189313,39845888.0,10000000,Free,0.00,Everyone,Weather,"August 6, 2018",Varies with device,Varies with device,Free
3653,Storm Radar: Tornado Tracker & Hurricane Alerts,Weather,4.6,89868,0.0,1000000,Free,0.00,Everyone,Weather,"June 12, 2018",Varies with device,Varies with device,Free
3654,Yandex.Weather,Weather,4.5,309617,0.0,10000000,Free,0.00,Everyone,Weather,"July 23, 2018",Varies with device,Varies with device,Free


##### 14. What's the name of the most expensive game?

In [22]:
#What's the name of the most expensive game?
df[df['Category']=='Game'].sort_values(by='Price', ascending=False)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Distribution
4203,The World Ends With You,Game,4.6,4108,13631488.0,10000,Paid,17.99,Everyone 10+,Arcade,"December 14, 2015",1.0.4,4.0 and up,Paid
10782,Trine 2: Complete Story,Game,3.8,252,11534336.0,10000,Paid,16.99,Teen,Action,"February 27, 2015",2.22,5.0 and up,Paid
6341,Blackjack Verite Drills,Game,4.6,17,4928307.2,100,Paid,14.00,Teen,Casino,"July 9, 2017",1.1.10,3.0 and up,Paid
1838,Star Wars ™: DIRTY,Game,4.5,38207,15728640.0,100000,Paid,9.99,Teen,Role Playing,"October 19, 2015",1.0.6,4.1 and up,Paid
6198,Backgammon NJ for Android,Game,4.4,1644,15728640.0,10000,Paid,7.99,Everyone,Board,"April 5, 2017",4.1,2.3.3 and up,Paid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3944,Four In A Line,Game,3.8,22191,3145728.0,1000000,Free,0.00,Everyone,Board,"May 4, 2015",1.10,2.3 and up,Free
3955,Cardi B Piano Game,Game,3.1,50,7759462.4,10000,Free,0.00,Everyone,Music,"November 15, 2017",2.0,4.0 and up,Free
3958,MORTAL KOMBAT X,Game,4.4,3039889,18874368.0,10000000,Free,0.00,Mature 17+,Action,"June 27, 2018",1.18.2,4.0 and up,Free
3960,► MultiCraft ― Free Miner! 👍,Game,4.3,1305050,0.0,50000000,Free,0.00,Everyone 10+,Adventure,"July 29, 2018",1.1.11.11,4.1 and up,Free


##### 15. Which is the most popular Finance App?

In [23]:
#Which is the most popular Finance App?
df[df['Category']=='Finance'].sort_values(by='Installs', ascending=False)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Distribution
1083,Google Pay,Finance,4.200000,347838,0.0,100000000,Free,0.00,Everyone,Finance,"July 26, 2018",2.70.206190089,Varies with device,Free
1096,Google Pay,Finance,4.200000,347838,0.0,100000000,Free,0.00,Everyone,Finance,"July 26, 2018",2.70.206190089,Varies with device,Free
1163,Google Pay,Finance,4.200000,347874,0.0,100000000,Free,0.00,Everyone,Finance,"July 26, 2018",2.70.206190089,Varies with device,Free
5601,Google Pay,Finance,4.200000,348132,0.0,100000000,Free,0.00,Everyone,Finance,"July 26, 2018",2.70.206190089,Varies with device,Free
1156,PayPal,Finance,4.300000,659760,49283072.0,50000000,Free,0.00,Everyone,Finance,"July 18, 2018",6.28.0,4.4 and up,Free
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9104,Dz kayas,Finance,4.191757,0,14680064.0,1,Paid,28.99,Everyone,Finance,"July 12, 2018",1.0,4.2 and up,Paid
10745,FP Boss,Finance,4.191757,1,6081740.8,1,Free,0.00,Everyone,Finance,"July 27, 2018",1.0.2,5.0 and up,Free
9101,amm dz,Finance,4.191757,0,14680064.0,1,Paid,5.99,Everyone,Finance,"July 8, 2018",1.0,4.2 and up,Paid
9917,Eu Sou Rico,Finance,4.191757,0,1468006.4,0,Paid,394.99,Everyone,Finance,"July 11, 2018",1.0,4.0.3 and up,Paid


##### 16. What *Teen* Game has the most reviews?

In [24]:
#What Teen Game has the most reviews?
df[(df['Category']=='Game') & (df['Content Rating']=='Teen')].sort_values(by='Reviews', ascending=False)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Distribution
3912,Asphalt 8: Airborne,Game,4.500000,8389714,96468992.0,100000000,Free,0.00,Teen,Racing,"July 4, 2018",3.7.1a,4.0.3 and up,Free
5417,Mobile Legends: Bang Bang,Game,4.400000,8219586,103809024.0,100000000,Free,0.00,Teen,Action,"July 24, 2018",1.2.97.3042,4.0.3 and up,Free
1988,Hungry Shark Evolution,Game,4.500000,6074627,104857600.0,100000000,Free,0.00,Teen,Arcade,"July 25, 2018",6.0.0,4.1 and up,Free
1758,Hungry Shark Evolution,Game,4.500000,6074334,104857600.0,100000000,Free,0.00,Teen,Arcade,"July 25, 2018",6.0.0,4.1 and up,Free
3973,Hungry Shark Evolution,Game,4.500000,6071542,104857600.0,100000000,Free,0.00,Teen,Arcade,"July 25, 2018",6.0.0,4.1 and up,Free
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4431,Obbligo o Verità? PRO,Game,4.191757,4,3040870.4,100,Paid,0.99,Teen,Board,"April 26, 2018",101,3.0 and up,Paid
6335,BJ card game blackjack,Game,4.191757,3,22020096.0,500,Free,0.00,Teen,Card,"December 2, 2016",1.0,2.3 and up,Free
6555,Sic Bo,Game,4.191757,1,11534336.0,100,Paid,1.99,Teen,Card,"August 27, 2013",1.0.0,2.2 and up,Paid
7073,Animal Hunting: Sniper Shooting,Game,4.191757,0,50331648.0,50,Free,0.00,Teen,Action,"July 6, 2018",1.0,4.0 and up,Free


##### 17. What paid game has the most reviews?

In [25]:
#What paid game has the most reviews?
df[(df['Distribution'] == 'Free') & (df['Category']=='Game')].sort_values(by='Reviews', ascending=False)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Distribution
1879,Clash of Clans,Game,4.600000,44893888,102760448.0,100000000,Free,0.0,Everyone 10+,Strategy,"July 15, 2018",10.322.16,4.1 and up,Free
1704,Clash of Clans,Game,4.600000,44891723,102760448.0,100000000,Free,0.0,Everyone 10+,Strategy,"July 15, 2018",10.322.16,4.1 and up,Free
1670,Clash of Clans,Game,4.600000,44891723,102760448.0,100000000,Free,0.0,Everyone 10+,Strategy,"July 15, 2018",10.322.16,4.1 and up,Free
1917,Subway Surfers,Game,4.500000,27725352,79691776.0,1000000000,Free,0.0,Everyone 10+,Arcade,"July 12, 2018",1.90.0,4.1 and up,Free
1872,Subway Surfers,Game,4.500000,27725352,79691776.0,1000000000,Free,0.0,Everyone 10+,Arcade,"July 12, 2018",1.90.0,4.1 and up,Free
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8580,DM Adventure,Game,4.191757,0,11534336.0,10,Free,0.0,Everyone,Adventure,"June 18, 2018",1.0.4,2.3 and up,Free
6329,Basic Strategy Training BJ 21,Game,4.191757,0,24117248.0,500,Free,0.0,Teen,Casino,"March 7, 2016",1.1,2.3 and up,Free
5824,Cyborg AX-001,Game,4.191757,0,0.0,50,Free,0.0,Everyone 10+,Action,"June 25, 2018",Varies with device,Varies with device,Free
8822,Simple x3DS Emulator - BETA,Game,4.191757,0,0.0,50000,Free,0.0,Everyone,Arcade,"June 13, 2018",Varies with device,Varies with device,Free


##### 18. How many TB (terabytes) were transferred (overall) for the most popular Lifestyle app?

In [26]:
#How many TB (terabytes) were transferred (overall) for the most popular Lifestyle app?
a = df[df['Category']=='Lifestyle'].sort_values(by='Installs', ascending=False).iloc[0]
a['Installs']*a['Size']/1024**4

6484.9853515625