## Google Play Store Apps Dataset

In [1]:
import pandas as pd

# Load the apps table from the CSV file that sits next to this notebook.
dataset = pd.read_csv("Dataset.csv")

# Preview five random rows. The fixed random_state makes the preview show the
# same rows on every Restart & Run All, so the saved output stays reproducible.
dataset.sample(5, random_state=42)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Last_Updated,Android_Ver
7176,Deep Sleep Battery Saver,TOOLS,4.1,22032,3.7,1000000,Free,0,Everyone,Tools,"August 17, 2017",4.1 and up
6253,Blood Pressure Log - MyDiary,MEDICAL,4.7,8347,2.6,500000,Free,0,Everyone,Medical,"April 13, 2018",4.0 and up
1810,Little Eye Surgery Simulator - ER Doctor Game,FAMILY,4.4,12,19.0,1000,Free,0,Teen,Casual,"April 20, 2018",2.3 and up
6390,Find&Save - Local Shopping,SHOPPING,4.0,4602,6.2,500000,Free,0,Everyone,Shopping,"December 22, 2017",4.1 and up
7900,Z Champions,GAME,4.7,96028,59.0,1000000,Free,0,Teen,Arcade,"May 28, 2018",4.0 and up


#### Initial checks

In [2]:

# A notebook cell only renders its LAST expression, so the earlier checks are
# wrapped in display() (available without import in Jupyter kernels);
# otherwise their results are computed and silently discarded.
display(dataset.head())     # first rows
display(dataset.shape)      # (rows, columns)
display(dataset.columns)    # column names - no need to adjust

# NOTE(review): Installs and Price load as object (strings such as "$1.49"),
# not numbers - confirm whether they must be converted before numeric analysis.
dataset.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size_MBs          float64
Installs           object
Type               object
Price              object
Content_Rating     object
Genres             object
Last_Updated       object
Android_Ver        object
dtype: object

#### Remove irrelevant columns

In [3]:
# Drop the two columns this analysis does not use.
# Reassigning instead of inplace=True, together with errors="ignore", keeps the
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
dataset = dataset.drop(columns=["Last_Updated", "Android_Ver"], errors="ignore")

#### Missing Data

In [4]:
# Rows that contain at least one missing value.
# The mask has to be reduced per row with .any(axis=1): indexing with the full
# boolean frame dataset.isna() does not select rows, it returns a frame in
# which every cell is NaN.
display(dataset[dataset.isna().any(axis=1)].head())

# Total number of missing values in each column.
display(dataset.isna().sum())

# Drop every row that has any missing value.
clean_df = dataset.dropna()
clean_df.shape


(9367, 10)

#### Checking for duplicates

In [5]:
# Count rows that are exact copies of an earlier row. duplicated() flags only
# the repeats - the first occurrence is treated as the original.
# (476 rows were exact duplicates when this was written.)
display(clean_df.duplicated().value_counts())

# Keep only one row per app name, type and price, even if other columns differ.
# drop_duplicates() returns a NEW frame, so the result must be assigned back;
# without the assignment clean_df keeps its duplicates and every later cell
# analyses them.
clean_df = clean_df.drop_duplicates(subset=["App", "Type", "Price"])
clean_df

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
21,KBA-EZ Health Guide,MEDICAL,5.0,4,25.00,1,Free,0,Everyone,Medical
28,Ra Ga Ba,GAME,5.0,2,20.00,1,Paid,$1.49,Everyone,Arcade
47,Mu.F.O.,GAME,5.0,2,16.00,1,Paid,$0.99,Everyone,Arcade
82,Brick Breaker BR,GAME,5.0,7,19.00,5,Free,0,Everyone,Arcade
99,Anatomy & Physiology Vocabulary Exam Review App,MEDICAL,5.0,1,4.60,5,Free,0,Everyone,Medical
...,...,...,...,...,...,...,...,...,...,...
10824,Google Drive,PRODUCTIVITY,4.4,2731171,4.00,1000000000,Free,0,Everyone,Productivity
10828,YouTube,VIDEO_PLAYERS,4.3,25655305,4.65,1000000000,Free,0,Teen,Video Players & Editors
10829,Google Play Movies & TV,VIDEO_PLAYERS,3.7,906384,4.65,1000000000,Free,0,Teen,Video Players & Editors
10831,Google News,NEWS_AND_MAGAZINES,3.9,877635,13.00,1000000000,Free,0,Teen,News & Magazines


#### Preliminary Exploration: The Highest Ratings, Most Reviews, and Largest Size

Challenge: Identify which apps are the highest rated. What problem might you encounter if you rely exclusively on ratings alone to determine the quality of an app?

Challenge: What is the size in megabytes (MB) of the largest Android apps in the Google Play Store? Based on the data, do you think there is a size limit in place, or can developers make apps as large as they please?

Challenge: Which apps have the highest number of reviews? Are there any paid apps among the top 50?

In [9]:
# Challenge 1: rank the apps from the highest rating down to the lowest.
# In the displayed output, the 5.0-rated rows at the top each have only a
# handful of reviews.
clean_df.sort_values("Rating", ascending=False)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
291,AC DC Power Monitor,LIFESTYLE,5.0,1,1.2,10,Paid,$3.04,Everyone,Lifestyle
126,Tablet Reminder,MEDICAL,5.0,4,2.5,5,Free,0,Everyone,Medical
128,CQ ESPM,BUSINESS,5.0,2,3.4,5,Free,0,Everyone,Business
21,KBA-EZ Health Guide,MEDICAL,5.0,4,25.0,1,Free,0,Everyone,Medical
28,Ra Ga Ba,GAME,5.0,2,20.0,1,Paid,$1.49,Everyone,Arcade
...,...,...,...,...,...,...,...,...,...,...
240,House party - live chat,DATING,1.0,1,9.2,10,Free,0,Mature 17+,Dating
1271,MbH BM,MEDICAL,1.0,1,2.3,100,Free,0,Everyone,Medical
1721,Lottery Ticket Checker - Florida Results & Lotto,TOOLS,1.0,3,41.0,500,Free,0,Everyone,Tools
818,Familial Hypercholesterolaemia Handbook,MEDICAL,1.0,2,33.0,100,Free,0,Everyone,Medical


In [10]:
# Challenge 2: the ten largest apps by size in MB.
# The largest size in the data appears to be 100 MB.
largest_first = clean_df.sort_values("Size_MBs", ascending=False)
largest_first.head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
7926,Post Bank,FINANCE,4.5,60449,100.0,1000000,Free,0,Everyone,Finance
8719,Draft Simulator for FUT 18,SPORTS,4.6,162933,100.0,5000000,Free,0,Everyone,Sports
10295,SimCity BuildIt,FAMILY,4.5,4218587,100.0,50000000,Free,0,Everyone 10+,Simulation
7927,The Walking Dead: Our World,GAME,4.0,22435,100.0,1000000,Free,0,Teen,Action
8718,Mini Golf King - Multiplayer Game,GAME,4.5,531458,100.0,5000000,Free,0,Everyone,Sports
9944,Gangster Town: Vice District,FAMILY,4.3,65146,100.0,10000000,Free,0,Mature 17+,Simulation
1795,Navi Radiography Pro,MEDICAL,4.7,11,100.0,500,Paid,$15.99,Everyone,Medical
9945,Ultimate Tennis,SPORTS,4.3,183004,100.0,10000000,Free,0,Everyone,Sports
7928,Stickman Legends: Shadow Wars,GAME,4.4,38419,100.0,1000000,Paid,$0.99,Everyone 10+,Action
3144,Vi Trainer,HEALTH_AND_FITNESS,3.6,124,100.0,5000,Free,0,Everyone,Health & Fitness


In [13]:
# Challenge 3: are there any paid apps among the 50 most-reviewed?
# Rank every row by review count, then keep the first 50.
reviews_desc = clean_df.sort_values("Reviews", ascending=False)
top_50_apps_reviews = reviews_desc.head(50)

# Row counts per Type (Free / Paid) within that top 50.
# Only "Free" shows up in the result - none of the top 50 are paid.
top_50_apps_reviews.groupby("Type").count()

Unnamed: 0_level_0,App,Category,Rating,Reviews,Size_MBs,Installs,Price,Content_Rating,Genres
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Free,50,50,50,50,50,50,50,50,50
