## Google app Dataset

In [1]:
import pandas as pd

dataset = pd.read_csv("Dataset.csv")
dataset.sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres,Last_Updated,Android_Ver
2956,DX Simulation Belt for Decade henshin,FAMILY,4.5,64,32.0,5000,Free,0,Everyone,Simulation,"July 27, 2018",4.1 and up
10574,Geometry Dash Lite,GAME,4.5,6181640,58.0,100000000,Free,0,Everyone,Arcade,"December 22, 2017",4.0 and up
10102,Bowmasters,GAME,4.7,1535581,36.0,50000000,Free,0,Teen,Action,"July 23, 2018",4.1 and up
5162,Sad Poetry Photo Frames 2018,ART_AND_DESIGN,4.5,176,10.0,100000,Free,0,Everyone,Art & Design,"April 2, 2018",4.0.3 and up
5800,Modern Action Commando 3D,GAME,4.2,492,63.0,100000,Free,0,Teen,Action,"April 6, 2017",4.0.3 and up


#### Initial checks

In [3]:

dataset.head()
dataset.shape
dataset.columns     # No need to adjust
dataset.dtypes      # No need fot adjustments

App                object
Category           object
Rating            float64
Reviews             int64
Size_MBs          float64
Installs           object
Type               object
Price              object
Content_Rating     object
Genres             object
Last_Updated       object
Android_Ver        object
dtype: object

#### Remove irrelevant columns

In [4]:
dataset.drop(columns = ["Last_Updated", "Android_Ver" ], inplace = True)

#### Missing Data

In [5]:
dataset [ dataset.isna()]   # See the columns that are missing

dataset.isna().sum()        # Seeing the total number of missing values

# Dropping the rows with any missing values
clean_df = dataset.dropna()
clean_df.shape


(9367, 10)

#### Checking for duplicates

In [6]:
# Check for the duplicates
clean_df.duplicated().value_counts()        # 476v are duplicates exactly.
                                            # only flags the second one - first is considered the original

# Getting rid of the duplicates
clean_df.drop_duplicates( subset  =[ "App", "Type", "Price"])
    # Keep only one row per app name,price, and type, even if other columns differ.

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
21,KBA-EZ Health Guide,MEDICAL,5.0,4,25.00,1,Free,0,Everyone,Medical
28,Ra Ga Ba,GAME,5.0,2,20.00,1,Paid,$1.49,Everyone,Arcade
47,Mu.F.O.,GAME,5.0,2,16.00,1,Paid,$0.99,Everyone,Arcade
82,Brick Breaker BR,GAME,5.0,7,19.00,5,Free,0,Everyone,Arcade
99,Anatomy & Physiology Vocabulary Exam Review App,MEDICAL,5.0,1,4.60,5,Free,0,Everyone,Medical
...,...,...,...,...,...,...,...,...,...,...
10824,Google Drive,PRODUCTIVITY,4.4,2731171,4.00,1000000000,Free,0,Everyone,Productivity
10828,YouTube,VIDEO_PLAYERS,4.3,25655305,4.65,1000000000,Free,0,Teen,Video Players & Editors
10829,Google Play Movies & TV,VIDEO_PLAYERS,3.7,906384,4.65,1000000000,Free,0,Teen,Video Players & Editors
10831,Google News,NEWS_AND_MAGAZINES,3.9,877635,13.00,1000000000,Free,0,Teen,News & Magazines


#### Preliminary Exploration: The Highest Ratings, Most Reviews, and Largest Size

Challenge: Identify which apps are the highest rated. What problem might you encounter if you rely exclusively on ratings alone to determine the quality of an app?

Challenge: What's the size in megabytes (MB) of the largest Android apps in the Google Play Store. Based on the data, do you think there could be a limit in place or can developers make apps as large as they please?

Challenge: Which apps have the highest number of reviews? Are there any paid apps among the top 50?

In [7]:
# Challenge 1:
clean_df.sort_values(ascending = False, by= "Rating")

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
291,AC DC Power Monitor,LIFESTYLE,5.0,1,1.2,10,Paid,$3.04,Everyone,Lifestyle
126,Tablet Reminder,MEDICAL,5.0,4,2.5,5,Free,0,Everyone,Medical
128,CQ ESPM,BUSINESS,5.0,2,3.4,5,Free,0,Everyone,Business
21,KBA-EZ Health Guide,MEDICAL,5.0,4,25.0,1,Free,0,Everyone,Medical
28,Ra Ga Ba,GAME,5.0,2,20.0,1,Paid,$1.49,Everyone,Arcade
...,...,...,...,...,...,...,...,...,...,...
240,House party - live chat,DATING,1.0,1,9.2,10,Free,0,Mature 17+,Dating
1271,MbH BM,MEDICAL,1.0,1,2.3,100,Free,0,Everyone,Medical
1721,Lottery Ticket Checker - Florida Results & Lotto,TOOLS,1.0,3,41.0,500,Free,0,Everyone,Tools
818,Familial Hypercholesterolaemia Handbook,MEDICAL,1.0,2,33.0,100,Free,0,Everyone,Medical


In [8]:
# Challenge 2:
clean_df.sort_values(by="Size_MBs", ascending = False).head(10)
    # Highest seems to be 100 mbs

Unnamed: 0,App,Category,Rating,Reviews,Size_MBs,Installs,Type,Price,Content_Rating,Genres
7926,Post Bank,FINANCE,4.5,60449,100.0,1000000,Free,0,Everyone,Finance
8719,Draft Simulator for FUT 18,SPORTS,4.6,162933,100.0,5000000,Free,0,Everyone,Sports
10295,SimCity BuildIt,FAMILY,4.5,4218587,100.0,50000000,Free,0,Everyone 10+,Simulation
7927,The Walking Dead: Our World,GAME,4.0,22435,100.0,1000000,Free,0,Teen,Action
8718,Mini Golf King - Multiplayer Game,GAME,4.5,531458,100.0,5000000,Free,0,Everyone,Sports
9944,Gangster Town: Vice District,FAMILY,4.3,65146,100.0,10000000,Free,0,Mature 17+,Simulation
1795,Navi Radiography Pro,MEDICAL,4.7,11,100.0,500,Paid,$15.99,Everyone,Medical
9945,Ultimate Tennis,SPORTS,4.3,183004,100.0,10000000,Free,0,Everyone,Sports
7928,Stickman Legends: Shadow Wars,GAME,4.4,38419,100.0,1000000,Paid,$0.99,Everyone 10+,Action
3144,Vi Trainer,HEALTH_AND_FITNESS,3.6,124,100.0,5000,Free,0,Everyone,Health & Fitness


In [9]:
# Challenge 3:
    # Get the top 50  apps by reviews
top_50_apps_reviews = clean_df.sort_values(by ="Reviews", ascending = False).head(50)

    # Get the filtered by type: free/ Paid ones
top_50_apps_reviews.groupby("Type").count()
        # There are no paid apps - they are all free.

Unnamed: 0_level_0,App,Category,Rating,Reviews,Size_MBs,Installs,Price,Content_Rating,Genres
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Free,50,50,50,50,50,50,50,50,50


## Plotly

In [10]:
# Getting the types of content ratings
ratings = clean_df["Content_Rating"].value_counts()

In [11]:
# Steps to using Plotly
    # Import statement:
import plotly.express as px

#### Creating Pie charts

In [12]:
# Create a pie chart using plotly
fig = px.pie(labels=ratings.index,
    values=ratings.values,
    title="Content Rating",
    names=ratings.index,                # Defining the category names
    hole= 0.6                           # Creating a donut shape
)
fig.update_traces(textposition='outside', textinfo='percent+label')     # Moves the labels outside of the pie chart.

fig.show()


#### Creating Normal Bar charts

How many apps had over 1 billion (that's right - BILLION) installations? How many apps just had a single install?



Check the datatype of the Installs column.

Count the number of apps at each level of installations.

Convert the number of installations (the Installs column) to a numeric data type. Hint: this is a 2-step process. You'll have to make sure you remove non-numeric characters first.

In [14]:
clean_df["Installs"].dtypes             # String type of data

# Converting the data type
    # Removing the commas first:
clean_df = clean_df.copy()          #' when filtering or slicing - can create a view instead of a cppy.
clean_df["Installs"] = clean_df["Installs"].astype(str).str.replace(',', "")
                                    # Get to the correct columns
                                    # convert each of the columns to strings
                                    #. str forces it to apply it to the characters inside the overall strings( looks at each)
                                    # replace the odd ones.
clean_df["Installs"] =pd.to_numeric(clean_df["Installs"])
clean_df.groupby("Installs").count()

Unnamed: 0_level_0,App,Category,Rating,Reviews,Size_MBs,Type,Price,Content_Rating,Genres
Installs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,3,3,3,3,3,3,3,3
5,9,9,9,9,9,9,9,9,9
10,69,69,69,69,69,69,69,69,69
50,56,56,56,56,56,56,56,56,56
100,309,309,309,309,309,309,309,309,309
500,201,201,201,201,201,201,201,201,201
1000,714,714,714,714,714,714,714,714,714
5000,432,432,432,432,432,432,432,432,432
10000,1010,1010,1010,1010,1010,1010,1010,1010,1010
50000,467,467,467,467,467,467,467,467,467


#### Challenge
 - Convert the price column to numeric data. Then investigate the top 20 most expensive apps in the dataset.
 - Remove all apps that cost more than $250 from the df_apps_clean DataFrame.
 - Add a column called 'Revenue_Estimate' to the DataFrame. This column should hold the price of the app times the number of installs. What are the top 10 highest-grossing paid apps according to this estimate? Out of the top 10, how many are games?

In [15]:
# Need to remove the dollar sign
clean_df["Price"] = clean_df["Price"].astype(str).str.replace("$", "")
# Converting to numeric data
clean_df["Price"] =pd.to_numeric(clean_df["Price"])

# Looking at the top 20 of them
clean_df.sort_values(by ="Price", ascending = False).head(20)
# Remove all apps that cost above 250
adjusted_price = clean_df[clean_df["Price"] <250]

# Revenue estimate column
revenue_estimate = adjusted_price["Installs"]*adjusted_price["Price"]
adjusted_price.insert(1, "Revenue_Estimate", revenue_estimate)
adjusted_price[:10]

# Top 10 highest grossing:
adjusted_price_ranked = adjusted_price.sort_values(by="Revenue_Estimate", ascending = False)
adjusted_price_ranked
top_10 = adjusted_price.head(10)
top_10.groupby("Category").count()["App"]


Category
BUSINESS    2
GAME        3
MEDICAL     4
SPORTS      1
Name: App, dtype: int64

In [16]:
clean_df["Category"].nunique()
top_10_category = clean_df["Category"].value_counts()[:10]
top_10_category

Category
FAMILY           1747
GAME             1097
TOOLS             734
PRODUCTIVITY      351
MEDICAL           350
COMMUNICATION     328
FINANCE           323
SPORTS            319
PHOTOGRAPHY       317
LIFESTYLE         315
Name: count, dtype: int64

In [17]:
# Plotting the bar chart
bar = px.bar(x = top_10_category.index,
             y = top_10_category.values,
             hover_name= top_10_category.index,
             labels = {"x":"Category", "y":"Value"},
             title = "Top 10 Categories",)
bar.show()

In [18]:
# Plotting categories by installs
category_installs= clean_df.groupby("Category"). agg({"Installs":"sum"})
category_installs.sort_values('Installs', ascending=True, inplace=True)
category_installs
h_bar  = px.bar( x= category_installs["Installs"],
                 y = category_installs.index,
                 orientation="h",
                 title='Category Popularity',
                 labels= {"x": "Installs", "y": "Categories"})
h_bar.show()

#### Creating more complex scatter charts

In [24]:
apps_and_installs_df = clean_df.groupby("Category").agg({"Installs":"sum",
                                  "App":"count"})

In [33]:
scatter = px.scatter(x= apps_and_installs_df["App"],
           y=apps_and_installs_df["Installs"],
           hover_name= apps_and_installs_df.index,
           size= apps_and_installs_df["Installs"],
           labels= {"x": "Number of apps( Lower= More Concentrated",
                    "y":"Installs"},
           color  = apps_and_installs_df["Installs"],)

scatter.show()

#### Using the stack method

In [41]:
genre_df = clean_df["Genres"]
    # There are sub categories witin the genre sections.

# Splitting them up:
stack = genre_df.str.split(";", expand = True).stack()
    # Coverts the entered sting and splits them into two
    # Expand = True Splits them horizontally
    # Stacks them vertically into  long form data
stack = stack.value_counts().sort_values(ascending = False)

In [48]:
# Convert the earlier data  into a bar chart
chart =  px.bar(x= stack.index[:10],
                y = stack.values[:10],
                title ="Top Genres",
                labels ={"x" :"Genre", "y":"Number of Apps"},
                color_continuous_scale="Agsunset",  # Whats the scale of reference.
                color= stack.values[:10],)          # What data to use for color

chart.update_layout(coloraxis_showscale=False)
chart.show()