# main.py

### Import libraries

In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

### Import Dataset into Dataframe

In [56]:
df = pd.read_csv('../dataset/playstore_dataset.csv')

print(f"Dataset loaded in Dataframe successfully. Shape: {df.shape}")
df.head(2)

Dataset loaded in Dataframe successfully. Shape: (60, 30)


Unnamed: 0,appId,title,developer,developerId,released,score,ratings,installs,free,price,...,summary,url,most_liked_review,most_liked_review_score,most_liked_review_date,most_disliked_review,most_disliked_review_score,most_disliked_review_date,discovered_from_category,discovered_query
0,com.king.candycrushsaga,Candy Crush Saga,King,6577204690045492686,"Nov 15, 2012",4.619896,38967321,"1,000,000,000+",True,0,...,Match your way through candy puzzles packed wi...,https://play.google.com/store/apps/details?id=...,I loved this game. Played for years (over 9000...,1,2025-07-22T02:19:25,"fun game as always, but the amount of pop ups ...",2.0,2025-02-13T01:59:01,GAME,top free games
1,com.rovio.baba,Angry Birds 2,Rovio Entertainment Oy,9133452689932095671,"Jul 23, 2015",4.160458,6278547,"100,000,000+",True,0,...,Play the Angry Birds game enjoyed by millions ...,https://play.google.com/store/apps/details?id=...,The game is rough. Some levels are made to be ...,1,2025-03-14T13:48:59,Got back into this after playing the originals...,2.0,2025-06-30T05:27:42,GAME,top free games


## Dataframe info

In [57]:
df.describe()

Unnamed: 0,score,ratings,price,updated,most_liked_review_score,most_disliked_review_score
count,60.0,60.0,60.0,60.0,60.0,51.0
mean,4.344631,16412630.0,0.0,1755825000.0,2.833333,1.352941
std,0.359948,38998120.0,0.0,3001648.0,1.814879,0.48264
min,3.282239,3771.0,0.0,1738902000.0,1.0,1.0
25%,4.120493,658888.0,0.0,1756090000.0,1.0,1.0
50%,4.461675,2798364.0,0.0,1756827000.0,2.5,1.0
75%,4.623284,8060641.0,0.0,1756937000.0,5.0,2.0
max,4.866443,172449600.0,0.0,1757076000.0,5.0,2.0


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   appId                       60 non-null     object 
 1   title                       60 non-null     object 
 2   developer                   60 non-null     object 
 3   developerId                 60 non-null     object 
 4   released                    57 non-null     object 
 5   score                       60 non-null     float64
 6   ratings                     60 non-null     int64  
 7   installs                    60 non-null     object 
 8   free                        60 non-null     bool   
 9   price                       60 non-null     int64  
 10  currency                    60 non-null     object 
 11  offersInAppPurchases        60 non-null     bool   
 12  inAppProductPrice           32 non-null     object 
 13  containsAds                 60 non-nu

## Basic checks of Dataframe

In [59]:
df.isnull().sum()[[
        'appId', 'inAppProductPrice', 'contentRatingDescription', 'most_disliked_review', 'most_disliked_review_score',
        'most_disliked_review_date'
]]

appId                          0
inAppProductPrice             28
contentRatingDescription      47
most_disliked_review           6
most_disliked_review_score     9
most_disliked_review_date      9
dtype: int64

In [60]:
print(df['contentRating'].unique())
print(df['genre'].unique())

['Everyone' 'Everyone 10+' 'Teen' 'Mature 17+']
['Casual' 'Arcade' 'Action' 'Strategy' 'Finance' 'Tools' 'Productivity'
 'Education' 'Social' 'News & Magazines' 'Health & Fitness'
 'Communication' 'Travel & Local' 'Entertainment'
 'Video Players & Editors' 'Music & Audio' 'Shopping']


### Duplicates Handling

In [61]:
#       Drop duplicates if any:
df.drop_duplicates(subset="appId", inplace=True)

In [62]:
#       Handle Missing Values
#       For reviews: Keep them as NaN for now.
#       For monetization field:

df['inAppProductPrice'].fillna("No In App Product Price info", inplace=True)
df['contentRatingDescription'].fillna("Not provided", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['inAppProductPrice'].fillna("No In App Product Price info", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['contentRatingDescription'].fillna("Not provided", inplace=True)


In [63]:
#       Convert 'updated' from Unix Timestamp -> datetime
df["updated"] = pd.to_datetime(df["updated"], unit="s", errors="coerce")

#       Convert 'released' if available
df["released"] = pd.to_datetime(df["released"], errors="coerce")

## MONETIZATION features

In [64]:
#       PRICE BUCKETS
df['price_bucket'] = pd.cut(
        df['price'],
        bins=[-0.01, 0.01, 1, 5, 10, 50, 100, np.inf],
        labels=["Free", "≤$1", "$1-5", "$5-10", "$10-50", "$50-100", "$100+"]
)

In [65]:
#       CLEAN INSTALLS COLUMN (remove + and ,)
def parse_installs(x):
        try:
                return int(x.replace("+", "").replace(",", "").strip())
        except:
                return np.nan

df["installs_clean"] =df["installs"].apply(parse_installs)

In [66]:
#       Midpoint Revenue Proxy (for paid apps only)
#       Not real review but useful for comparisons
def revenue_proxy(row):
    if row["free"] or row["installs_clean"] is np.nan:
        return 0
    return row["price"] * row["installs_clean"]
    
df["revenue_proxy"] = df.apply(revenue_proxy, axis=1)

## Quick Monetization Stats

In [67]:
print("\n====== Basic Monetization Stats ======")
print(df[["free", "offersInAppPurchases", "containsAds"]].sum())

print("\n====== Price Buckets ======")
print(df["price_bucket"].value_counts())

print("\n====== Revenue Proxy ======")
print(df.sort_values("revenue_proxy", ascending=False)[
    ["title", "price", "installs_clean", "revenue_proxy"]
].head())


free                    60
offersInAppPurchases    32
containsAds             26
dtype: int64

price_bucket
Free       60
≤$1         0
$1-5        0
$5-10       0
$10-50      0
$50-100     0
$100+       0
Name: count, dtype: int64

                          title  price  installs_clean  revenue_proxy
0              Candy Crush Saga      0      1000000000              0
1                 Angry Birds 2      0       100000000              0
2                   Mob Control      0       100000000              0
3  Temple Run 2: Endless Escape      0      1000000000              0
4           Plants vs. Zombies™      0       500000000              0


## Save Clean Dataset

In [68]:
df.to_csv("../dataset/playstore_dataset_clean.csv", index=False)
print("\nCleaned dataset saved as playstore_dataset_clean.csv")


Cleaned dataset saved as playstore_dataset_clean.csv
