# main.py

### Import libraries

In [69]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt

### Import Dataset into Dataframe

In [70]:
df = pd.read_csv('../dataset/playstore_dataset.csv')

print(f"Dataset loaded in Dataframe successfully. Shape: {df.shape}")
df.head(2)

Dataset loaded in Dataframe successfully. Shape: (357, 30)


Unnamed: 0,appId,title,developer,developerId,released,score,ratings,installs,free,price,...,summary,url,most_liked_review,most_liked_review_score,most_liked_review_date,most_disliked_review,most_disliked_review_score,most_disliked_review_date,discovered_from_category,discovered_query
0,com.king.candycrushsaga,Candy Crush Saga,King,6577204690045492686,"Nov 15, 2012",4.619947,38968364.0,"1,000,000,000+",True,0.0,...,Match your way through candy puzzles packed wi...,https://play.google.com/store/apps/details?id=...,I loved this game. Played for years (over 9000...,1.0,2025-07-22T02:19:25,"fun game as always, but the amount of pop ups ...",2.0,2025-02-13T01:59:01,GAME,top free games
1,com.rovio.baba,Angry Birds 2,Rovio Entertainment Oy,9133452689932095671,"Jul 23, 2015",4.160238,6278648.0,"100,000,000+",True,0.0,...,Play the Angry Birds game enjoyed by millions ...,https://play.google.com/store/apps/details?id=...,The game is rough. Some levels are made to be ...,1.0,2025-03-14T13:48:59,Got back into this after playing the originals...,2.0,2025-06-30T05:27:42,GAME,top free games


## Dataframe info

In [71]:
df.describe()

Unnamed: 0,score,ratings,price,updated,most_liked_review_score,most_disliked_review_score
count,354.0,354.0,357.0,357.0,356.0,311.0
mean,4.37537,7644587.0,0.011176,1754747000.0,2.997191,1.347267
std,0.404734,22934870.0,0.211173,13379600.0,1.710775,0.476869
min,2.60241,66.0,0.0,1531381000.0,1.0,1.0
25%,4.190061,197144.0,0.0,1755898000.0,1.0,1.0
50%,4.494429,1310036.0,0.0,1756717000.0,3.0,1.0
75%,4.646368,5099808.0,0.0,1756927000.0,5.0,2.0
max,5.0,211049800.0,3.99,1757117000.0,5.0,2.0


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357 entries, 0 to 356
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   appId                       357 non-null    object 
 1   title                       357 non-null    object 
 2   developer                   357 non-null    object 
 3   developerId                 357 non-null    object 
 4   released                    346 non-null    object 
 5   score                       354 non-null    float64
 6   ratings                     354 non-null    float64
 7   installs                    357 non-null    object 
 8   free                        357 non-null    bool   
 9   price                       357 non-null    float64
 10  currency                    357 non-null    object 
 11  offersInAppPurchases        357 non-null    bool   
 12  inAppProductPrice           218 non-null    object 
 13  containsAds                 357 non

## Basic checks of Dataframe

In [73]:
df.isnull().sum()[[
        'appId', 'inAppProductPrice', 'contentRatingDescription', 'most_disliked_review', 'most_disliked_review_score',
        'most_disliked_review_date'
]]

appId                           0
inAppProductPrice             139
contentRatingDescription      292
most_disliked_review           28
most_disliked_review_score     46
most_disliked_review_date      46
dtype: int64

In [74]:
print(df['contentRating'].unique())
print(df['genre'].unique())

['Everyone' 'Everyone 10+' 'Mature 17+' 'Teen']
['Casual' 'Arcade' 'Action' 'Strategy' 'Sports' 'Puzzle' 'Adventure'
 'Racing' 'Word' 'Finance' 'Health & Fitness' 'Shopping' 'Tools'
 'Productivity' 'Personalization' 'Education' 'Educational' 'Social'
 'News & Magazines' 'Communication' 'Business' 'Lifestyle' 'Entertainment'
 'Travel & Local' 'Maps & Navigation' 'Video Players & Editors'
 'Music & Audio']


### Duplicates Handling

In [75]:
#       Drop duplicates if any:
df.drop_duplicates(subset="appId", inplace=True)

In [76]:
#       Handle Missing Values
#       For reviews: Keep them as NaN for now.
#       For monetization field:

df['inAppProductPrice'].fillna("No In App Product Price info", inplace=True)
df['contentRatingDescription'].fillna("Not provided", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['inAppProductPrice'].fillna("No In App Product Price info", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['contentRatingDescription'].fillna("Not provided", inplace=True)


In [77]:
#       Convert 'updated' from Unix Timestamp -> datetime
df["updated"] = pd.to_datetime(df["updated"], unit="s", errors="coerce")

#       Convert 'released' if available
df["released"] = pd.to_datetime(df["released"], errors="coerce")

## MONETIZATION features

In [78]:
#       PRICE BUCKETS
df['price_bucket'] = pd.cut(
        df['price'],
        bins=[-0.01, 0.01, 1, 5, 10, 50, 100, np.inf],
        labels=["Free", "≤$1", "$1-5", "$5-10", "$10-50", "$50-100", "$100+"]
)

In [79]:
#       CLEAN INSTALLS COLUMN (remove + and ,)
def parse_installs(x):
        try:
                return int(x.replace("+", "").replace(",", "").strip())
        except:
                return np.nan

df["installs_clean"] =df["installs"].apply(parse_installs)

In [80]:
#       Midpoint Revenue Proxy (for paid apps only)
#       Not real review but useful for comparisons
def revenue_proxy(row):
    if row["free"] or row["installs_clean"] is np.nan:
        return 0
    return row["price"] * row["installs_clean"]
    
df["revenue_proxy"] = df.apply(revenue_proxy, axis=1)

## Quick Monetization Stats

In [81]:
print("\n====== Basic Monetization Stats ======")
print(df[["free", "offersInAppPurchases", "containsAds"]].sum())

print("\n====== Price Buckets ======")
print(df["price_bucket"].value_counts())

print("\n====== Revenue Proxy ======")
print(df.sort_values("revenue_proxy", ascending=False)[
    ["title", "price", "installs_clean", "revenue_proxy"]
].head())


free                    347
offersInAppPurchases    209
containsAds             149
dtype: int64

price_bucket
Free       347
$1-5         1
≤$1          0
$5-10        0
$10-50       0
$50-100      0
$100+        0
Name: count, dtype: int64

                              title  price  installs_clean  revenue_proxy
72   Dev Tools Pro(Developer Tools)   3.99           10000        39900.0
246  Hopper: Hotels, Flights & Cars   0.00        10000000            0.0
245  Trip.com: Book Flights, Hotels   0.00        50000000            0.0
244  Expedia: Hotels, Flights, Cars   0.00        50000000            0.0
243  Tripadvisor: Plan & Book Trips   0.00       100000000            0.0


## Save Clean Dataset

In [82]:
df.to_csv("../dataset/playstore_dataset_clean.csv", index=False)
print("\nCleaned dataset saved as playstore_dataset_clean.csv")


Cleaned dataset saved as playstore_dataset_clean.csv
