In [1]:
import pandas as pd
from fastparquet import write

In [2]:
data = pd.read_csv('appstore_games.csv')

In [3]:
print ("null values: \n")
print(data.isna().sum())

null values: 

URL                                 0
ID                                  0
Name                                0
Subtitle                        11746
Icon URL                            0
Average User Rating              9446
User Rating Count                9446
Price                              24
In-app Purchases                 9324
Description                         0
Developer                           0
Age Rating                          0
Languages                          60
Size                                1
Primary Genre                       0
Genres                              0
Original Release Date               0
Current Version Release Date        0
dtype: int64


In [4]:
print("total null values: ", data.isna().sum().sum())

total null values:  40047


In [5]:
# Handling Missing Data

In [6]:
data[['Subtitle', 'Languages']] = data[['Subtitle', 'Languages']].fillna(value="No info")
data[['Average User Rating', 'User Rating Count', 'Price', 'In-app Purchases', 'Size']] = data[['Average User Rating', 'User Rating Count', 'Price', 'In-app Purchases', 'Size']].fillna(value=0.0)

In [7]:
# Handling duplicated data

In [8]:
duplicate_data = data[data.duplicated()]
print("number of duplicated rows: ", duplicate_data.shape)

number of duplicated rows:  (160, 18)


In [9]:
duplicate_data_URL = data[data.duplicated(['URL'])]
print(duplicate_data_URL.shape)

(160, 18)


In [10]:
print(len(data.URL.unique()))

16847


In [11]:
print("shape of dataframe after dropping duplicates: ", data.drop_duplicates().shape)

shape of dataframe after dropping duplicates:  (16847, 18)


In [12]:
# Data binning

In [13]:
play_labels = ["No rating. Who knows?", 'horrible', 'somewhat playable', 'awesome']
category = [-1, 0.1, 2., 3.5, 5.]
data['Playability'] = pd.cut(data['Average User Rating'], labels=play_labels, bins=category, include_lowest=False)

In [14]:
print(data[['URL', 'Name', 'Average User Rating', 'Playability']])

                                                     URL  \
0       https://apps.apple.com/us/app/sudoku/id284921427   
1      https://apps.apple.com/us/app/reversi/id284926400   
2      https://apps.apple.com/us/app/morocco/id284946595   
3      https://apps.apple.com/us/app/sudoku-free/id28...   
4      https://apps.apple.com/us/app/senet-deluxe/id2...   
...                                                  ...   
17002  https://apps.apple.com/us/app/stack-puzzle-ris...   
17003  https://apps.apple.com/us/app/eachother/id1474...   
17004  https://apps.apple.com/us/app/rabbit-vs-tortoi...   
17005  https://apps.apple.com/us/app/fatall/id1474963671   
17006  https://apps.apple.com/us/app/the-three-kingdo...   

                            Name  Average User Rating            Playability  
0                         Sudoku                  4.0                awesome  
1                        Reversi                  3.5      somewhat playable  
2                        Morocco          

In [15]:
# TODO: adjust the category to start from 0.1, that way 0.0 means no rating. and if no rating, clean up NaN's from
# plability to read "no rating, who knows"

In [16]:
data

Unnamed: 0,URL,ID,Name,Subtitle,Icon URL,Average User Rating,User Rating Count,Price,In-app Purchases,Description,Developer,Age Rating,Languages,Size,Primary Genre,Genres,Original Release Date,Current Version Release Date,Playability
0,https://apps.apple.com/us/app/sudoku/id284921427,284921427,Sudoku,No info,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,4.0,3553.0,2.99,0,"Join over 21,000,000 of our fans and download ...",Mighty Mighty Good Games,4+,"DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...",15853568.0,Games,"Games, Strategy, Puzzle",11/07/2008,30/05/2017,awesome
1,https://apps.apple.com/us/app/reversi/id284926400,284926400,Reversi,No info,https://is4-ssl.mzstatic.com/image/thumb/Purpl...,3.5,284.0,1.99,0,"The classic game of Reversi, also known as Oth...",Kiss The Machine,4+,EN,12328960.0,Games,"Games, Strategy, Board",11/07/2008,17/05/2018,somewhat playable
2,https://apps.apple.com/us/app/morocco/id284946595,284946595,Morocco,No info,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,3.0,8376.0,0.00,0,Play the classic strategy game Othello (also k...,Bayou Games,4+,EN,674816.0,Games,"Games, Board, Strategy",11/07/2008,5/09/2017,somewhat playable
3,https://apps.apple.com/us/app/sudoku-free/id28...,285755462,Sudoku (Free),No info,https://is3-ssl.mzstatic.com/image/thumb/Purpl...,3.5,190394.0,0.00,0,"Top 100 free app for over a year.\nRated ""Best...",Mighty Mighty Good Games,4+,"DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...",21552128.0,Games,"Games, Strategy, Puzzle",23/07/2008,30/05/2017,somewhat playable
4,https://apps.apple.com/us/app/senet-deluxe/id2...,285831220,Senet Deluxe,No info,https://is1-ssl.mzstatic.com/image/thumb/Purpl...,3.5,28.0,2.99,0,"""Senet Deluxe - The Ancient Game of Life and A...",RoGame Software,4+,"DA, NL, EN, FR, DE, EL, IT, JA, KO, NO, PT, RU...",34689024.0,Games,"Games, Strategy, Board, Education",18/07/2008,22/07/2018,somewhat playable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17002,https://apps.apple.com/us/app/stack-puzzle-ris...,1474626442,Stack Puzzle : Rise Tower,"Blast the cubes, solve puzzle!",https://is5-ssl.mzstatic.com/image/thumb/Purpl...,0.0,0.0,0.00,0,"The goal is very simple, move the square horiz...",Zhigang Pei,4+,EN,64795648.0,Games,"Games, Entertainment, Casual, Strategy",30/07/2019,30/07/2019,No rating. Who knows?
17003,https://apps.apple.com/us/app/eachother/id1474...,1474919257,EachOther,No info,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,0.0,0.0,0.00,0,Collect a score while you play!!\n\nBy linking...,Sultan Shindi,4+,EN,110341120.0,Games,"Games, Family, Strategy",1/08/2019,1/08/2019,No rating. Who knows?
17004,https://apps.apple.com/us/app/rabbit-vs-tortoi...,1474962324,Rabbit Vs Tortoise,No info,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,0.0,0.0,0.00,0,"""Rabbit Vs Tortoise is chess type cool simple ...",Vishal Baldha,4+,EN,23207936.0,Games,"Games, Strategy",3/08/2019,3/08/2019,No rating. Who knows?
17005,https://apps.apple.com/us/app/fatall/id1474963671,1474963671,FaTaLL,Most fun game!!!,https://is1-ssl.mzstatic.com/image/thumb/Purpl...,0.0,0.0,0.00,"9.99, 49.99, 3.99",Upgrade your character and use your skills to ...,Tayrem Games,4+,EN,196750336.0,Games,"Games, Strategy, Action",1/08/2019,1/08/2019,No rating. Who knows?


In [21]:
data['In-app Purchases'] = data['In-app Purchases'].astype(str)

In [22]:
write('games.parq', data)