# Profitable App Profiles for the App Store and Google Play Markets

The goal of this project is to analyze data to help developers understand what types of apps are likely to attract more users.

In [1]:
# Load both raw data sets into memory as lists of rows.
from csv import reader

# `with` closes each file handle as soon as its rows have been read into a
# list, instead of leaving both handles open for the lifetime of the kernel.
with open('AppleStore.csv', encoding='utf-8') as opened_file1:
    read_file1 = reader(opened_file1)
    apple_data = list(read_file1)

with open('googleplaystore.csv', encoding='utf-8') as opened_file2:
    read_file2 = reader(opened_file2)
    google_data = list(read_file2)

# Split each data set into its header row and its data rows.
apple_header = apple_data[0]
apple = apple_data[1:]
google_header = google_data[0]
google = google_data[1:]

In [2]:
# Function to explore data

def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set and, optionally, its dimensions.

    Args:
        dataset: A list of rows, where each row is itself a list.
        start: Index of the first row to print (inclusive).
        end: Index at which printing stops (exclusive).
        rows_and_columns: When True, also print the total number of rows and
            the number of columns, measured on the first row.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty data set has no first row to measure, so report 0 columns
        # instead of raising IndexError on dataset[0].
        n_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print('Number of columns:', n_columns)

# Preview the first three rows of each raw data set, together with its size.
explore_data(apple, start=0, end=3, rows_and_columns=True)
explore_data(google, start=0, end=3, rows_and_columns=True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2

In [3]:
# Show the column names of both data sets: App Store first, then Google Play.
print(apple_header, google_header, sep='\n')

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


Click [here](https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps) for the documentation of Apple App Store's data
Click [here](https://www.kaggle.com/lava18/google-play-store-apps) for the documentation of Google Play Store's data

In [4]:
# Inspect the Google Play row stored at index 10472.
suspect_row = google[10472]
print(suspect_row)

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [5]:
# Delete the malformed Google Play row at index 10472: it has fewer fields
# than the header (its category value appears to be missing, which shifts
# every later column).
# The field-count check makes this cell safe to re-run. Once the bad row is
# gone, whichever row now sits at index 10472 is well-formed and is kept;
# an unconditional `del` would silently drop a valid app on every re-run.
bad_row_index = 10472
if len(google) > bad_row_index and len(google[bad_row_index]) != len(google_header):
    del google[bad_row_index]

Google Play data set has duplicate entries:

In [6]:
# Count duplicate app entries in the Google Play data.
google_duplicate = []   # one entry for every repeated occurrence of an app name
google_unique = []      # every app name once, in first-seen order
seen_names = set()      # same names as google_unique, kept for O(1) membership tests

for row in google:
    name = row[0]
    # Testing membership on the set avoids rescanning the ever-growing
    # google_unique list for every row (which made the loop quadratic).
    if name in seen_names:
        google_duplicate.append(name)
    else:
        seen_names.add(name)
        google_unique.append(name)

print('The number of duplicate apps: ',len(google_duplicate),'\n')
print('The number of unique apps: ', len(google_unique))

The number of duplicate apps:  1181 

The number of unique apps:  9659


I won't remove the duplicates randomly. Instead, for each app I'll look at the number of reviews in the 'Reviews' column (column index 3) and keep the entry with the highest number, because that entry is most likely the most recent one for that app.

In [7]:
# For every app name, record the largest review count found among its rows.
reviews_max = {}

for entry in google:
    app_name, review_count = entry[0], float(entry[3])
    # Store the count the first time a name shows up, or whenever it beats
    # the best count recorded so far for that name.
    if app_name not in reviews_max or reviews_max[app_name] < review_count:
        reviews_max[app_name] = review_count

print(len(reviews_max))

9659


In [8]:
# Build the de-duplicated Google Play data: for each app keep exactly one
# row, the one whose review count equals that app's maximum in reviews_max.
google_clean = []
already_added = set()  # names already kept; a set gives O(1) lookups instead of a list scan

for row in google:
    name = row[0]
    n_reviews = float(row[3])
    # The `not in already_added` check handles ties: when several rows of the
    # same app share the maximum review count, only the first one is kept.
    if n_reviews == reviews_max[name] and name not in already_added:
        google_clean.append(row)
        already_added.add(name)

In [9]:
# Peek at the first three de-duplicated rows, then show how many rows remain.
print(google_clean[:3], len(google_clean), sep='\n')

[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']]
9659


In [10]:
# Heuristic check for whether an app name is written in English
def englishlang(string, max_non_ascii=3):
    """Return True if `string` looks like English text, otherwise False.

    A character counts as non-English when its code point is above 127, i.e.
    outside the ASCII range. A few such characters are tolerated so that names
    containing an emoji or a symbol such as a trademark sign are not rejected.

    Args:
        string: The text to check (for example an app name).
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            The default of 3 means a fourth non-ASCII character makes the
            function return False.

    Returns:
        False as soon as more than `max_non_ascii` non-ASCII characters have
        been seen; True otherwise (including for an empty string).
    """
    non_ascii_count = 0
    for character in string:
        if ord(character) > 127:
            # A running counter is enough; the characters themselves are never used.
            non_ascii_count += 1
            if non_ascii_count > max_non_ascii:
                return False

    return True

# Try the check on a plain ASCII name, a Chinese name, and two names that
# contain a single symbol or emoji.
for sample_name in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(englishlang(sample_name))

True
False
True
True


In [11]:
# Keep only the apps whose names pass the English-name check.
# englishlang() already returns a bool, so it is used directly as the filter
# condition instead of being compared with `== True`.
gclean_english = [row for row in google_clean if englishlang(row[0])]  # column 0 is 'App'
aclean_english = [row for row in apple if englishlang(row[1])]         # column 1 is 'track_name'

In [12]:
# Preview the English-only data sets and report their sizes.
explore_data(gclean_english, start=0, end=3, rows_and_columns=True)
explore_data(aclean_english, start=0, end=3, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'G

In [13]:
# Keep only the free apps from each English-only data set.
# Google Play: column 6 ('Type') holds the string 'Free' for free apps.
g_cleanfreeng = [app for app in gclean_english if app[6] == 'Free']
# App Store: column 4 ('price') is numeric text; a price of 0 means free.
a_cleanfreeng = [app for app in aclean_english if float(app[4]) == 0]

In [14]:
# Preview the free, English-only data sets and report their sizes.
explore_data(g_cleanfreeng, start=0, end=3, rows_and_columns=True)
explore_data(a_cleanfreeng, start=0, end=3, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 8863
Number of columns: 13
['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'G

The aim of this project is to determine which kinds of apps attract more users, because the revenue generated by free apps is highly influenced by the number of people using them.

The validation strategy is as follows:
1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

So, we need to know what the most common genres are in each market.

In [15]:
# Reminder of the column names: Google Play first, then App Store.
print(google_header, apple_header, sep='\n')

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


In [16]:
# Build function to make freq table dictionary and to display table

def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    Args:
        dataset: An iterable of rows, each row indexable by `index`.
        index: Position of the column to tabulate.

    Returns:
        A dict mapping every distinct value found at `row[index]` to the
        percentage (0-100) of rows holding that value. Keys appear in the
        order in which the values were first encountered. An empty data set
        yields an empty dict.
    """
    counts = {}
    n_rows = 0

    for record in dataset:
        n_rows += 1
        key = record[index]
        counts[key] = counts.get(key, 0) + 1

    # Turn the absolute counts into percentages of the total row count.
    return {key: (count / n_rows) * 100 for key, count in counts.items()}

def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    Args:
        dataset: An iterable of rows, passed straight to freq_table().
        index: Position of the column to tabulate.

    Each output line has the form ``value : percentage``. Entries with equal
    percentages are ordered by value, descending, because the sort runs on
    (percentage, value) tuples in reverse.
    """
    shares = freq_table(dataset, index)
    # Put the percentage first so that tuple ordering sorts by it.
    ranked = [(share, label) for label, share in shares.items()]
    ranked.sort(reverse=True)

    for share, label in ranked:
        print(label, ':', share)

In [17]:
# Frequency table of App Store genres (column 11, 'prime_genre')
display_table(dataset=a_cleanfreeng, index=11)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


In [18]:
# Frequency table of Google Play categories (column 1, 'Category')
display_table(dataset=g_cleanfreeng, index=1)

FAMILY : 18.898792733837304
GAME : 9.725826469592688
TOOLS : 8.462146000225657
BUSINESS : 4.592124562789123
LIFESTYLE : 3.9038700214374367
PRODUCTIVITY : 3.8925871601038025
FINANCE : 3.7007785174320205
MEDICAL : 3.5315355974275078
SPORTS : 3.396141261423897
PERSONALIZATION : 3.317161232088458
COMMUNICATION : 3.2381812027530184
HEALTH_AND_FITNESS : 3.0802211440821394
PHOTOGRAPHY : 2.944826808078529
NEWS_AND_MAGAZINES : 2.798149610741284
SOCIAL : 2.6627552747376737
TRAVEL_AND_LOCAL : 2.335552296062281
SHOPPING : 2.245289405393208
BOOKS_AND_REFERENCE : 2.1437436533904997
DATING : 1.8616721200496444
VIDEO_PLAYERS : 1.7939749520478394
MAPS_AND_NAVIGATION : 1.399074805370642
FOOD_AND_DRINK : 1.241114746699763
EDUCATION : 1.1621347173643235
ENTERTAINMENT : 0.9590432133589079
LIBRARIES_AND_DEMO : 0.9364774906916393
AUTO_AND_VEHICLES : 0.9251946293580051
HOUSE_AND_HOME : 0.8236488773552973
WEATHER : 0.8010831546880289
EVENTS : 0.7108202640189552
PARENTING : 0.6544059573507841
ART_AND_DESIGN : 0

In [19]:
# Frequency table of Google Play genres (column 9, 'Genres'), a finer-grained
# alternative to the 'Category' column
display_table(dataset=g_cleanfreeng, index=9)

Tools : 8.450863138892023
Entertainment : 6.070179397495204
Education : 5.348076272142616
Business : 4.592124562789123
Productivity : 3.8925871601038025
Lifestyle : 3.8925871601038025
Finance : 3.7007785174320205
Medical : 3.5315355974275078
Sports : 3.463838429425702
Personalization : 3.317161232088458
Communication : 3.2381812027530184
Action : 3.102786866749408
Health & Fitness : 3.0802211440821394
Photography : 2.944826808078529
News & Magazines : 2.798149610741284
Social : 2.6627552747376737
Travel & Local : 2.324269434728647
Shopping : 2.245289405393208
Books & Reference : 2.1437436533904997
Simulation : 2.042197901387792
Dating : 1.8616721200496444
Arcade : 1.8503892587160102
Video Players & Editors : 1.771409229380571
Casual : 1.7601263680469368
Maps & Navigation : 1.399074805370642
Food & Drink : 1.241114746699763
Puzzle : 1.128286133363421
Racing : 0.9928917973598104
Role Playing : 0.9364774906916393
Libraries & Demo : 0.9364774906916393
Auto & Vehicles : 0.9251946293580051
S

In [20]:
# Display the average number of user ratings per genre for the App Store.
# (This cell works on a_cleanfreeng, the Apple data, not the Google data.)
a_uniqgen = freq_table(a_cleanfreeng,11)

# Accumulate totals for every genre in a single pass over the data, instead
# of rescanning the whole data set once per genre.
rating_totals = {genre: 0.0 for genre in a_uniqgen}
app_counts = {genre: 0 for genre in a_uniqgen}
for row in a_cleanfreeng:
    genre_app = row[11]                      # 'prime_genre'
    rating_totals[genre_app] += float(row[5])  # 'rating_count_tot'
    app_counts[genre_app] += 1

# Print in the key order of a_uniqgen (the order genres first appear).
for genre in a_uniqgen:
    avg_num_ratings = rating_totals[genre] / app_counts[genre]
    print(genre," ",avg_num_ratings)

Social Networking   71548.34905660378
Photo & Video   28441.54375
Games   22788.6696905016
Music   57326.530303030304
Reference   74942.11111111111
Health & Fitness   23298.015384615384
Weather   52279.892857142855
Utilities   18684.456790123455
Travel   28243.8
Shopping   26919.690476190477
News   21248.023255813954
Navigation   86090.33333333333
Lifestyle   16485.764705882353
Entertainment   14029.830708661417
Food & Drink   33333.92307692308
Sports   23008.898550724636
Book   39758.5
Finance   31467.944444444445
Education   7003.983050847458
Productivity   21028.410714285714
Business   7491.117647058823
Catalogs   4004.0
Medical   612.0


In [21]:
# Display the average number of installs per category for Google Play.
# (This cell works on g_cleanfreeng, the Google data, and on installs rather
# than ratings.)
g_uniqgen = freq_table(g_cleanfreeng,1)

# Accumulate totals for every category in a single pass over the data,
# instead of rescanning the whole data set once per category.
install_totals = {category: 0.0 for category in g_uniqgen}
app_counts = {category: 0 for category in g_uniqgen}
for row in g_cleanfreeng:
    category_app = row[1]                    # 'Category'
    # 'Installs' is text such as '10,000+'; strip '+' and ',' before converting.
    install_count = float(row[5].replace('+','').replace(',',''))
    install_totals[category_app] += install_count
    app_counts[category_app] += 1

# Print in the key order of g_uniqgen (the order categories first appear).
for category in g_uniqgen:
    avg_num_install = install_totals[category] / app_counts[category]
    print(category," ",avg_num_install)

ART_AND_DESIGN   1986335.0877192982
AUTO_AND_VEHICLES   647317.8170731707
BEAUTY   513151.88679245283
BOOKS_AND_REFERENCE   8767811.894736841
BUSINESS   1712290.1474201474
COMICS   817657.2727272727
COMMUNICATION   38456119.167247385
DATING   854028.8303030303
EDUCATION   1833495.145631068
ENTERTAINMENT   11640705.88235294
EVENTS   253542.22222222222
FINANCE   1387692.475609756
FOOD_AND_DRINK   1924897.7363636363
HEALTH_AND_FITNESS   4188821.9853479853
HOUSE_AND_HOME   1331540.5616438356
LIBRARIES_AND_DEMO   638503.734939759
LIFESTYLE   1437816.2687861272
GAME   15588015.603248259
FAMILY   3697848.1731343283
MEDICAL   120550.61980830671
SOCIAL   23253652.127118643
SHOPPING   7036877.311557789
PHOTOGRAPHY   17840110.40229885
SPORTS   3638640.1428571427
TRAVEL_AND_LOCAL   13984077.710144928
TOOLS   10801391.298666667
PERSONALIZATION   5201482.6122448975
PRODUCTIVITY   16787331.344927534
PARENTING   542603.6206896552
WEATHER   5074486.197183099
VIDEO_PLAYERS   24727872.452830188
NEWS_AND_