# Profitable App Profiles for the App Store and Google Play Store
## Data Cleaning and Basic Comparison Analysis

App data from the two sources are loaded and taken through various functions to extract clean data that can be compared to observe which app genres are most popular.

In [29]:
from csv import reader
# Load both datasets, closing each file as soon as its rows have been read.
# newline='' is what the csv module asks for on files handed to reader(), and
# the explicit UTF-8 encoding keeps non-ASCII app names from depending on the
# platform's default encoding.
with open('AppleStore.csv', encoding='utf8', newline='') as file_name:
    read_file = reader(file_name)
    apple = list(read_file)
apple_header = apple[0]   # first row holds the column names
apple_data = apple[1:]

with open('googleplaystore.csv', encoding='utf8', newline='') as opened_file:
    read_file = reader(opened_file)
    google = list(read_file)
google_header = google[0]  # first row holds the column names
google_data = google[1:]

explore_data is a function for taking a quick look at slices of the data lists. Its arguments are the dataset and 2 integers (start and end) that bound the rows to be sliced and printed; the optional rows_and_columns flag additionally prints the dataset's number of rows and columns.

In [30]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows of ``dataset[start:end]``, each followed by blank lines.

    When ``rows_and_columns`` is true, also print the total number of rows in
    ``dataset`` and the number of columns, taken from its first row.
    """
    for row in dataset[start:end]:
        # One write per row; same text as print(row) followed by print('\n').
        print(row, end='\n\n\n')

    if not rows_and_columns:
        return
    print('Number of rows:', len(dataset))
    print('Number of columns:', len(dataset[0]))

In [31]:
# Preview the first five App Store rows (the header row is not part of apple_data).
explore_data(apple_data, 0, 5)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']




In [32]:
# Preview the first five Google Play rows (the header row is not part of google_data).
explore_data(google_data, 0, 5)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up']




To delete incomplete data, a for loop is written to compare the length of each app entry(row) to the length of the header row and delete the ones that fall short.

In [33]:
# Drop rows that have fewer fields than the header.
# Deleting from a list while iterating over it makes the loop skip the row
# that follows each deletion, so the complete rows are collected first and
# then written back into the same list object in a single step.
expected_columns = len(google_header)
complete_rows = []
for index, app in enumerate(google_data):
    if len(app) < expected_columns:
        print(index)  # position of the incomplete row in the original data
    else:
        complete_rows.append(app)
google_data[:] = complete_rows

10472


# Cleaning Data

**Part One : Duplicated Data**

Duplicated data is found here by adding all unique app names to a list and identifying other instances of the same app name. 

Next, the entries with duplicates are compared based on the number of reviews since that is the only observed difference between them. The entry with the highest number of reviews is deemed the most up-to-date and is retained while the others are not added to a new, "cleaned" list called "cleaned_google_data".

In [34]:
# Split app names into first occurrences (unique_apps) and repeats
# (duplicate_apps).
duplicate_apps = []
unique_apps = []
# Set mirror of unique_apps: membership tests are O(1) instead of scanning
# the whole list for every row.
seen_names = set()

for app in google_data:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

# Sanity check against the expected number of repeated entries.
assert len(duplicate_apps) == 1181

In [35]:
# Map each app name to the highest review count seen among its entries.
max_reviews = {}

for app in google_data:
    name = app[0]
    try:
        reviews = float(app[3])
    except ValueError:
        # Report the malformed row's length and skip it; carrying on would
        # reuse the previous row's review count (or hit an undefined name on
        # the very first row).
        print(len(app))
        continue

    if name not in max_reviews or max_reviews[name] < reviews:
        max_reviews[name] = reviews

In [36]:
# Keep one row per app: the first row whose review count equals the maximum
# recorded for that name in max_reviews.
clean_google_data = []
already_added = []
# Set mirror of already_added so the per-row membership test is O(1) rather
# than a scan of the growing list.
added_names = set()

for app in google_data:
    name = app[0]
    reviews = float(app[3])

    # The name check stops a second row with the same (maximal) review count
    # from being added again.
    if max_reviews[name] == reviews and name not in added_names:
        clean_google_data.append(app)
        already_added.append(name)
        added_names.add(name)

# Sanity check against the expected number of distinct apps.
assert len(clean_google_data) == 9659

**Part Two: Removing Non-English and Costed Apps**

Apps with non-English names, defined as those containing more than 3 non-ASCII characters (ASCII characters have an ord value between 0 and 127), are removed from the list by a function written with the built-in ord function.

Non-free (costed) apps are also removed with a for loop that adds only apps with price value of '0' to the final list. 

In [37]:
def is_english(string):
    """Return True when at most three characters of ``string`` are non-ASCII.

    A character counts as non-ASCII when its code point is above 127.
    """
    outside_ascii = sum(1 for char in string if ord(char) > 127)
    return outside_ascii <= 3


# Quick sanity checks: a single emoji must not disqualify a name.
assert is_english('Instachat 😜')
assert is_english('Instagram')

In [38]:
# Keep only the rows whose app name passes is_english. The name sits in
# column 0 of the Google Play rows and in column 1 of the App Store rows.
g_apps_english = [app for app in clean_google_data if is_english(app[0])]
apple_apps_english = [app for app in apple_data if is_english(app[1])]

# Sanity checks against the expected sizes of the filtered datasets.
assert len(g_apps_english) == 9614
assert len(apple_apps_english) == 6183

In [39]:
# Keep only free apps. The price is the string '0' in column 7 of the Google
# Play rows and the string '0.0' in column 4 of the App Store rows.
android_final = [app for app in g_apps_english if app[7] == '0']
ios_final = [app for app in apple_apps_english if app[4] == '0.0']

# Sanity checks against the expected sizes of the final datasets.
assert len(android_final) == 8864
assert len(ios_final) == 3222

# Analysing/Comparing Data

To determine the apps that attract more users both on android and ios, the desired app profiles are those that are successful in both markets. The apps can first be narrowed down by genre to determine the current popular app genres.  

Two functions are written to rework and display tables. 

In [40]:
def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    ``dataset`` is an iterable of rows and ``index`` selects the column.
    The result maps every value found in that column to its share of all
    rows, as a percentage rounded to two decimals. Keys appear in the order
    the values were first encountered.
    """
    counts = {}
    for row in dataset:
        cell = row[index]
        counts[cell] = counts.get(cell, 0) + 1

    # Every row contributed exactly one count, so this equals the row count.
    total = sum(counts.values())
    return {cell: round(count / total * 100, 2) for cell, count in counts.items()}


def display_table(dataset, index):
    """Print each value of column ``index`` with its percentage, largest first.

    The percentages come from ``freq_table``; one ``value : percentage`` line
    is printed per distinct value.
    """
    percentages = freq_table(dataset, index)
    # Sorting (percentage, value) pairs in reverse order puts the largest
    # share first and breaks ties by value, descending.
    ranked = sorted(
        ((share, value) for value, share in percentages.items()),
        reverse=True,
    )
    for share, value in ranked:
        print(value, ':', share)

In [41]:
# Show both header rows so the column positions used below can be looked up.
print(apple_header)
print(google_header)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [42]:
# Share of the free English Google Play apps in each 'Category' (column 1).
display_table(android_final, 1) #categories

FAMILY : 18.91
GAME : 9.72
TOOLS : 8.46
BUSINESS : 4.59
LIFESTYLE : 3.9
PRODUCTIVITY : 3.89
FINANCE : 3.7
MEDICAL : 3.53
SPORTS : 3.4
PERSONALIZATION : 3.32
COMMUNICATION : 3.24
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.94
NEWS_AND_MAGAZINES : 2.8
SOCIAL : 2.66
TRAVEL_AND_LOCAL : 2.34
SHOPPING : 2.25
BOOKS_AND_REFERENCE : 2.14
DATING : 1.86
VIDEO_PLAYERS : 1.79
MAPS_AND_NAVIGATION : 1.4
FOOD_AND_DRINK : 1.24
EDUCATION : 1.16
ENTERTAINMENT : 0.96
LIBRARIES_AND_DEMO : 0.94
AUTO_AND_VEHICLES : 0.93
HOUSE_AND_HOME : 0.82
WEATHER : 0.8
EVENTS : 0.71
PARENTING : 0.65
ART_AND_DESIGN : 0.64
COMICS : 0.62
BEAUTY : 0.6


In [43]:
# Share of the free English Google Play apps in each 'Genres' value (column 9).
display_table(android_final, 9) #genres

Tools : 8.45
Entertainment : 6.07
Education : 5.35
Business : 4.59
Productivity : 3.89
Lifestyle : 3.89
Finance : 3.7
Medical : 3.53
Sports : 3.46
Personalization : 3.32
Communication : 3.24
Action : 3.1
Health & Fitness : 3.08
Photography : 2.94
News & Magazines : 2.8
Social : 2.66
Travel & Local : 2.32
Shopping : 2.25
Books & Reference : 2.14
Simulation : 2.04
Dating : 1.86
Arcade : 1.85
Video Players & Editors : 1.77
Casual : 1.76
Maps & Navigation : 1.4
Food & Drink : 1.24
Puzzle : 1.13
Racing : 0.99
Role Playing : 0.94
Libraries & Demo : 0.94
Auto & Vehicles : 0.93
Strategy : 0.91
House & Home : 0.82
Weather : 0.8
Events : 0.71
Adventure : 0.68
Comics : 0.61
Beauty : 0.6
Art & Design : 0.6
Parenting : 0.5
Card : 0.45
Casino : 0.43
Trivia : 0.42
Educational;Education : 0.39
Board : 0.38
Educational : 0.37
Education;Education : 0.34
Word : 0.26
Casual;Pretend Play : 0.24
Music : 0.2
Racing;Action & Adventure : 0.17
Puzzle;Brain Games : 0.17
Entertainment;Music & Video : 0.17
Casual;

In [44]:
# Share of the free English App Store apps in each 'prime_genre' (column 11).
display_table(ios_final, 11) #genres

Games : 58.16
Entertainment : 7.88
Photo & Video : 4.97
Education : 3.66
Social Networking : 3.29
Shopping : 2.61
Utilities : 2.51
Sports : 2.14
Music : 2.05
Health & Fitness : 2.02
Productivity : 1.74
Lifestyle : 1.58
News : 1.33
Travel : 1.24
Finance : 1.12
Weather : 0.87
Food & Drink : 0.81
Reference : 0.56
Business : 0.53
Book : 0.43
Navigation : 0.19
Medical : 0.19
Catalogs : 0.12


To calculate, per app genre on the App Store, first the average user rating and then the average number of user ratings:

In [45]:
# Average rating per App Store genre (genre is column 11).
# NOTE(review): column 8 is 'user_rating_ver' in the header; 'user_rating'
# is column 7 -- confirm which of the two is intended here.
genres = freq_table(ios_final, 11)
ratings_dict = {}

for genre in genres:
    rating_sum = 0
    app_count = 0
    for app in ios_final:
        if app[11] != genre:
            continue
        rating_sum += float(app[8])
        app_count += 1
    ratings_dict[genre] = round(rating_sum / app_count, 2)

# Displayed in ascending order of average rating.
dict(sorted(ratings_dict.items(), key=lambda item: item[1]))

{'Navigation': 2.25,
 'News': 2.66,
 'Sports': 2.68,
 'Travel': 2.74,
 'Finance': 2.85,
 'Lifestyle': 2.92,
 'Social Networking': 2.99,
 'Weather': 3.02,
 'Business': 3.06,
 'Education': 3.11,
 'Utilities': 3.12,
 'Book': 3.14,
 'Food & Drink': 3.25,
 'Medical': 3.25,
 'Entertainment': 3.32,
 'Photo & Video': 3.38,
 'Shopping': 3.49,
 'Health & Fitness': 3.62,
 'Reference': 3.86,
 'Games': 3.91,
 'Music': 3.93,
 'Productivity': 3.94,
 'Catalogs': 4.0}

In [46]:
# Average number of ratings ('rating_count_tot', column 5) per App Store
# genre. The genre is read with index -5, counting from the end of each row.
genres = freq_table(ios_final, -5)
ratings_dict = {}

for genre in genres:
    count_sum = 0
    app_count = 0
    for app in ios_final:
        if app[-5] != genre:
            continue
        count_sum += float(app[5])
        app_count += 1
    ratings_dict[genre] = round(count_sum / app_count, 2)

# Displayed from the highest average down to the lowest.
dict(sorted(ratings_dict.items(), key=lambda item: item[1], reverse=True))

{'Navigation': 86090.33,
 'Reference': 74942.11,
 'Social Networking': 71548.35,
 'Music': 57326.53,
 'Weather': 52279.89,
 'Book': 39758.5,
 'Food & Drink': 33333.92,
 'Finance': 31467.94,
 'Photo & Video': 28441.54,
 'Travel': 28243.8,
 'Shopping': 26919.69,
 'Health & Fitness': 23298.02,
 'Sports': 23008.9,
 'Games': 22788.67,
 'News': 21248.02,
 'Productivity': 21028.41,
 'Utilities': 18684.46,
 'Lifestyle': 16485.76,
 'Entertainment': 14029.83,
 'Business': 7491.12,
 'Education': 7003.98,
 'Catalogs': 4004.0,
 'Medical': 612.0}

In [47]:
# List every Navigation app with its total number of ratings (column 5).
navigation_apps = (app for app in ios_final if app[-5] == 'Navigation')
for app in navigation_apps:
    print(app[1], ':', app[5])

Waze - GPS Navigation, Maps & Real-time Traffic : 345046
Google Maps - Navigation & Transit : 154911
Geocaching® : 12811
CoPilot GPS – Car Navigation & Offline Maps : 3582
ImmobilienScout24: Real Estate Search in Germany : 187
Railway Route Search : 5


Although Navigation apps have the highest average number of ratings, it's due to market domination by only 2 apps. This is not a true depiction of a truly popular genre.

In [48]:
# List every News app with its total number of ratings (column 5).
# Replace 'News' with 'Reference' to inspect that genre instead.
news_apps = (app for app in ios_final if app[-5] == 'News')
for app in news_apps:
    print(app[1], ':', app[5])

Twitter : 354058
Fox News : 132703
CNN: Breaking US & World News, Live Video : 112886
Reddit Official App: All That's Trending and Viral : 67560
USA TODAY : 61724
ABC News - US & World News + Live Video : 48407
NBC News : 32881
HuffPost - News, Politics & Entertainment : 29107
The Washington Post Classic : 18572
WIRED Magazine : 12074
CBS News - Watch Free Live Breaking News : 11691
The Guardian : 8176
AOL: News, Email, Weather & Video : 5233
SmartNews - Trending News & Stories : 4645
MSNBC : 3692
LotteryHUB : 2417
theSkimm : 1765
Quartz • News in a whole new way : 1267
Lotto Results - Mega Millions Powerball Lottery : 794
TopBuzz: Best Viral Videos, GIFs, TV & News : 692
Ticket Scanner for Powerball & MegaMillions Pool : 581
FOCUS Online - Aktuelle Nachrichten : 373
SPIEGEL ONLINE - Nachrichten : 299
n-tv Nachrichten : 273
CNN Politics : 254
Tagesschau : 233
Fresco — Be a part of the news : 219
News Break - Local & World Breaking News & Radio : 173
OPM Alert : 172
franceinfo - l'actua

To calculate the average number of installs per app category on the Google Play Store:

In [49]:
# Average number of installs per Google Play category (category is column 1).
# 'Installs' (column 5) is text such as '10,000+', so the commas and the plus
# sign are stripped before the value is converted to a number.
categories = freq_table(android_final, 1)
installs_dict = {}

for category in categories:
    install_sum = 0
    app_count = 0
    for app in android_final:
        if app[1] != category:
            continue
        install_sum += float(app[5].replace(',', '').replace('+', ''))
        app_count += 1
    installs_dict[category] = round(install_sum / app_count, 2)

# Displayed from the highest average down to the lowest.
dict(sorted(installs_dict.items(), key=lambda item: item[1], reverse=True))

{'COMMUNICATION': 38456119.17,
 'VIDEO_PLAYERS': 24727872.45,
 'SOCIAL': 23253652.13,
 'PHOTOGRAPHY': 17840110.4,
 'PRODUCTIVITY': 16787331.34,
 'GAME': 15588015.6,
 'TRAVEL_AND_LOCAL': 13984077.71,
 'ENTERTAINMENT': 11640705.88,
 'TOOLS': 10801391.3,
 'NEWS_AND_MAGAZINES': 9549178.47,
 'BOOKS_AND_REFERENCE': 8767811.89,
 'SHOPPING': 7036877.31,
 'PERSONALIZATION': 5201482.61,
 'WEATHER': 5074486.2,
 'HEALTH_AND_FITNESS': 4188821.99,
 'MAPS_AND_NAVIGATION': 4056941.77,
 'FAMILY': 3695641.82,
 'SPORTS': 3638640.14,
 'ART_AND_DESIGN': 1986335.09,
 'FOOD_AND_DRINK': 1924897.74,
 'EDUCATION': 1833495.15,
 'BUSINESS': 1712290.15,
 'LIFESTYLE': 1437816.27,
 'FINANCE': 1387692.48,
 'HOUSE_AND_HOME': 1331540.56,
 'DATING': 854028.83,
 'COMICS': 817657.27,
 'AUTO_AND_VEHICLES': 647317.82,
 'LIBRARIES_AND_DEMO': 638503.73,
 'PARENTING': 542603.62,
 'BEAUTY': 513151.89,
 'EVENTS': 253542.22,
 'MEDICAL': 120550.62}

In [50]:
# List every COMMUNICATION app with its install bracket (column 5).
communication_apps = (app for app in android_final if app[1] == 'COMMUNICATION')
for app in communication_apps:
    print(app[0], ':', app[5])

WhatsApp Messenger : 1,000,000,000+
Messenger for SMS : 10,000,000+
My Tele2 : 5,000,000+
imo beta free calls and text : 100,000,000+
Contacts : 50,000,000+
Call Free – Free Call : 5,000,000+
Web Browser & Explorer : 5,000,000+
Browser 4G : 10,000,000+
MegaFon Dashboard : 10,000,000+
ZenUI Dialer & Contacts : 10,000,000+
Cricket Visual Voicemail : 10,000,000+
TracFone My Account : 1,000,000+
Xperia Link™ : 10,000,000+
TouchPal Keyboard - Fun Emoji & Android Keyboard : 10,000,000+
Skype Lite - Free Video Call & Chat : 5,000,000+
My magenta : 1,000,000+
Android Messages : 100,000,000+
Google Duo - High Quality Video Calls : 500,000,000+
Seznam.cz : 1,000,000+
Antillean Gold Telegram (original version) : 100,000+
AT&T Visual Voicemail : 10,000,000+
GMX Mail : 10,000,000+
Omlet Chat : 10,000,000+
My Vodacom SA : 5,000,000+
Microsoft Edge : 5,000,000+
Messenger – Text and Video Chat for Free : 1,000,000,000+
imo free video calls and chat : 500,000,000+
Calls & Text by Mo+ : 5,000,000+
free 

Likewise, the average seems to be dominated by only a few apps. 

In [51]:
# Compare the COMMUNICATION average with and without the apps that reach
# 100,000,000 installs or more.
lower_than_benchmark = []
total_count = 0

for app in android_final:
    if app[1] != 'COMMUNICATION':
        continue
    total_count += 1
    # 'Installs' is text such as '10,000+'; strip ',' and '+' before converting.
    n_installs = float(app[5].replace(',', '').replace('+', ''))
    if n_installs < 100000000:
        lower_than_benchmark.append(n_installs)

new_average = sum(lower_than_benchmark) / len(lower_than_benchmark)
old_average = installs_dict['COMMUNICATION']

# Relative drop of the average, as a percentage of the original average.
perc_diff = 100 * ((old_average - new_average) / old_average)
removed_count = total_count - len(lower_than_benchmark)

print('Removing only', removed_count, 'apps reduces average number of installation by', round(perc_diff), '%')

Removing only 27 apps reduces average number of installation by 91 %


In [52]:
# List every SOCIAL app with its install bracket (column 5).
social_apps = (app for app in android_final if app[1] == 'SOCIAL')
for app in social_apps:
    print(app[0], ':', app[5])

Facebook : 1,000,000,000+
Facebook Lite : 500,000,000+
Tumblr : 100,000,000+
Social network all in one 2018 : 100,000+
Pinterest : 100,000,000+
TextNow - free text + calls : 10,000,000+
Google+ : 1,000,000,000+
The Messenger App : 1,000,000+
Messenger Pro : 1,000,000+
Free Messages, Video, Chat,Text for Messenger Plus : 1,000,000+
Telegram X : 5,000,000+
The Video Messenger App : 100,000+
Jodel - The Hyperlocal App : 1,000,000+
Hide Something - Photo, Video : 5,000,000+
Love Sticker : 1,000,000+
Web Browser & Fast Explorer : 5,000,000+
LiveMe - Video chat, new friends, and make money : 10,000,000+
VidStatus app - Status Videos & Status Downloader : 5,000,000+
Love Images : 1,000,000+
Web Browser ( Fast & Secure Web Explorer) : 500,000+
SPARK - Live random video chat & meet new people : 5,000,000+
Golden telegram : 50,000+
Facebook Local : 1,000,000+
Meet – Talk to Strangers Using Random Video Chat : 5,000,000+
MobilePatrol Public Safety App : 1,000,000+
💘 WhatsLov: Smileys of love, sti

In [53]:
# Compare the SOCIAL average with and without the apps that reach
# 100,000,000 installs or more.
lower_than_benchmark = []
total_count = 0

for app in android_final:
    if app[1] != 'SOCIAL':
        continue
    total_count += 1
    # 'Installs' is text such as '10,000+'; strip ',' and '+' before converting.
    installs = float(app[5].replace(',', '').replace('+', ''))
    if installs < 100000000:
        lower_than_benchmark.append(installs)

new_average = sum(lower_than_benchmark) / len(lower_than_benchmark)
old_average = installs_dict['SOCIAL']

# Relative drop of the average, as a percentage of the original average.
perc_diff = 100 * ((old_average - new_average) / old_average)
removed_count = total_count - len(lower_than_benchmark)

print('Removing only', removed_count, 'apps reduces average number of installation by', round(perc_diff), '%')

Removing only 13 apps reduces average number of installation by 87 %


In [54]:
# for category in installs_dict:
#     lower_than_benchmark = []
#     total_count = 0

#     for app in android_final:
#         installs = app[5]
#         installs = installs.replace(',', '')
#         installs = installs.replace('+', '')
#         if (app[1] == category) :
#             total_count += 1
#             if float(installs) < 100000000:
#                 lower_than_benchmark.append(float(installs))
                
#     new_average = (sum(lower_than_benchmark)/len(lower_than_benchmark))
#     old_average = (installs_dict[category])
#     perc_diff = 100*((old_average-new_average)/old_average)
    
#     print(category, ': ', perc_diff)
# #     if perc_diff < 90:
# #         print(category)
    