# Type of apps that attract more users

Since we build apps that are free to download and install, and our main source of revenue consists of in-app ads. This means our revenue for any given app is mostly influenced by the number of users who use our app — the more users that see and engage with the ads, the better. 

Our **goal** for this project is to analyze data to help our developers understand what type of apps are likely to attract more users.

In [1]:
import csv

In [2]:
with open('AppleStore.csv', encoding='utf8', newline='') as f:
    reader = csv.reader(f)
    applestore = list(reader)

In [3]:
with open('googleplaystore.csv', encoding='utf8', newline='') as f:
    reader = csv.reader(f)
    googleplaystore = list(reader)

In [4]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [5]:
explore_data(applestore, 0, 5, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 7198
Number of columns: 16


In [6]:
explore_data(googleplaystore, 0, 5, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 10842
Number of columns: 13


In [7]:
explore_data(googleplaystore, 0, 1, rows_and_columns=False)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']




In [8]:
explore_data(applestore, 0, 1, rows_and_columns=False)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']




In [9]:
explore_data(googleplaystore, 10472, 10475, rows_and_columns=False)

['Xposed Wi-Fi-Pwd', 'PERSONALIZATION', '3.5', '1042', '404k', '100,000+', 'Free', '0', 'Everyone', 'Personalization', 'August 5, 2014', '3.0.0', '4.0.3 and up']


['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']




In [10]:
del googleplaystore[10473]

In [11]:
explore_data(googleplaystore, 10470, 10476, rows_and_columns=False)

['TownWiFi | Wi-Fi Everywhere', 'COMMUNICATION', '3.9', '2372', '58M', '500,000+', 'Free', '0', 'Everyone', 'Communication', 'August 2, 2018', '4.2.1', '4.2 and up']


['Jazz Wi-Fi', 'COMMUNICATION', '3.4', '49', '4.0M', '10,000+', 'Free', '0', 'Everyone', 'Communication', 'February 10, 2017', '0.1', '2.3 and up']


['Xposed Wi-Fi-Pwd', 'PERSONALIZATION', '3.5', '1042', '404k', '100,000+', 'Free', '0', 'Everyone', 'Personalization', 'August 5, 2014', '3.0.0', '4.0.3 and up']


['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']


['Sat-Fi Voice', 'COMMUNICATION', '3.4', '37', '14M', '1,000+', 'Free', '0', 'Everyone', 'Communication', 'November 21, 2014', '2.2.1.5', '2.2 and up']


['Wi-Fi Visualizer', 'TOOLS', '3.9', '132', '2.6M', '50,000+', 'Free', '0', 'Everyone', 'Tools', 'May 17, 2017', '0.0.9', '2.3 and up']




### Removing duplicate apps from apple and google play store

Apple Store and Google Play Store both have duplicate app names in dataset. Lets print a few rows to confirm it.

Duplicate apps in applestore:-

In [12]:
apple_unique_apps = []
apple_duplicate_apps = []

for app in applestore:
    app_name = app[1]

    if app_name not in apple_unique_apps:
        apple_unique_apps.append(app_name)
    else:
        apple_duplicate_apps.append(app)

Duplicate apps in googleplaystore:-

In [13]:
googleplaystore_unique_apps = []
googleplaystore_duplicate_apps = []

for app in googleplaystore:
    app_name = app[0]
    
    if app_name not in googleplaystore_unique_apps:
        googleplaystore_unique_apps.append(app_name)
    else:
        googleplaystore_duplicate_apps.append(app)

Name of duplicate apps in applestore dataset:-

In [14]:
for row in apple_duplicate_apps:
    print(row[1])

Mannequin Challenge
VR Roller Coaster


Name of duplicate apps in googleplaystore dataset:-

In [15]:
len(googleplaystore_duplicate_apps)

1181

In [16]:
for row in googleplaystore_duplicate_apps:
    print(row[0])

Quick PDF Scanner + OCR FREE
Box
Google My Business
ZOOM Cloud Meetings
join.me - Simple Meetings
Box
Zenefits
Google Ads
Google My Business
Slack
FreshBooks Classic
Insightly CRM
QuickBooks Accounting: Invoicing & Expenses
HipChat - Chat Built for Teams
Xero Accounting Software
MailChimp - Email, Marketing Automation
Crew - Free Messaging and Scheduling
Asana: organize team projects
Google Analytics
AdWords Express
Accounting App - Zoho Books
Invoice & Time Tracking - Zoho
join.me - Simple Meetings
Invoice 2go — Professional Invoices and Estimates
SignEasy | Sign and Fill PDF and other Documents
Quick PDF Scanner + OCR FREE
Genius Scan - PDF Scanner
Tiny Scanner - PDF Scanner App
Fast Scanner : Free PDF Scan
Mobile Doc Scanner (MDScan) Lite
TurboScan: scan documents and receipts in PDF
Tiny Scanner Pro: PDF Doc Scan
Docs To Go™ Free Office Suite
OfficeSuite : Free Office + PDF Editor
Slack
QuickBooks Accounting: Invoicing & Expenses
WhatsApp Messenger
Messenger – Text and Video Chat f

example for how many rows of 'Quick PDF Scanner + OCR FREE' app duplicates exist:-

In [17]:
for app in googleplaystore:
    name = app[0]
    if name == 'Quick PDF Scanner + OCR FREE':
        print(app)

['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80804', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']


Removing duplicate rows in applestore dataset:-

In [18]:
for apps in apple_duplicate_apps:
    dup_name = apps[1]
    i = 0
    reviews = []
    index = 0
    for app in applestore:
        i = i + 1
        name = app[1]
        if name == dup_name:
            if len(reviews) == 0:
                reviews = [app[5]]
                index = i - 1
            else:
                if app[5] > reviews[0]:
                    del applestore[index]
                    reviews = [app[5]]
                    index = i - 1
                else:
                    del applestore[i - 1]                   

In [19]:
len(applestore)

7196

Removing duplicate rows in googleplaystore dataset:-

In [20]:
for apps in googleplaystore_duplicate_apps:
    dup_name = apps[0]
    i = 0
    reviews = []
    index = 0
    for app in googleplaystore:
        i = i + 1
        name = app[0]
        if name == dup_name:
            if len(reviews) == 0:
                reviews = [app[3]]
                index = i - 1
            else:
                if app[3] > reviews[0]:
                    del googleplaystore[index]
                    reviews = [app[3]]
                    index = i - 2
                else:
                    del googleplaystore[i - 1]

In [21]:
len(googleplaystore)

9660

In [22]:
def check_if_char_is_english(_str):
    i = 0
    for character in _str:
        if ord(character) > 127:
                i += 1
    if i > 3:
        return False
    return True            

In [23]:
check_if_char_is_english('PP播播播SS播')

False

In [24]:
googleplaystore_english = []
for app in googleplaystore:
    name = app[0]
    if check_if_char_is_english(name):
        googleplaystore_english.append(app)

In [25]:
len(googleplaystore_english)

9615

In [26]:
applestore_english = []
for app in applestore:
    name = app[0]
    if check_if_char_is_english(name):
        applestore_english.append(app)

In [27]:
len(applestore_english)

7196

In [28]:
float(applestore_english[1][4]) == 0

True

In [29]:
applestore_english_free =[]

i = 0

for app in applestore_english:
    i += 1
    if i > 1:
        price = float(app[4])
        if price == 0:
            applestore_english_free.append(app)

In [30]:
len(applestore_english)

7196

In [31]:
len(applestore_english_free)

4054

In [32]:
googleplaystore_english_free =[]

i = 0

for app in googleplaystore_english:
    i += 1
    if i > 1:
        price = app[6]
#         print(price)
        if price == 'Free':
#             print(price)
            googleplaystore_english_free.append(app)

In [33]:
len(googleplaystore_english_free)

8861

As our goal is to ultimately find profitable apps, we need to identify apps that are likely to attract more users because the number of people using our apps affect our revenue.

We are assuming below validation strategy to be adopted for any app idea:-
1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

We can start by identying types of apps which are most popular. 'prime_genre' column is available in applestore dataset and 'Category' & 'Genres' columns are available in googleplaystore dataset.

In [34]:
def freq_table(dataset, index):
    dict_ = {}
    i = 0
    for app in dataset:
        i += 1
        if i > 1:
            genre = app[index]
            if genre in dict_.keys():
                dict_[genre] += 1
            else:
                dict_[genre] = 1
    return dict_

In [35]:
freq_table(applestore_english_free, 11)

{'Photo & Video': 167,
 'Games': 2255,
 'Music': 67,
 'Social Networking': 142,
 'Reference': 20,
 'Health & Fitness': 76,
 'Weather': 31,
 'Utilities': 109,
 'Travel': 56,
 'Shopping': 121,
 'News': 58,
 'Navigation': 20,
 'Lifestyle': 94,
 'Entertainment': 334,
 'Food & Drink': 43,
 'Sports': 79,
 'Book': 66,
 'Finance': 84,
 'Education': 132,
 'Productivity': 62,
 'Business': 20,
 'Catalogs': 9,
 'Medical': 8}

In [36]:
freq_table(googleplaystore_english_free, 1)

{'ART_AND_DESIGN': 56,
 'AUTO_AND_VEHICLES': 82,
 'BEAUTY': 53,
 'BOOKS_AND_REFERENCE': 190,
 'BUSINESS': 407,
 'COMICS': 55,
 'COMMUNICATION': 287,
 'DATING': 165,
 'EDUCATION': 104,
 'ENTERTAINMENT': 85,
 'EVENTS': 63,
 'FINANCE': 328,
 'FOOD_AND_DRINK': 110,
 'HEALTH_AND_FITNESS': 273,
 'HOUSE_AND_HOME': 73,
 'LIBRARIES_AND_DEMO': 83,
 'LIFESTYLE': 346,
 'GAME': 859,
 'FAMILY': 1677,
 'MEDICAL': 312,
 'SOCIAL': 236,
 'SHOPPING': 199,
 'PHOTOGRAPHY': 261,
 'SPORTS': 301,
 'TRAVEL_AND_LOCAL': 207,
 'TOOLS': 749,
 'PERSONALIZATION': 294,
 'PRODUCTIVITY': 345,
 'PARENTING': 58,
 'WEATHER': 71,
 'VIDEO_PLAYERS': 159,
 'NEWS_AND_MAGAZINES': 248,
 'MAPS_AND_NAVIGATION': 124}

In [37]:
freq_table(googleplaystore_english_free, 9)

{'Art & Design': 52,
 'Art & Design;Creativity': 6,
 'Auto & Vehicles': 82,
 'Beauty': 53,
 'Books & Reference': 190,
 'Business': 407,
 'Comics': 54,
 'Comics;Creativity': 1,
 'Communication': 287,
 'Dating': 165,
 'Education': 474,
 'Education;Creativity': 4,
 'Education;Education': 30,
 'Education;Pretend Play': 5,
 'Education;Brain Games': 3,
 'Entertainment': 538,
 'Entertainment;Brain Games': 7,
 'Entertainment;Creativity': 3,
 'Entertainment;Music & Video': 15,
 'Events': 63,
 'Finance': 328,
 'Food & Drink': 110,
 'Health & Fitness': 273,
 'House & Home': 73,
 'Libraries & Demo': 83,
 'Lifestyle': 345,
 'Lifestyle;Pretend Play': 1,
 'Arcade': 164,
 'Puzzle': 100,
 'Racing': 88,
 'Sports': 307,
 'Casual': 155,
 'Simulation': 181,
 'Adventure': 60,
 'Trivia': 37,
 'Action': 275,
 'Word': 23,
 'Role Playing': 83,
 'Strategy': 80,
 'Board': 33,
 'Card': 39,
 'Music': 18,
 'Action;Action & Adventure': 9,
 'Casual;Brain Games': 12,
 'Educational;Creativity': 3,
 'Puzzle;Brain Games':

In [38]:
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

In [39]:
display_table(applestore_english_free, 11)

Games : 2255
Entertainment : 334
Photo & Video : 167
Social Networking : 142
Education : 132
Shopping : 121
Utilities : 109
Lifestyle : 94
Finance : 84
Sports : 79
Health & Fitness : 76
Music : 67
Book : 66
Productivity : 62
News : 58
Travel : 56
Food & Drink : 43
Weather : 31
Reference : 20
Navigation : 20
Business : 20
Catalogs : 9
Medical : 8


1. Most common genre from app store is "Games"
2. No. of gaming apps are almost 7 times that of the second highest category "Entertainment"
3. Entertainment apps are greater in number than other types of apps which are designed for practical purposes
4. Games seems to be the recommended profile. However, we may need to also correlate this data with number of users

In [40]:
display_table(googleplaystore_english_free, 1)

FAMILY : 1677
GAME : 859
TOOLS : 749
BUSINESS : 407
LIFESTYLE : 346
PRODUCTIVITY : 345
FINANCE : 328
MEDICAL : 312
SPORTS : 301
PERSONALIZATION : 294
COMMUNICATION : 287
HEALTH_AND_FITNESS : 273
PHOTOGRAPHY : 261
NEWS_AND_MAGAZINES : 248
SOCIAL : 236
TRAVEL_AND_LOCAL : 207
SHOPPING : 199
BOOKS_AND_REFERENCE : 190
DATING : 165
VIDEO_PLAYERS : 159
MAPS_AND_NAVIGATION : 124
FOOD_AND_DRINK : 110
EDUCATION : 104
ENTERTAINMENT : 85
LIBRARIES_AND_DEMO : 83
AUTO_AND_VEHICLES : 82
HOUSE_AND_HOME : 73
WEATHER : 71
EVENTS : 63
PARENTING : 58
ART_AND_DESIGN : 56
COMICS : 55
BEAUTY : 53


In [41]:
display_table(googleplaystore_english_free, 9)

Tools : 748
Entertainment : 538
Education : 474
Business : 407
Productivity : 345
Lifestyle : 345
Finance : 328
Medical : 312
Sports : 307
Personalization : 294
Communication : 287
Action : 275
Health & Fitness : 273
Photography : 261
News & Magazines : 248
Social : 236
Travel & Local : 206
Shopping : 199
Books & Reference : 190
Simulation : 181
Dating : 165
Arcade : 164
Video Players & Editors : 157
Casual : 155
Maps & Navigation : 124
Food & Drink : 110
Puzzle : 100
Racing : 88
Role Playing : 83
Libraries & Demo : 83
Auto & Vehicles : 82
Strategy : 80
House & Home : 73
Weather : 71
Events : 63
Adventure : 60
Comics : 54
Beauty : 53
Art & Design : 52
Parenting : 44
Card : 39
Casino : 38
Trivia : 37
Educational;Education : 35
Educational : 33
Board : 33
Education;Education : 30
Word : 23
Casual;Pretend Play : 21
Music : 18
Puzzle;Brain Games : 16
Racing;Action & Adventure : 15
Entertainment;Music & Video : 15
Casual;Brain Games : 12
Casual;Action & Adventure : 12
Arcade;Action & Advent

Google Play Store dataset with category column as index has Family, Game, Tools as top three. However, there is not significant difference between them.

Google Play Store dataset has Tools, Entertainment and Education as the top three genres.

But overall there is not variation like apple store dataset, where majority apps belong entertainment.

In [42]:
applestore_genres = freq_table(applestore_english_free, 11)

In [43]:
for genre in applestore_genres:
    total = 0
    len_genre = 0
    i = 0
    for app in applestore_english_free:
        i += 1
        if i > 1:
            genre_app = app[11] #get the genre
            if genre_app == genre:
                total += float(app[7]) #get the user rating of that app
                len_genre += 1
    avg_usr_ratings = total / len_genre
    print(genre + ' : ' + str(round(avg_usr_ratings,2)))

Photo & Video : 3.79
Games : 3.53
Music : 3.94
Social Networking : 2.99
Reference : 3.3
Health & Fitness : 3.58
Weather : 3.26
Utilities : 3.45
Travel : 3.38
Shopping : 3.53
News : 2.88
Navigation : 2.2
Lifestyle : 2.59
Entertainment : 3.15
Food & Drink : 3.03
Sports : 2.92
Book : 1.6
Finance : 2.22
Education : 3.48
Productivity : 3.96
Business : 3.5
Catalogs : 1.83
Medical : 2.88


Games has 3.53 avg user rating and it is the most common genre. Therefore, Games appears to be best app profile  recommendation.

In [44]:
googleplaystore_genres = freq_table(googleplaystore_english_free, 1)

In [45]:
highest_install = []
for category in googleplaystore_genres:
    total = 0
    len_category = 0
    i = 0
    for app in googleplaystore_english_free:
        i += 1
        if i > 1:
            category_app = app[1] #get the genre
            if category_app == category:
                total += float(app[5].replace('+', '').replace(',', '')) #get the number of installs and replace '+ & , with empty string'
                len_category += 1
    avg_installs = total / len_category
    
    if len(highest_install) == 0:
        highest_install.append([category, avg_installs])
    else:
        if highest_install[0][1] < avg_installs:
            highest_install = [[category, avg_installs]]
    
    print(category + ' : ' + str(round(avg_installs,2)))

ART_AND_DESIGN : 2021626.79
AUTO_AND_VEHICLES : 647317.82
BEAUTY : 513151.89
BOOKS_AND_REFERENCE : 8767811.89
BUSINESS : 1712290.15
COMICS : 817657.27
COMMUNICATION : 38456119.17
DATING : 854028.83
EDUCATION : 1820673.08
ENTERTAINMENT : 11640705.88
EVENTS : 253542.22
FINANCE : 1387692.48
FOOD_AND_DRINK : 1924897.74
HEALTH_AND_FITNESS : 4188821.99
HOUSE_AND_HOME : 1331540.56
LIBRARIES_AND_DEMO : 638503.73
LIFESTYLE : 1437816.27
GAME : 15560965.6
FAMILY : 3696479.24
MEDICAL : 120616.49
SOCIAL : 23253652.13
SHOPPING : 7036877.31
PHOTOGRAPHY : 17805627.64
SPORTS : 3638640.14
TRAVEL_AND_LOCAL : 13984077.71
TOOLS : 10682301.03
PERSONALIZATION : 5201482.61
PRODUCTIVITY : 16787331.34
PARENTING : 542603.62
WEATHER : 5074486.2
VIDEO_PLAYERS : 24727872.45
NEWS_AND_MAGAZINES : 9549178.47
MAPS_AND_NAVIGATION : 4056941.77


In [46]:
highest_install

[['COMMUNICATION', 38456119.167247385]]

COMMUNICATION has the highest number of average installs