# What type of apps are likely to attract more users
Our goal for this project is to analyze data to help our developers understand what type of apps are likely to attract more users.

## About Data
A data set containing data about approximately 10,000 Android apps from Google Play; the data was collected in August 2018.
A data set containing data about approximately 7,000 iOS apps from the App Store; the data was collected in July 2017.

In [22]:
def open_dataset(file_name, header=True):
    """Read a CSV file into a list of rows (each row is a list of strings).

    Parameters:
        file_name: path of the CSV file to read (decoded as UTF-8).
        header: when True, the first row is treated as the header and
            returned separately.

    Returns:
        ``(rows, header_row)`` when ``header`` is True, otherwise the full
        list of rows including the first one.

    Raises:
        IndexError: if ``header`` is True and the file contains no rows.
    """
    from csv import reader

    # The context manager closes the file even if parsing fails;
    # newline='' lets the csv module handle line endings inside quoted
    # fields itself, as its documentation recommends.
    with open(file_name, encoding='utf8', newline='') as opened_file:
        data = list(reader(opened_file))

    if header:
        return data[1:], data[0]
    return data

In [23]:
def explore_data(dataset, start=0, end=3, rows_and_columns=True):
    """Print a slice of the dataset and, optionally, its dimensions.

    Parameters:
        dataset: list of rows (each row a list).
        start, end: slice bounds of the rows to print (``dataset[start:end]``).
        rows_and_columns: when True, also print the number of rows and the
            number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty dataset has no first row to measure; report 0 columns
        # instead of failing with IndexError.
        print('Number of columns:', len(dataset[0]) if dataset else 0)

In [25]:
# Load both CSV files; with the default header=True, open_dataset returns
# the data rows and the header row as a pair.
android_data, android_header = open_dataset('googleplaystore.csv')
ios_data, ios_header = open_dataset('AppleStore.csv')

## Brief data exploration

In [26]:
# Show the Android column names, then the first three rows and the dataset size.
print(android_header, '\n')
explore_data(android_data)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'] 

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13


In [27]:
# Show the iOS column names, then the first three rows and the dataset size.
print(ios_header, '\n')
explore_data(ios_data)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'] 

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7197
Number of columns: 16


## Data preparation

### Finding malformed data entries

In [28]:
# Report every Android row whose field count differs from the header's.
# enumerate() yields the row's own position; list.index(row) would rescan
# the list and return the first *equal* row, which is wrong for duplicates.
for row_index, row in enumerate(android_data):
    if len(row) != len(android_header):
        print(row)
        print(row_index)

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10472


In [30]:
# Remove rows whose field count does not match the header. Filtering by
# the actual defect (instead of deleting a hard-coded index) makes this
# cell safe to re-run: a second run removes nothing rather than deleting
# whatever valid row now sits at that position.
android_data[:] = [row for row in android_data if len(row) == len(android_header)]

In [29]:
# Report every iOS row whose field count differs from the header's.
# enumerate() yields the row's own position; list.index(row) would rescan
# the list and return the first *equal* row, which is wrong for duplicates.
for row_index, row in enumerate(ios_data):
    if len(row) != len(ios_header):
        print(row)
        print(row_index)

### Checking for duplicates

In [33]:
# Split Android app names into first occurrences and repeats.
duplicate_apps_android = []
unique_apps_android = []
# Companion set for O(1) membership tests; checking the list itself made
# the loop quadratic in the number of apps.
seen_android_names = set()

for app in android_data:
    app_name = app[0]
    if app_name in seen_android_names:
        duplicate_apps_android.append(app_name)
    else:
        seen_android_names.add(app_name)
        unique_apps_android.append(app_name)

print('Number of duplicates: ', len(duplicate_apps_android), '\n')
print('Examples of duplicates: ', duplicate_apps_android[:10])

Number of duplicates:  1181 

Examples of duplicates:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


In [34]:
# Split iOS app names (column 1, 'track_name') into first occurrences and repeats.
duplicate_apps_ios = []
unique_apps_ios = []
# Companion set for O(1) membership tests; checking the list itself made
# the loop quadratic in the number of apps.
seen_ios_names = set()

for app in ios_data:
    app_name = app[1]
    if app_name in seen_ios_names:
        duplicate_apps_ios.append(app_name)
    else:
        seen_ios_names.add(app_name)
        unique_apps_ios.append(app_name)

print('Number of duplicates: ', len(duplicate_apps_ios), '\n')
print('Examples of duplicates: ', duplicate_apps_ios[:10])

Number of duplicates:  2 

Examples of duplicates:  ['Mannequin Challenge', 'VR Roller Coaster']


In [38]:
# Print the full rows of the two app names reported as duplicates so they
# can be compared side by side.
names_to_inspect = ('Mannequin Challenge', 'VR Roller Coaster')

print(ios_header, '\n')
for app in ios_data:
    if app[1] in names_to_inspect:
        print(app)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'] 

['1173990889', 'Mannequin Challenge', '109705216', 'USD', '0.0', '668', '87', '3.0', '3.0', '1.4', '9+', 'Games', '37', '4', '1', '1']
['952877179', 'VR Roller Coaster', '169523200', 'USD', '0.0', '107', '102', '3.5', '3.5', '2.0.0', '4+', 'Games', '37', '5', '1', '1']
['1178454060', 'Mannequin Challenge', '59572224', 'USD', '0.0', '105', '58', '4.0', '4.5', '1.0.1', '4+', 'Games', '38', '5', '1', '1']
['1089824278', 'VR Roller Coaster', '240964608', 'USD', '0.0', '67', '44', '3.5', '4.0', '0.81', '4+', 'Games', '38', '0', '1', '1']


#### Deleting duplicates
The higher the number of reviews, the more recent the data should be. We'll only keep the row with the highest number of reviews and remove the other entries for any given app.

In [36]:
# Map each Android app name to the highest review count seen for it.
reviews_max_android = {}
for app in android_data:
    name, n_reviews = app[0], float(app[3])
    # Record the count for a new name, or replace a strictly smaller one.
    if name not in reviews_max_android or reviews_max_android[name] < n_reviews:
        reviews_max_android[name] = n_reviews

# Sanity check: one dictionary entry per distinct app name.
print('Expected length: ', len(android_data) -  len(duplicate_apps_android))
print('Actual length of names: ', len(reviews_max_android))

Expected length:  9659
Actual length of names:  9659


In [39]:
# Keep exactly one row per Android app: the first row whose review count
# equals the maximum recorded for that name in reviews_max_android.
android_clean = []
already_added = []
# Companion set for O(1) "already kept?" checks; testing membership on the
# list made this loop quadratic in the number of apps.
added_names = set()

for app in android_data:
    name = app[0]
    n_reviews = float(app[3])
    # The name check is needed because several rows of the same app can
    # share the maximum review count.
    if n_reviews == reviews_max_android[name] and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)

explore_data(android_clean)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9659
Number of columns: 13


### Deleting non-English apps
We use English for the apps we develop at our company, and we'd like to analyze only the apps that are directed toward an English-speaking audience

In [42]:
# all English apps with up to three emoji or other special characters will still be labeled as English
def isEnglish(name, max_non_ascii=3):
    """Return True if ``name`` has at most ``max_non_ascii`` non-ASCII characters.

    Parameters:
        name: the app name to check.
        max_non_ascii: how many characters with a code point above 127 are
            tolerated (default 3, so names with a few emoji or symbols such
            as a trademark sign still count as English).

    Returns:
        True when the number of non-ASCII characters is within the limit,
        False otherwise.
    """
    non_ascii_count = 0
    for letter in name:
        if ord(letter) > 127:
            non_ascii_count += 1
            # Stop as soon as the limit is exceeded; the rest of the name
            # cannot change the answer.
            if non_ascii_count > max_non_ascii:
                return False
    return True

In [44]:
# Keep only the Android apps whose name (column 0) passes the isEnglish check.
android_clean_en = [app for app in android_clean if isEnglish(app[0])]

explore_data(android_clean_en)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9614
Number of columns: 13


In [45]:
# Keep only the iOS apps whose name (column 1) passes the isEnglish check.
ios_clean_en = [app for app in ios_data if isEnglish(app[1])]

explore_data(ios_clean_en)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 6183
Number of columns: 16


### Deleting non-Free apps
We only build apps that are free to download and install, and our main source of revenue consists of in-app ads

In [48]:
# Keep only the Android apps whose price field (column 7) is exactly '0'.
android_clean_en_free = [app for app in android_clean_en if app[7] == '0']

explore_data(android_clean_en_free)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 8864
Number of columns: 13


In [49]:
# Keep only the iOS apps whose price field (column 4) is exactly '0.0'.
ios_clean_en_free = [app for app in ios_clean_en if app[4] == '0.0']

explore_data(ios_clean_en_free)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 3222
Number of columns: 16


## Data Analysis
*Our aim is to determine the kinds of apps that are likely to attract more users because our revenue is highly influenced by the number of people using our apps*

Our validation strategy for an app idea is comprised of three steps:

1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

In [54]:
# Short aliases for the cleaned datasets used throughout the analysis.
# These are references to the same lists, not copies.
and_dt = android_clean_en_free
ios_dt = ios_clean_en_free

### What are the most common genres in each market

In [52]:
def freq_table(dataset, index):
    """Count how often each value appears in column ``index`` of ``dataset``.

    Returns a dict mapping each distinct value to its number of
    occurrences, with keys in order of first appearance.
    """
    counts = {}
    for record in dataset:
        value = record[index]
        counts[value] = counts.get(value, 0) + 1
    return counts
    

In [58]:
def display_table(dataset, index):
    """Print the frequency table of column ``index``, most frequent first.

    Each line has the form ``value : count``. Entries with equal counts
    are ordered by value, descending, because whole (count, value) pairs
    are compared.
    """
    counts = freq_table(dataset, index)
    # Put the count first so that sorting the pairs orders by frequency.
    pairs = [(count, value) for value, count in counts.items()]
    pairs.sort(reverse=True)
    for count, value in pairs:
        print(value, ':', count)
        
# Look the columns up by header name so each label matches the data shown.
# The labels used to be swapped: column 1 is 'Category' (FAMILY, GAME, ...)
# and column 9 is 'Genres' (Tools, Entertainment, ...).
print('Category for Android:')
display_table(and_dt, android_header.index('Category'))
print('\n', 'Genres for Android:')
display_table(and_dt, android_header.index('Genres'))
print('\n', 'prime_genre for iOS:')
display_table(ios_dt, ios_header.index('prime_genre'))

Genres for Android:
FAMILY : 1676
GAME : 862
TOOLS : 750
BUSINESS : 407
LIFESTYLE : 346
PRODUCTIVITY : 345
FINANCE : 328
MEDICAL : 313
SPORTS : 301
PERSONALIZATION : 294
COMMUNICATION : 287
HEALTH_AND_FITNESS : 273
PHOTOGRAPHY : 261
NEWS_AND_MAGAZINES : 248
SOCIAL : 236
TRAVEL_AND_LOCAL : 207
SHOPPING : 199
BOOKS_AND_REFERENCE : 190
DATING : 165
VIDEO_PLAYERS : 159
MAPS_AND_NAVIGATION : 124
FOOD_AND_DRINK : 110
EDUCATION : 103
ENTERTAINMENT : 85
LIBRARIES_AND_DEMO : 83
AUTO_AND_VEHICLES : 82
HOUSE_AND_HOME : 73
WEATHER : 71
EVENTS : 63
PARENTING : 58
ART_AND_DESIGN : 57
COMICS : 55
BEAUTY : 53

 Category for Android:
Tools : 749
Entertainment : 538
Education : 474
Business : 407
Productivity : 345
Lifestyle : 345
Finance : 328
Medical : 313
Sports : 307
Personalization : 294
Communication : 287
Action : 275
Health & Fitness : 273
Photography : 261
News & Magazines : 248
Social : 236
Travel & Local : 206
Shopping : 199
Books & Reference : 190
Simulation : 181
Dating : 165
Arcade : 164
V

#### Calculating the average number of user ratings per app genre on the App Store

##### iOS

In [59]:
# Number of free English iOS apps per value of column 11 ('prime_genre').
fr_table = freq_table(ios_dt, 11)
print(fr_table)

{'Social Networking': 106, 'Photo & Video': 160, 'Games': 1874, 'Music': 66, 'Reference': 18, 'Health & Fitness': 65, 'Weather': 28, 'Utilities': 81, 'Travel': 40, 'Shopping': 84, 'News': 43, 'Navigation': 6, 'Lifestyle': 51, 'Entertainment': 254, 'Food & Drink': 26, 'Sports': 69, 'Book': 14, 'Finance': 36, 'Education': 118, 'Productivity': 56, 'Business': 17, 'Catalogs': 4, 'Medical': 6}


In [73]:
# Average number of user ratings (column 5, 'rating_count_tot') per iOS
# genre (column 11, 'prime_genre'), printed from highest to lowest.

# One pass over the data accumulates each genre's total and app count,
# instead of rescanning the whole dataset once per genre.
ratings_total_by_genre = {}
apps_by_genre = {}
for row in ios_dt:
    genre_app = row[11]
    # The sum of user ratings counts ratings given, not the rating values.
    ratings_total_by_genre[genre_app] = ratings_total_by_genre.get(genre_app, 0) + float(row[5])
    apps_by_genre[genre_app] = apps_by_genre.get(genre_app, 0) + 1

res_ios = []
avg_n_ratings_l = []
for genre in ratings_total_by_genre:
    avg_n_ratings = ratings_total_by_genre[genre] / apps_by_genre[genre]
    res_ios.append(genre + ': ' + str(avg_n_ratings))
    avg_n_ratings_l.append(avg_n_ratings)

# Sort the (average, label) pairs directly. Matching a sorted average back
# to its label with a substring test printed extra lines when one
# average's text occurred inside another line, and printed tied genres twice.
ranked_ios = sorted(zip(avg_n_ratings_l, res_ios), key=lambda pair: pair[0], reverse=True)
for _, genre_line in ranked_ios:
    print(genre_line)

avg_n_ratings_l = sorted(avg_n_ratings_l, reverse=True)



Navigation: 86090.33333333333
Reference: 74942.11111111111
Social Networking: 71548.34905660378
Music: 57326.530303030304
Weather: 52279.892857142855
Book: 39758.5
Food & Drink: 33333.92307692308
Finance: 31467.944444444445
Photo & Video: 28441.54375
Travel: 28243.8
Shopping: 26919.690476190477
Health & Fitness: 23298.015384615384
Sports: 23008.898550724636
Games: 22788.6696905016
News: 21248.023255813954
Productivity: 21028.410714285714
Utilities: 18684.456790123455
Lifestyle: 16485.764705882353
Entertainment: 14029.830708661417
Business: 7491.117647058823
Education: 7003.983050847458
Catalogs: 4004.0
Medical: 612.0


##### Android

In [61]:
# Number of free English Android apps per value of the 'Category' column.
fr_table_and = freq_table(and_dt, android_header.index('Category'))
print(fr_table_and)

{'ART_AND_DESIGN': 57, 'AUTO_AND_VEHICLES': 82, 'BEAUTY': 53, 'BOOKS_AND_REFERENCE': 190, 'BUSINESS': 407, 'COMICS': 55, 'COMMUNICATION': 287, 'DATING': 165, 'EDUCATION': 103, 'ENTERTAINMENT': 85, 'EVENTS': 63, 'FINANCE': 328, 'FOOD_AND_DRINK': 110, 'HEALTH_AND_FITNESS': 273, 'HOUSE_AND_HOME': 73, 'LIBRARIES_AND_DEMO': 83, 'LIFESTYLE': 346, 'GAME': 862, 'FAMILY': 1676, 'MEDICAL': 313, 'SOCIAL': 236, 'SHOPPING': 199, 'PHOTOGRAPHY': 261, 'SPORTS': 301, 'TRAVEL_AND_LOCAL': 207, 'TOOLS': 750, 'PERSONALIZATION': 294, 'PRODUCTIVITY': 345, 'PARENTING': 58, 'WEATHER': 71, 'VIDEO_PLAYERS': 159, 'NEWS_AND_MAGAZINES': 248, 'MAPS_AND_NAVIGATION': 124}


In [74]:
# Average number of installs (column 5, 'Installs') per Android category
# (column 1, 'Category'), printed from highest to lowest.

# One pass over the data accumulates each category's total and app count,
# instead of rescanning the whole dataset once per category.
installs_total_by_category = {}
apps_by_category = {}
for row in and_dt:
    category_app = row[1]
    # Install counts look like '10,000+'; strip the decoration before converting.
    installs = float(row[5].replace('+', '').replace(',', ''))
    installs_total_by_category[category_app] = installs_total_by_category.get(category_app, 0) + installs
    apps_by_category[category_app] = apps_by_category.get(category_app, 0) + 1

res_and = []
avg_n_installs_l = []
for category in installs_total_by_category:
    avg_n_installs = installs_total_by_category[category] / apps_by_category[category]
    res_and.append(category + ': ' + str(avg_n_installs))
    avg_n_installs_l.append(avg_n_installs)

# Sort the (average, label) pairs directly. Matching a sorted average back
# to its label with a substring test printed extra lines when one
# average's text occurred inside another line, and printed tied categories twice.
ranked_and = sorted(zip(avg_n_installs_l, res_and), key=lambda pair: pair[0], reverse=True)
for _, category_line in ranked_and:
    print(category_line)

avg_n_installs_l = sorted(avg_n_installs_l, reverse=True)
            

COMMUNICATION: 38456119.167247385
VIDEO_PLAYERS: 24727872.452830188
SOCIAL: 23253652.127118643
PHOTOGRAPHY: 17840110.40229885
PRODUCTIVITY: 16787331.344927534
GAME: 15588015.603248259
TRAVEL_AND_LOCAL: 13984077.710144928
ENTERTAINMENT: 11640705.88235294
TOOLS: 10801391.298666667
NEWS_AND_MAGAZINES: 9549178.467741935
BOOKS_AND_REFERENCE: 8767811.894736841
SHOPPING: 7036877.311557789
PERSONALIZATION: 5201482.6122448975
WEATHER: 5074486.197183099
HEALTH_AND_FITNESS: 4188821.9853479853
MAPS_AND_NAVIGATION: 4056941.7741935486
FAMILY: 3695641.8198090694
SPORTS: 3638640.1428571427
ART_AND_DESIGN: 1986335.0877192982
FOOD_AND_DRINK: 1924897.7363636363
EDUCATION: 1833495.145631068
BUSINESS: 1712290.1474201474
LIFESTYLE: 1437816.2687861272
FINANCE: 1387692.475609756
HOUSE_AND_HOME: 1331540.5616438356
DATING: 854028.8303030303
COMICS: 817657.2727272727
AUTO_AND_VEHICLES: 647317.8170731707
LIBRARIES_AND_DEMO: 638503.734939759
PARENTING: 542603.6206896552
BEAUTY: 513151.88679245283
EVENTS: 253542.22

## Conclusion
The Social Networking genre (on both platforms) and the Books genre (especially on Android) are both successful on the two platforms.