# Mobile App Data Analysis
What type of apps are likely to attract more users?

## Opening and exploring the Data ##

In [1]:
from csv import reader
# Read each CSV fully into memory as a list of rows; row 0 of each list is
# the header. The `with` blocks close the file handles as soon as the rows
# are materialised (the bare `open(...)` calls used before never closed them).
# `newline=''` is what the csv module asks for when handing it a file object,
# and the explicit encoding keeps non-ASCII app names readable regardless of
# the platform's default encoding.
with open('AppleStore.csv', encoding='utf8', newline='') as applestore_file:
    applestore_data = list(reader(applestore_file))
with open('googleplaystore.csv', encoding='utf8', newline='') as googleplay_file:
    googleplay_data = list(reader(googleplay_file))

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the dataset size.

    Each printed row is followed by two empty lines. When
    ``rows_and_columns`` is true, the total number of rows and the number of
    columns (taken from the first row) are printed afterwards.
    """
    for record in dataset[start:end]:
        # One newline ends the row itself, the other two leave the blank gap.
        print(record, end='\n\n\n')

    if not rows_and_columns:
        return

    print('Number of rows:', len(dataset))
    print('Number of columns:', len(dataset[0]))

In [3]:
# Show the first four entries of the App Store data (header row included),
# followed by the row and column counts.
explore_data(applestore_data, 0, 4, True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows: 7198
Number of columns: 16


Useful columns: 'track_name', 'currency', 'price', 'rating_count_tot', 'user_rating', 'prime_genre'

In [4]:
# Show the first four entries of the Google Play data (header row included),
# followed by the row and column counts.
explore_data(googleplay_data, 0, 4, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 10842
Number of columns: 13


Useful columns: 'App', 'Category', 'Rating', 'Reviews','Type', 'Price', 'Genres'

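## Deleting Wrong Data ##

The entry at index 10473 of the Google Play data (index 0 is the header) has only 12 fields instead of 13: its `Category` value is missing, so every following value sits one column too far to the left. We print the row to confirm this and then delete it.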

In [5]:
# Inspect the suspicious entry; index 0 of googleplay_data is the header row.
print(googleplay_data[10473])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [6]:
# Remove the malformed entry at index 10473. The row is only deleted while it
# has a different number of fields than the header, so running this cell a
# second time does not silently drop the valid row that moved into its place.
if len(googleplay_data[10473]) != len(googleplay_data[0]):
    del googleplay_data[10473]

## Removing duplicate Entries ##
Checking for duplicate entries in Google Play data:

In [7]:
# Split app names into first occurrences (`unique`) and repeats (`duplicate`).
# `seen_names` mirrors `unique` as a set so each membership test is O(1)
# instead of scanning the whole list for every row.
duplicate = []
unique = []
seen_names = set()
for app in googleplay_data[1:]:  # skip the header row
    name = app[0]
    if name in seen_names:
        duplicate.append(name)
    else:
        seen_names.add(name)
        unique.append(name)
print('Number of duplicate apps: ', len(duplicate))
print('Exampe of duplicate apps: ', duplicate[:5])

Number of duplicate apps:  1181
Exampe of duplicate apps:  ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings']


Print some duplicate rows:

In [8]:
# Print every data row whose app name matches one known duplicate.
for row in googleplay_data[1:]:
    if row[0] != 'Quick PDF Scanner + OCR FREE':
        continue
    print(row)

['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80805', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']
['Quick PDF Scanner + OCR FREE', 'BUSINESS', '4.2', '80804', 'Varies with device', '5,000,000+', 'Free', '0', 'Everyone', 'Business', 'February 26, 2018', 'Varies with device', '4.0.3 and up']


The higher the number of reviews, the more recent the data should be. Rather than removing duplicates randomly, we'll only keep the row with the highest number of reviews and remove the other entries for any given app.

In [9]:
# Map each app name to the largest review count seen for it across all rows.
reviews_max = {}
for row in googleplay_data[1:]:  # skip the header row
    app_name, review_count = row[0], float(row[3])
    # Store the count for a new name, or replace a smaller stored count.
    if app_name not in reviews_max or reviews_max[app_name] < review_count:
        reviews_max[app_name] = review_count

In [10]:
# Number of distinct app names found in the Google Play data.
len(reviews_max)

9659

In [11]:
# Keep exactly one row per app: the first row whose review count equals the
# maximum recorded in `reviews_max`. Checking the name as well is needed
# because several rows of one app can share that maximum review count.
# `added_names` mirrors `already_added` as a set so the membership test is
# O(1) rather than a scan of the list for every row.
android_clean = []
already_added = []
added_names = set()
for app in googleplay_data[1:]:  # skip the header row
    name = app[0]
    n_reviews = float(app[3])
    if n_reviews == reviews_max[name] and name not in added_names:
        android_clean.append(app)
        already_added.append(name)
        added_names.add(name)

In [12]:
# Row count after de-duplication; it should equal len(reviews_max).
len(android_clean)

9659

In [13]:
# Peek at the first 20 de-duplicated rows (printed as one long nested list).
print(android_clean[:20])

[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'], ['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up'], ['Smoke Effect Photo Maker - Smoke Editor', 'ART_AND_DESIGN', '3.8', '178', '19M', '50,000+', '

## Removing Non-English Apps ##

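The function below treats a string as English when it contains at most three characters outside the ASCII range (code point above 127), so that names with a few symbols or emoji are not discarded: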

In [14]:
def is_english(string):
    """Return True when ``string`` has at most three non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.
    """
    non_ascii_total = sum(1 for ch in string if ord(ch) > 127)
    return non_ascii_total <= 3

# Try the check on four sample names and print one result per line.
for sample_name in (
    'Instagram',
    '爱奇艺PPS -《欢乐颂2》电视剧热播',
    'Docs To Go™ Free Office Suite',
    'Instachat 😜',
):
    print(is_english(sample_name))

True
False
True
True


Filter out non-English apps:

In [15]:
# Google Play: the app name is column 0.
android_clean_eng = [row for row in android_clean if is_english(row[0])]
print(len(android_clean_eng))

# App Store: the app name (track_name) is column 1; skip the header row.
ios_eng = [row for row in applestore_data[1:] if is_english(row[1])]
print(len(ios_eng))

9614
6183


## Isolating free Apps ##

In [16]:
# Google Play stores the price in column 7, where free apps are listed as '0'.
android_free = [row for row in android_clean_eng if row[7] == '0']
# The App Store stores the price in column 4, where free apps are listed as '0.0'.
ios_free = [row for row in ios_eng if row[4] == '0.0']

print('Number of android apps: ', len(android_free))
print('Number of ios apps: ', len(ios_free))

Number of android apps:  8864
Number of ios apps:  3222


## Most common apps by genre ##

As we mentioned in the introduction, our aim is to determine the kinds of apps that are likely to attract more users because our revenue is highly influenced by the number of people using our apps.

To minimize risks and overhead, our validation strategy for an app idea is comprised of three steps:

1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

Because our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful on both markets. For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.

We'll build a frequency table for the prime_genre column of the App Store data set, and the Genres and Category columns of the Google Play data set.

Function for creating a frequency table (percentages):

In [17]:
def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    ``dataset`` is iterated once; ``row[index]`` is the value counted for each
    row. The result maps every distinct value to its share of all rows, as a
    percentage rounded to two decimals, in order of first appearance.
    """
    counts = {}
    total_rows = 0
    for total_rows, record in enumerate(dataset, start=1):
        key = record[index]
        counts[key] = counts.get(key, 0) + 1

    return {
        key: round(occurrences / total_rows * 100, 2)
        for key, occurrences in counts.items()
    }

Function for displaying table entries in descending order:

In [18]:
def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    The table comes from ``freq_table(dataset, index)``. Entries are sorted
    as ``(percentage, value)`` tuples in descending order and printed as
    ``value : percentage``.
    """
    percentages = freq_table(dataset, index)
    ranked = sorted(
        ((share, label) for label, share in percentages.items()),
        reverse=True,
    )
    for share, label in ranked:
        print(label, ':', share)

Display the frequency table of prime_genre:

In [19]:
# Column 11 of the App Store data is 'prime_genre'.
display_table(ios_free, 11)

Games : 58.16
Entertainment : 7.88
Photo & Video : 4.97
Education : 3.66
Social Networking : 3.29
Shopping : 2.61
Utilities : 2.51
Sports : 2.14
Music : 2.05
Health & Fitness : 2.02
Productivity : 1.74
Lifestyle : 1.58
News : 1.33
Travel : 1.24
Finance : 1.12
Weather : 0.87
Food & Drink : 0.81
Reference : 0.56
Business : 0.53
Book : 0.43
Navigation : 0.19
Medical : 0.19
Catalogs : 0.12


Display the frequency table of Category (column 1 of the Google Play data):

In [20]:
# Column 1 of the Google Play data is 'Category'.
display_table(android_free, 1)

FAMILY : 18.91
GAME : 9.72
TOOLS : 8.46
BUSINESS : 4.59
LIFESTYLE : 3.9
PRODUCTIVITY : 3.89
FINANCE : 3.7
MEDICAL : 3.53
SPORTS : 3.4
PERSONALIZATION : 3.32
COMMUNICATION : 3.24
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.94
NEWS_AND_MAGAZINES : 2.8
SOCIAL : 2.66
TRAVEL_AND_LOCAL : 2.34
SHOPPING : 2.25
BOOKS_AND_REFERENCE : 2.14
DATING : 1.86
VIDEO_PLAYERS : 1.79
MAPS_AND_NAVIGATION : 1.4
FOOD_AND_DRINK : 1.24
EDUCATION : 1.16
ENTERTAINMENT : 0.96
LIBRARIES_AND_DEMO : 0.94
AUTO_AND_VEHICLES : 0.93
HOUSE_AND_HOME : 0.82
WEATHER : 0.8
EVENTS : 0.71
PARENTING : 0.65
ART_AND_DESIGN : 0.64
COMICS : 0.62
BEAUTY : 0.6


Display the frequency table of Genres (column 9 of the Google Play data):

In [21]:
# Column 9 of the Google Play data is 'Genres'.
display_table(android_free, 9)

Tools : 8.45
Entertainment : 6.07
Education : 5.35
Business : 4.59
Productivity : 3.89
Lifestyle : 3.89
Finance : 3.7
Medical : 3.53
Sports : 3.46
Personalization : 3.32
Communication : 3.24
Action : 3.1
Health & Fitness : 3.08
Photography : 2.94
News & Magazines : 2.8
Social : 2.66
Travel & Local : 2.32
Shopping : 2.25
Books & Reference : 2.14
Simulation : 2.04
Dating : 1.86
Arcade : 1.85
Video Players & Editors : 1.77
Casual : 1.76
Maps & Navigation : 1.4
Food & Drink : 1.24
Puzzle : 1.13
Racing : 0.99
Role Playing : 0.94
Libraries & Demo : 0.94
Auto & Vehicles : 0.93
Strategy : 0.91
House & Home : 0.82
Weather : 0.8
Events : 0.71
Adventure : 0.68
Comics : 0.61
Beauty : 0.6
Art & Design : 0.6
Parenting : 0.5
Card : 0.45
Casino : 0.43
Trivia : 0.42
Educational;Education : 0.39
Board : 0.38
Educational : 0.37
Education;Education : 0.34
Word : 0.26
Casual;Pretend Play : 0.24
Music : 0.2
Racing;Action & Adventure : 0.17
Puzzle;Brain Games : 0.17
Entertainment;Music & Video : 0.17
Casual;

## Most popular apps by Genre on Apple Store ##

In [23]:
# For every App Store genre, print the mean of rating_count_tot (column 5)
# over the free apps whose prime_genre (column 11) is that genre.
prime_genre = freq_table(ios_free, 11)
for genre in prime_genre:
    rating_counts = [float(row[5]) for row in ios_free if row[11] == genre]
    total = 0
    for rating_count in rating_counts:
        total += rating_count
    len_genre = len(rating_counts)
    print(genre, ': ', total / len_genre)

Food & Drink :  33333.92307692308
Photo & Video :  28441.54375
Business :  7491.117647058823
Sports :  23008.898550724636
Utilities :  18684.456790123455
News :  21248.023255813954
Lifestyle :  16485.764705882353
Education :  7003.983050847458
Productivity :  21028.410714285714
Catalogs :  4004.0
Games :  22788.6696905016
Music :  57326.530303030304
Weather :  52279.892857142855
Social Networking :  71548.34905660378
Entertainment :  14029.830708661417
Finance :  31467.944444444445
Medical :  612.0
Health & Fitness :  23298.015384615384
Navigation :  86090.33333333333
Shopping :  26919.690476190477
Travel :  28243.8
Reference :  74942.11111111111
Book :  39758.5


## Most popular apps by Genre on Google Play ##

In [26]:
# For every Google Play category (column 1), print the mean install count of
# its free apps. Installs (column 5) are strings such as '10,000+', so the
# thousands separators and the trailing '+' are removed before conversion.
category_table = freq_table(android_free, 1)
for category in category_table:
    install_counts = [
        float(row[5].replace(',', '').replace('+', ''))
        for row in android_free
        if row[1] == category
    ]
    total = 0
    for install_count in install_counts:
        total += install_count
    len_category = len(install_counts)
    print(category, ': ', total / len_category)

BOOKS_AND_REFERENCE :  8767811.894736841
SOCIAL :  23253652.127118643
PERSONALIZATION :  5201482.6122448975
FINANCE :  1387692.475609756
EVENTS :  253542.22222222222
TRAVEL_AND_LOCAL :  13984077.710144928
ENTERTAINMENT :  11640705.88235294
VIDEO_PLAYERS :  24727872.452830188
SPORTS :  3638640.1428571427
LIFESTYLE :  1437816.2687861272
MEDICAL :  120550.61980830671
PHOTOGRAPHY :  17840110.40229885
ART_AND_DESIGN :  1986335.0877192982
TOOLS :  10801391.298666667
SHOPPING :  7036877.311557789
COMICS :  817657.2727272727
MAPS_AND_NAVIGATION :  4056941.7741935486
AUTO_AND_VEHICLES :  647317.8170731707
WEATHER :  5074486.197183099
FOOD_AND_DRINK :  1924897.7363636363
BUSINESS :  1712290.1474201474
FAMILY :  3695641.8198090694
NEWS_AND_MAGAZINES :  9549178.467741935
PARENTING :  542603.6206896552
HOUSE_AND_HOME :  1331540.5616438356
GAME :  15588015.603248259
EDUCATION :  1833495.145631068
PRODUCTIVITY :  16787331.344927534
DATING :  854028.8303030303
HEALTH_AND_FITNESS :  4188821.9853479853
