# Data Analysis on Andriod and iOS Mobile Apps
<br>
This project is to discover and learn trends from apps that are used in the Google Play and App store. Which apps are more popular, have most daily users, and spend the most money.

## Gathering Data

In [69]:
from csv import reader

In [70]:
# Pulling in the csv files for both iOS and Google
# Dataset infomation for iOS
# https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps
apple_file = open('AppleStore.csv')
google_file = open('googleplaystore.csv')

# Reading in both of those files
# Dataset information for Google Play
# https://www.kaggle.com/lava18/google-play-store-apps
read_apple_file = reader(apple_file)
read_google_file = reader(google_file)

# Creating a list of lists of all data
apple_apps_data = list(read_apple_file)
google_apps_data = list(read_google_file)

In [71]:
# Checking to see if data read in correctly
apple_apps_data[:3]

[['id',
  'track_name',
  'size_bytes',
  'currency',
  'price',
  'rating_count_tot',
  'rating_count_ver',
  'user_rating',
  'user_rating_ver',
  'ver',
  'cont_rating',
  'prime_genre',
  'sup_devices.num',
  'ipadSc_urls.num',
  'lang.num',
  'vpp_lic'],
 ['284882215',
  'Facebook',
  '389879808',
  'USD',
  '0.0',
  '2974676',
  '212',
  '3.5',
  '3.5',
  '95.0',
  '4+',
  'Social Networking',
  '37',
  '1',
  '29',
  '1'],
 ['389801252',
  'Instagram',
  '113954816',
  'USD',
  '0.0',
  '2161558',
  '1289',
  '4.5',
  '4.0',
  '10.23',
  '12+',
  'Photo & Video',
  '37',
  '0',
  '29',
  '1']]

In [72]:
# Checking to see if data read in correctly
google_apps_data[:3]

[['App',
  'Category',
  'Rating',
  'Reviews',
  'Size',
  'Installs',
  'Type',
  'Price',
  'Content Rating',
  'Genres',
  'Last Updated',
  'Current Ver',
  'Android Ver'],
 ['Photo Editor & Candy Camera & Grid & ScrapBook',
  'ART_AND_DESIGN',
  '4.1',
  '159',
  '19M',
  '10,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design',
  'January 7, 2018',
  '1.0.0',
  '4.0.3 and up'],
 ['Coloring book moana',
  'ART_AND_DESIGN',
  '3.9',
  '967',
  '14M',
  '500,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design;Pretend Play',
  'January 15, 2018',
  '2.0.0',
  '4.0.3 and up']]

In [73]:
# Function to work with data (Created by Dataquest)
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print('\n')
        
    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [74]:
explore_data(apple_apps_data, 1, 5, rows_and_columns=True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 7198
Number of columns: 16


In [75]:
explore_data(google_apps_data, 1, 5, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 10842
Number of columns: 13


In [76]:
# Column names for iOS data
print(apple_apps_data[0])

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


In [77]:
# Column names for Google data
print(google_apps_data[0])

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [78]:
print(len(google_apps_data[10473]))
print(google_apps_data[10473])

12
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [79]:
del google_apps_data[10473]

There are duplicate apps in the dataset. The code below is to discover how many duplicates there are and the names of the apps.

In [80]:
# Using code in lesson to check for duplicate Google Play apps

duplicate_google_apps = []
unique_google_apps = []

for app in google_apps_data:
    name = app[0]
    if name in unique_google_apps:
        duplicate_google_apps.append(name)
    else:
        unique_google_apps.append(name)
        
print('Number of duplicate Google Play apps:', len(duplicate_google_apps))
print('\n')
print('Examples of duplicate Google Play apps:', duplicate_google_apps[:15])

Number of duplicate Google Play apps: 1181


Examples of duplicate Google Play apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


In [81]:
# Using code in lesson to check for duplicate iOS apps

duplicate_apple_apps = []
unique_apple_apps = []

for app in apple_apps_data:
    name = app[1] # track_name index 1
    if name in unique_apple_apps:
        duplicate_apple_apps.append(name)
    else:
        unique_apple_apps.append(name)
        
print('Number of duplicate iOS apps:', len(duplicate_apple_apps))
print('\n')
print('Examples of duplicate iOS apps:', duplicate_apple_apps[:15])

Number of duplicate iOS apps: 2


Examples of duplicate iOS apps: ['Mannequin Challenge', 'VR Roller Coaster']


The duplicate rows will need to be deleted based on which row has the highest reviews. It is assumed that the app with the highest reviews will be the most recent and updated information for the app.

In [82]:
reviews_max = {}

# Looping through Google Play apps, not inluding header row
for app in google_apps_data[1:]:
    name = app[0]
    n_reviews = float(app[3])
    
    # Checking to see which duplicate has more ratings
    # If not in the dict, appending to the dict
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews
print(len(reviews_max))

9659


In [83]:
# Using the dictionary as a source to clean out duplicates
# Creating an empty list to have only one of each app
google_play_clean = []
already_added = []

for app in google_apps_data[1:]:
    name = app[0]
    n_reviews = float(app[3])
    
    if n_reviews == reviews_max[name] and name not in already_added:
        google_play_clean.append(app)
        already_added.append(name)
        

In [84]:
# Checking to see if the data was cleaned properly
print(len(google_play_clean))
print(google_play_clean[:5])

9659
[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'], ['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']]


In [85]:
# Function to remove non-english apps
# Using ASCII range to check for english characters
def check_english_char(string):
    
    # Non-english char counter
    char_check = 0
    
    for char in string:
        if ord(char) > 127:
            char_check += 1
            
            # If more than 3 char are not engish then it returns false
            if ord(char) > 127 and char_check > 3:
                return False
    return True

In [86]:
check_english_char('Instagram')

True

In [87]:
check_english_char('爱奇艺PPS -《欢乐颂2》电视剧热播')

False

In [88]:
check_english_char('Docs To Go™ Free Office Suite')

True

In [89]:
check_english_char('Instachat 😜')

True

In [90]:
# Removing Non-Enlgish apps from Google Play dataset
android_clean = []

for app in google_play_clean:
    
    if check_english_char(app[0]):
        android_clean.append(app)

In [91]:
print(len(android_clean))
print(android_clean[:5])

9614
[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'], ['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']]


In [92]:
# Removing Non-Enlgish apps from iOS dataset
apple_clean = []

for app in apple_apps_data:
    if check_english_char(app[1]):
        apple_clean.append(app)

In [93]:
print(len(apple_clean))
print(apple_clean[:5])

6184
[['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'], ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'], ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'], ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1'], ['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']]


### Making a list with just the free apps

Since the project is based around learning from data trends with English apps and Free apps. There is a need to remove all the apps that aren't free.

In [94]:
android_free_apps = []
ios_free_apps = []

for apps in apple_clean:
    
    if apps[4] == '0.0':
        ios_free_apps.append(apps)
        
for apps in android_clean:
    if apps[7] == '0':
        android_free_apps.append(apps)
        
print('Android Lenght: ', len(android_free_apps))
print('iOS Length: ', len(ios_free_apps))

Android Lenght:  8864
iOS Length:  3222


In [95]:
android_final = android_free_apps
ios_final = ios_free_apps

Since the end goal is to add a popular app on both Google Play and the iOS App Store we need to see which categories do best in both markets.

First, we will have to see what apps/genres are most popular for each market. This will give us a better view into what apps to think about developing that will work for both markets. 

In [96]:
ios_column_names = apple_apps_data[0]
android_column_names = google_apps_data[0]
print(ios_column_names)
print('\n')
print(android_column_names)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


Looking at the column names it seems like the best columns to use would be:
- iOS:
    - 'prime_genre' 
- Android:
    - 'Category'
    - 'Genres'

In [97]:
def freq_table(dataset, index):
    
    freq_dict = {}
    
    for app in dataset:
        if app[index] in freq_dict:
            freq_dict[app[index]] += 1
        else:
            freq_dict[app[index]] = 1
    
    for percentage in freq_dict:
        freq_dict[percentage] /= len(dataset)
        freq_dict[percentage] *= 100
        
    return freq_dict

def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)
        
    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

In [98]:
display_table(ios_final, 11)

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


Based on the initial percentages of the iOS App Store. The most common and popular apps are based around the `Games` genre. `Games` has about 58% of the usage. The runner-up is `Entertainment` at about 7.8%. There is a huge margin between the two genres. At a glance you can see that genres that are more based around extra-curricular things are more popular. `Games`, `Entertainment`, `Photo & Video`, `Social Networking`, and `Shopping` taking the top spots. `Education` is also in the top spots, but my guess is that the education apps could be close to game and entertainment.
If you were to make an app for iOS the best reccomendation would be to create a game based app. More along the lines of entertainment. 

In [99]:
display_table(android_final, 1) # Category column

FAMILY : 18.907942238267147
GAME : 9.724729241877256
TOOLS : 8.461191335740072
BUSINESS : 4.591606498194946
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.531137184115524
SPORTS : 3.395758122743682
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2378158844765346
HEALTH_AND_FITNESS : 3.0798736462093865
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.7937725631768955
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.1620036101083033
ENTERTAINMENT : 0.9589350180505415
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 

In [100]:
display_table(android_final, 9)

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.9250902527075

Android apps are a bit different than the iOS App Store. Immediately you can see that it is more productivity based. Just in the top 10 apps you can see that most of them are not entertainment based. `FAMILY`, `TOOLS`, `BUSINESS`, `PRODUCTIVITY`, `FINANCE`, `MEDICAL`, and `PERSONALIZATION` being largely productivity based.
`GAME`, `SPORTS` and `LIFESTYLE` largely entertainment based.
There is also not a huge margin between the genres like in the iOS table. `FAMILY` being the most popular at 18.9% and `GAMES` following at 9.7%. 
<br> 
Even when you look in the *Genres* category you see that the top 5 to 10 popular genres are largely productivity based. 
If you were to make an app for Google Play you would want to make one that is based on trying to increase productivity.

## Average number of user rating App Store

In [101]:
ios_genres = freq_table(ios_final, 11)

In [102]:
for genre in ios_genres:
    
    total = 0
    len_genre = 0
    
    for app in ios_final:
        genre_app = app[11]
        
        if genre_app == genre:
            user_ratings = float(app[5])
            total += user_ratings
            len_genre += 1
            
    average_user_rating = total / len_genre
    
    print(genre, ':', average_user_rating)

Social Networking : 71548.34905660378
Photo & Video : 28441.54375
Games : 22788.6696905016
Music : 57326.530303030304
Reference : 74942.11111111111
Health & Fitness : 23298.015384615384
Weather : 52279.892857142855
Utilities : 18684.456790123455
Travel : 28243.8
Shopping : 26919.690476190477
News : 21248.023255813954
Navigation : 86090.33333333333
Lifestyle : 16485.764705882353
Entertainment : 14029.830708661417
Food & Drink : 33333.92307692308
Sports : 23008.898550724636
Book : 39758.5
Finance : 31467.944444444445
Education : 7003.983050847458
Productivity : 21028.410714285714
Business : 7491.117647058823
Catalogs : 4004.0
Medical : 612.0


Based on the information from average reviews. It looks as though `Navigation` has the most reviews written about it. Then right after that is `Reference` and `Social Networking`.

In [103]:
android_categories = freq_table(android_final, 1)

In [105]:
n_installs = '1,000,000+'

In [109]:
n_installs.replace(',', '')

'1,000,000+'

In [111]:
for category in android_categories:
    
    total = 0
    len_category = 0
    
    for app in android_final:
        category_app = app[1]
        
        if category_app == category:
            n_installs = app[5].replace(',', '')
            n_installs = n_installs.replace('+', '')
            total += float(n_installs)
            len_category += 1
            
    average_category = total / len_category
    
    print(category, average_category)

ART_AND_DESIGN 1986335.0877192982
AUTO_AND_VEHICLES 647317.8170731707
BEAUTY 513151.88679245283
BOOKS_AND_REFERENCE 8767811.894736841
BUSINESS 1712290.1474201474
COMICS 817657.2727272727
COMMUNICATION 38456119.167247385
DATING 854028.8303030303
EDUCATION 1833495.145631068
ENTERTAINMENT 11640705.88235294
EVENTS 253542.22222222222
FINANCE 1387692.475609756
FOOD_AND_DRINK 1924897.7363636363
HEALTH_AND_FITNESS 4188821.9853479853
HOUSE_AND_HOME 1331540.5616438356
LIBRARIES_AND_DEMO 638503.734939759
LIFESTYLE 1437816.2687861272
GAME 15588015.603248259
FAMILY 3695641.8198090694
MEDICAL 120550.61980830671
SOCIAL 23253652.127118643
SHOPPING 7036877.311557789
PHOTOGRAPHY 17840110.40229885
SPORTS 3638640.1428571427
TRAVEL_AND_LOCAL 13984077.710144928
TOOLS 10801391.298666667
PERSONALIZATION 5201482.6122448975
PRODUCTIVITY 16787331.344927534
PARENTING 542603.6206896552
WEATHER 5074486.197183099
VIDEO_PLAYERS 24727872.452830188
NEWS_AND_MAGAZINES 9549178.467741935
MAPS_AND_NAVIGATION 4056941.774193

One recc