# Profitable App Profiles for the App Store and Google Play Markets

Our goal for this project is to analyze data to help our developers understand what types of apps are likely to attract more users.

Opening the file

In [1]:
import csv
from csv import reader

# Read both CSV files fully into memory as lists of rows (the first row of
# each list is the header). The `with` blocks close each file handle as soon
# as its rows have been read, instead of leaving it open for the whole session.
with open('AppleStore.csv', encoding='utf8') as apple_csv:
    apple_file = list(reader(apple_csv))
with open('googleplaystore.csv', encoding='utf8') as google_csv:
    google_file = list(reader(google_csv))



## Exploring the data and finding the number of rows and columns

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False, header = True):
    """Print a slice of a dataset and, optionally, its dimensions.

    Args:
        dataset: list of rows, each row itself a list.
        start, end: slice bounds of the rows to print (end is exclusive).
        rows_and_columns: when true, also print the row and column counts.
        header: when true, the first row is treated as a header and is not
            included in the printed row count.

    Returns:
        None; all output goes to stdout.
    """
    for record in dataset[start:end]:
        print(record)
        print('\n')  # leaves empty lines between printed rows

    if not rows_and_columns:
        return

    # Only the row count depends on whether a header row is present.
    data_rows = dataset[1:] if header else dataset
    print('Number of rows:', len(data_rows))
    print('Number of columns:', len(dataset[0]))
        
# explore_data prints its own output and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line.
explore_data(google_file, 0, 3, True)

explore_data(apple_file, 0, 3, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10841
Number of columns: 13
None
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161

## Checking for wrong values in the Data

In [3]:
# The header row defines the expected number of columns for each dataset.
apple_len = len(apple_file[0])
google_len = len(google_file[0])
print(apple_len,google_len)

# Print every App Store row whose length differs from the header's.
for row in apple_file:
    if len(row) != apple_len:
        print(row)

# Same check for Google Play, also printing the position of each bad row.
# enumerate yields the position of the row actually being visited; list.index
# would rescan the list and return the first *equal* row instead.
for position, row in enumerate(google_file):
    if len(row) != google_len:
        print(row)
        print(position)

16 13
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10473


Removing wrong values from the data

In [4]:
# Row 10473 is the Google Play row reported by the row-length check as having
# fewer columns than the header. Delete it only while it is still malformed,
# so re-running this cell cannot remove a valid row that has shifted into
# this position.
if len(google_file[10473]) != len(google_file[0]):
    del google_file[10473]

print (google_file[10473])


['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']


## Removing duplicate rows

In [5]:
# Drop exact duplicate rows while keeping the first occurrence and the
# original order. Membership is tested against a set of row tuples, so each
# lookup is constant time instead of a scan of the growing result list.
unique_apple = []
seen_apple_rows = set()

for row in apple_file:
    row_key = tuple(row)  # lists are unhashable; the tuple compares equal element-wise
    if row_key not in seen_apple_rows:
        seen_apple_rows.add(row_key)
        unique_apple.append(row)

# explore_data prints its own output and returns None, so it is not wrapped in print().
explore_data(unique_apple, 0, 3, True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7197
Number of columns: 16
None


In [6]:
# Drop exact duplicate rows while keeping the first occurrence and the
# original order. Membership is tested against a set of row tuples, so each
# lookup is constant time instead of a scan of the growing result list.
unique_google = []
seen_google_rows = set()

for row in google_file:
    row_key = tuple(row)  # lists are unhashable; the tuple compares equal element-wise
    if row_key not in seen_google_rows:
        seen_google_rows.add(row_key)
        unique_google.append(row)

# explore_data prints its own output and returns None, so it is not wrapped in print().
explore_data(unique_google, 0, 3, True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 10357
Number of columns: 13
None


### Removing duplicate entries for apps that appear more than once

In [7]:
def reviews(string):
    """Convert a count string such as '10,000+' to a float.

    Every '+' and ',' character is removed before the conversion; any other
    non-numeric content makes float() raise ValueError.
    """
    digits_only = string.replace("+", "").replace(",", "")
    return float(digits_only)

In [8]:
# Keep one row per app name (column 0). When a name appears more than once,
# keep the row with the highest review count (column 3); on a tie the row
# seen first is kept because the comparison is strict.
clean_google = {}

for row in unique_google[1:]:
    app_name = row[0]
    kept_row = clean_google.get(app_name)
    if kept_row is None or reviews(row[3]) > reviews(kept_row[3]):
        clean_google[app_name] = row

# Rebuild a list of rows with the header row back in front.
clean_google_ls = [unique_google[0]] + list(clean_google.values())

# explore_data prints its own output and returns None, so it is not wrapped in print().
explore_data(clean_google_ls, 0, 3, True)

print(len(clean_google_ls))
   
        

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'FAMILY', '3.9', '974', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of rows: 9659
Number of columns: 13
None
9660


In [9]:
# Keep one row per app name (column 1, track_name). When a name appears more
# than once, keep the row with the highest total rating count (column 5); on
# a tie the row seen first is kept because the comparison is strict.
clean_apple = {}

for row in unique_apple[1:]:
    app_name = row[1]
    kept_row = clean_apple.get(app_name)
    if kept_row is None or reviews(row[5]) > reviews(kept_row[5]):
        clean_apple[app_name] = row

# Rebuild a list of rows with the header row back in front.
clean_apple_ls = [unique_apple[0]] + list(clean_apple.values())

# explore_data prints its own output and returns None, so it is not wrapped in print().
explore_data(clean_apple_ls, 0, 3, True)

print(len(clean_apple_ls))
   

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 7195
Number of columns: 16
None
7196


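## Defining a function to detect non-English apps
Some English app names contain a few characters outside the ASCII range, so to avoid filtering those apps out we only treat a name as non-English when it contains more than three non-ASCII characters.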

In [10]:
def check_non_english(some_string, max_non_ascii=3):
    """Return True when a string is treated as English, False otherwise.

    Despite the function's name, True means the string passes the check.
    A string fails once it contains more than ``max_non_ascii`` characters
    whose code point is above 127 (outside the ASCII range).

    Args:
        some_string: the text to inspect (an app name in this notebook).
        max_non_ascii: how many non-ASCII characters are tolerated before
            the string is rejected. Defaults to 3.

    Returns:
        bool: False if the tolerance is exceeded, True otherwise.
    """
    non_ascii_count = 0
    for character in some_string:
        if ord(character) > 127:
            non_ascii_count += 1
            # Stop early: the rest of the string cannot change the outcome.
            if non_ascii_count > max_non_ascii:
                return False
    return True

# Keep only the rows whose app name passes check_non_english. The header row
# goes through the same filter as every other row.
english_google = [entry for entry in clean_google_ls if check_non_english(entry[0])]
print (len(clean_google_ls))
print (len(english_google))

# App Store names are in column 1 (track_name) rather than column 0.
english_apple = [entry for entry in clean_apple_ls if check_non_english(entry[1])]
print (len(clean_apple_ls))
print (len(english_apple))

9660
9615
7196
6182


## Isolating free apps
We keep only the free apps, as they are our main focus.


In [11]:
# Each result list starts with its dataset's header row.
free_apple_apps = [english_apple[0]]
free_google_apps = [english_google[0]]

# An app is free when its price, with any '$' sign stripped, parses to zero.
# The App Store price is in column 4; the Google Play price is in column 7.
free_apple_apps.extend(
    app for app in english_apple[1:]
    if abs(float(app[4].replace('$', ''))) == 0
)

free_google_apps.extend(
    app for app in english_google[1:]
    if abs(float(app[7].replace('$', ''))) == 0
)

print (len(free_apple_apps))
print (len(free_google_apps))
        

3221
8865


## More Explanation

We need to find an app profile that will be successful on both the Google Play and App Store platforms. To do this, we need to check what kinds of apps are successful on both platforms and which genres are common.
Our aim is to determine the kinds of apps that are likely to attract more users, because our revenue is highly influenced by the number of people using our apps.

We create frequency tables to analyze the prime_genre column of the App Store data set, and the Genres and Category columns of the Google Play data set.

In [12]:
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of a dataset.

    Args:
        dataset: list of rows whose first row is a header (it is skipped).
        index: position of the column to tabulate.

    Returns:
        dict mapping each distinct column value to its share of the data
        rows as a percentage rounded to two decimals. Keys appear in order
        of first occurrence. A dataset with no data rows yields an empty dict.
    """
    data_rows = dataset[1:]

    counts = {}
    for row in data_rows:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    # The row total is the same for every key, so it is computed once.
    total = len(data_rows)
    return {
        value: round((count / total) * 100, 2)
        for value, count in counts.items()
    }

def display_table(dataset, index):
    """Print the frequency table of a column, largest percentage first.

    The table comes from freq_table(dataset, index). Entries are sorted as
    (percentage, value) tuples in descending order, so values with equal
    percentages appear in descending order of the value itself.
    """
    percentages = freq_table(dataset, index)
    ranked = sorted(
        ((share, label) for label, share in percentages.items()),
        reverse = True,
    )
    for share, label in ranked:
        print(label, ':', share)

In [13]:
# Percentage share of each prime_genre value (column 11) among the free App Store apps.
display_table(free_apple_apps, 11)

Games : 58.14
Entertainment : 7.89
Photo & Video : 4.97
Education : 3.66
Social Networking : 3.29
Shopping : 2.61
Utilities : 2.52
Sports : 2.14
Music : 2.05
Health & Fitness : 2.02
Productivity : 1.74
Lifestyle : 1.58
News : 1.34
Travel : 1.24
Finance : 1.12
Weather : 0.87
Food & Drink : 0.81
Reference : 0.56
Business : 0.53
Book : 0.43
Navigation : 0.19
Medical : 0.19
Catalogs : 0.12


In [14]:
# Percentage share of each Category value (column 1) among the free Google Play apps.
display_table(free_google_apps, 1)


FAMILY : 18.91
GAME : 9.72
TOOLS : 8.46
BUSINESS : 4.59
LIFESTYLE : 3.9
PRODUCTIVITY : 3.89
FINANCE : 3.7
MEDICAL : 3.53
SPORTS : 3.4
PERSONALIZATION : 3.32
COMMUNICATION : 3.24
HEALTH_AND_FITNESS : 3.08
PHOTOGRAPHY : 2.94
NEWS_AND_MAGAZINES : 2.8
SOCIAL : 2.66
TRAVEL_AND_LOCAL : 2.34
SHOPPING : 2.25
BOOKS_AND_REFERENCE : 2.14
DATING : 1.86
VIDEO_PLAYERS : 1.79
MAPS_AND_NAVIGATION : 1.4
FOOD_AND_DRINK : 1.24
EDUCATION : 1.16
ENTERTAINMENT : 0.96
LIBRARIES_AND_DEMO : 0.94
AUTO_AND_VEHICLES : 0.93
HOUSE_AND_HOME : 0.82
WEATHER : 0.8
EVENTS : 0.71
PARENTING : 0.65
ART_AND_DESIGN : 0.64
COMICS : 0.62
BEAUTY : 0.6


In [15]:
# Percentage share of each Genres value (column 9) among the free Google Play apps.
display_table(free_google_apps, 9)

Tools : 8.45
Entertainment : 6.07
Education : 5.35
Business : 4.59
Productivity : 3.89
Lifestyle : 3.89
Finance : 3.7
Medical : 3.53
Sports : 3.46
Personalization : 3.32
Communication : 3.24
Action : 3.1
Health & Fitness : 3.08
Photography : 2.94
News & Magazines : 2.8
Social : 2.66
Travel & Local : 2.32
Shopping : 2.25
Books & Reference : 2.14
Simulation : 2.04
Dating : 1.86
Arcade : 1.85
Video Players & Editors : 1.77
Casual : 1.76
Maps & Navigation : 1.4
Food & Drink : 1.24
Puzzle : 1.13
Racing : 0.99
Role Playing : 0.94
Libraries & Demo : 0.94
Auto & Vehicles : 0.93
Strategy : 0.91
House & Home : 0.82
Weather : 0.8
Events : 0.71
Adventure : 0.68
Comics : 0.61
Beauty : 0.6
Art & Design : 0.6
Parenting : 0.5
Card : 0.45
Casino : 0.43
Trivia : 0.42
Educational;Education : 0.39
Board : 0.38
Educational : 0.37
Education;Education : 0.34
Word : 0.26
Casual;Pretend Play : 0.24
Music : 0.2
Racing;Action & Adventure : 0.17
Puzzle;Brain Games : 0.17
Entertainment;Music & Video : 0.17
Casual;

One way to find out which genres are the most popular (have the most users) is to calculate the average number of installs for each app genre. For the Google Play data set, we can find this information in the Installs column, but this information is missing from the App Store data set. As a workaround, we'll use the total number of user ratings as a proxy, which we can find in the rating_count_tot column.

In [None]:
# NOTE(review): popular_genre is not defined in any cell visible in this
# notebook, and this cell has no execution count. It will raise NameError on
# a fresh kernel unless the function is defined elsewhere; confirm, then
# define it or remove this cell.
popular_genre(free_apple_apps, 11)

In [33]:
# rating maps each prime_genre (column 11) to [sum of rating_count_tot, number of apps].
rating = {}

for row in free_apple_apps[1:]:
    genre = row[11]
    n_ratings = float(row[5])
    if genre in rating:
        running_total, app_count = rating[genre]
        rating[genre] = [running_total + n_ratings, app_count + 1]
    else:
        rating[genre] = [n_ratings, 1]

# rating_insight maps each genre to its average number of ratings per app.
rating_insight = {
    genre: totals[0] / totals[1]
    for genre, totals in rating.items()
}


print (rating)
print (rating_insight)

{'Social Networking': [7584125.0, 106], 'Photo & Video': [4550647.0, 160], 'Games': [42705795.0, 1872], 'Music': [3783551.0, 66], 'Reference': [1348958.0, 18], 'Health & Fitness': [1514371.0, 65], 'Weather': [1463837.0, 28], 'Utilities': [1513441.0, 81], 'Travel': [1129752.0, 40], 'Shopping': [2261254.0, 84], 'News': [913665.0, 43], 'Navigation': [516542.0, 6], 'Lifestyle': [840774.0, 51], 'Entertainment': [3563577.0, 254], 'Food & Drink': [866682.0, 26], 'Sports': [1587614.0, 69], 'Book': [556619.0, 14], 'Finance': [1132846.0, 36], 'Education': [826470.0, 118], 'Productivity': [1177591.0, 56], 'Business': [127349.0, 17], 'Catalogs': [16016.0, 4], 'Medical': [3672.0, 6]}
{'Social Networking': 71548.34905660378, 'Photo & Video': 28441.54375, 'Games': 22812.92467948718, 'Music': 57326.530303030304, 'Reference': 74942.11111111111, 'Health & Fitness': 23298.015384615384, 'Weather': 52279.892857142855, 'Utilities': 18684.456790123455, 'Travel': 28243.8, 'Shopping': 26919.690476190477, 'News

In [37]:
categories_android = freq_table(free_google_apps, 1)

# Accumulate [total installs, number of apps] per category in a single pass
# over the data rows, instead of rescanning the whole dataset once per
# category. Categories keep the order of categories_android, and each
# category's installs are summed in dataset order, so the printed averages
# are unchanged.
install_stats = {category: [0.0, 0] for category in categories_android}

for app in free_google_apps[1:]:
    # Installs look like '10,000+'; strip the separators before converting.
    n_installs = app[5].replace(',', '').replace('+', '')
    category_stats = install_stats[app[1]]
    category_stats[0] += float(n_installs)
    category_stats[1] += 1

for category, (total, len_category) in install_stats.items():
    avg_n_installs = total / len_category
    print(category, ':', avg_n_installs)

ART_AND_DESIGN : 1986335.0877192982
FAMILY : 3695641.8198090694
AUTO_AND_VEHICLES : 647317.8170731707
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 8767811.894736841
BUSINESS : 1712290.1474201474
COMICS : 817657.2727272727
COMMUNICATION : 38456119.167247385
TOOLS : 10801391.298666667
DATING : 854028.8303030303
EDUCATION : 1833495.145631068
ENTERTAINMENT : 11640705.88235294
EVENTS : 253542.22222222222
FINANCE : 1387692.475609756
FOOD_AND_DRINK : 1924897.7363636363
HEALTH_AND_FITNESS : 4188821.9853479853
HOUSE_AND_HOME : 1331540.5616438356
LIBRARIES_AND_DEMO : 638503.734939759
LIFESTYLE : 1437816.2687861272
GAME : 15588015.603248259
VIDEO_PLAYERS : 24727872.452830188
MEDICAL : 120550.61980830671
SOCIAL : 23253652.127118643
SHOPPING : 7036877.311557789
PHOTOGRAPHY : 17840110.40229885
SPORTS : 3638640.1428571427
TRAVEL_AND_LOCAL : 13984077.710144928
PERSONALIZATION : 5201482.6122448975
PRODUCTIVITY : 16787331.344927534
PARENTING : 542603.6206896552
WEATHER : 5074486.197183099
NEWS_AND_