# Profitable App Profiles for the App Store and Google Play Markets

The goal of this project is to analyze data to discover which types of apps are likely to attract more users. As of September 2018, there were approximately 2 million iOS apps available on the App Store, and 2.1 million Android apps on Google Play. Collecting data for over 4 million apps requires a significant amount of time and money, so we'll try to analyze a sample of the data instead. To avoid spending resources on collecting new data ourselves, we should first try to see if we can find any relevant existing data at no cost.

In [1]:
from csv import reader


In [2]:
# Load both data sets. In each file the first row is the header and the
# remaining rows are app records.
# `with` closes each file as soon as it has been read; `newline=''` is what the
# csv module asks for, and the explicit encoding keeps non-ASCII app names
# readable regardless of the platform's default encoding.

# Opening the App Store data set
with open('data/AppleStore.csv', encoding='utf8', newline='') as opened_apple:
    read_file = reader(opened_apple)
    ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]

# Opening the Google Play data set
with open('data/googleplaystore.csv', encoding='utf8', newline='') as opened_google:
    read_file = reader(opened_google)
    android = list(read_file)
android_header = android[0]
android = android[1:]

### Exploring our data

In [3]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of a data set, optionally followed by its dimensions.

    Args:
        dataset: A list of rows, where each row is itself a sequence.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When True, also print the number of rows and
            columns of the whole ``dataset`` (not of the printed slice).
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # blank lines between rows keep the output readable
    if rows_and_columns:
        print('Number of rows: ', len(dataset))
        # The column count is read from the first row. An empty data set has
        # no first row, so report 0 instead of raising IndexError.
        print('Number of columns: ', len(dataset[0]) if dataset else 0)
        
# Show the App Store column names, then the first three rows and the
# dimensions of the whole table.
print(ios_header, end='\n\n\n')  # header line followed by two blank lines
explore_data(dataset=ios, start=0, end=3, rows_and_columns=True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


Number of rows:  7197
Number of columns:  16


In [4]:
# Show the Google Play column names, then the first three rows and the
# dimensions of the whole table.
print(android_header, end='\n\n\n')  # header line followed by two blank lines
explore_data(dataset=android, start=0, end=3, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows:  10841
Number of columns:  13


### Looking for errors and cleaning data

#### Missing Data

In [5]:
# Find rows with missing (or extra) fields: a well-formed row has exactly as
# many fields as the header.
expected_length = len(android_header)  # loop-invariant, so computed once
for index, row in enumerate(android):
    if len(row) != expected_length:
        print(row)
        # enumerate yields this row's own position; list.index() would rescan
        # the list and return the first *equal* row, which is the wrong index
        # when the same row appears more than once.
        print(index)

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']
10472


In [6]:
# Remove the malformed row reported by the length check (index 10472).
# The guard keeps this cell safe to re-run: once the bad row is gone, whatever
# valid row now sits at that index is left alone.
if len(android[10472]) != len(android_header):
    del android[10472]

In [7]:
# Same field-count check for the App Store data. It prints nothing for this
# file, i.e. no row has missing or extra fields.
expected_length = len(ios_header)  # loop-invariant, so computed once
for index, row in enumerate(ios):
    if len(row) != expected_length:
        print(row)
        # enumerate yields this row's own position; list.index() would return
        # the position of the first equal row instead.
        print(index)

#### Duplicates

In [9]:
# Look for duplicated app names in the Google Play data set.
# duplicate_android_apps collects every repeated occurrence of a name;
# unique_android_apps collects each name once, in first-seen order.
duplicate_android_apps = []
unique_android_apps = []
seen_android_names = set()  # O(1) membership test instead of scanning the list
for app in android:
    app_name = app[0]  # column 0 holds the app name
    if app_name in seen_android_names:
        duplicate_android_apps.append(app_name)
    else:
        seen_android_names.add(app_name)
        unique_android_apps.append(app_name)

print(f'There are {len(duplicate_android_apps)} duplicate apps in the data set.')
print(f'A example of duplicated apps is: {duplicate_android_apps[0:9]}')

There are 1181 duplicate apps in the data set.
A example of duplicated apps is: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business']


In [10]:
# Look for duplicated app names in the App Store data set.
# duplicate_ios_apps collects every repeated occurrence of a name;
# unique_ios_apps collects each name once, in first-seen order.
duplicate_ios_apps = []
unique_ios_apps = []
seen_ios_names = set()  # O(1) membership test instead of scanning the list
for app in ios:
    app_name = app[1]  # column 1 ('track_name') holds the app name
    if app_name in seen_ios_names:
        duplicate_ios_apps.append(app_name)
    else:
        seen_ios_names.add(app_name)
        unique_ios_apps.append(app_name)

print(f'There are {len(duplicate_ios_apps)} duplicate apps in the data set.')

There are 2 duplicate apps in the data set.


##### Removing duplicates

In [11]:
# For every app name, record the highest review count found among its entries.
reviews_max = {}
for entry in android:
    title = entry[0]
    review_count = float(entry[3])
    # Store the count the first time a name is seen, or whenever it beats the
    # count stored so far.
    if title not in reviews_max or review_count > reviews_max[title]:
        reviews_max[title] = review_count

# Sanity check: one dictionary entry per distinct app name.
print('Expected length: ', len(android) - len(duplicate_android_apps))
print('Actual length: ', len(reviews_max))

Expected length:  9659
Actual length:  9659


In [12]:
# Keep exactly one row per app: the entry whose review count equals the
# maximum recorded for that name in reviews_max.
android_clean = []
already_added = []
added_names = set()  # mirrors already_added, for O(1) membership tests
for app in android:
    app_name = app[0]
    n_reviews = float(app[3])
    # The second condition handles ties: when several entries share the
    # maximum review count, only the first one is kept.
    if reviews_max[app_name] == n_reviews and app_name not in added_names:
        android_clean.append(app)
        already_added.append(app_name)
        added_names.add(app_name)

explore_data(android_clean, 1, 3, True)
        

['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows:  9659
Number of columns:  13


#### Finding and removing apps with non-ASCII (non-English) names

In [13]:
def ascii_characters(string, max_non_ascii=3):
    """Return True when ``string`` has at most ``max_non_ascii`` non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127.
    Allowing a few of them keeps names that contain an emoji or a symbol
    such as a trademark sign from being rejected.

    Args:
        string: The text to check (an app name in this notebook).
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        True if the non-ASCII character count is <= ``max_non_ascii``,
        otherwise False.
    """
    non_ascii_count = sum(1 for character in string if ord(character) > 127)
    return non_ascii_count <= max_non_ascii

In [14]:
# Keep only the apps whose names pass the ascii_characters() check, in both
# markets. The app name is column 0 on Google Play and column 1 on the App Store.
android_english = [app for app in android_clean if ascii_characters(app[0])]
ios_english = [app for app in ios if ascii_characters(app[1])]

explore_data(android_english, 0, 3, True)
print('\n')
explore_data(ios_english, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows:  9614
Number of columns:  13


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']

#### Keeping only the free apps

In [15]:
# Isolate the free apps in each market.
# Google Play flags them in the 'Type' column (index 6).
free_android = [app for app in android_english if app[6] == 'Free']
# For the App Store, an app is free when its 'price' column (index 4) is zero.
free_ios = [app for app in ios_english if float(app[4]) == 0.0]

explore_data(free_android, 0, 3, True)
print('\n')
explore_data(free_ios, 0, 3, True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows:  8863
Number of columns:  13


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']

### Analysis

We will begin by analyzing the most common genres in each market. To do that, we'll build frequency tables for a few columns in our data sets.