# Data Analysis on Andriod and iOS Mobile Apps
<br>
This project is to discover and learn trends from apps that are used in the Google Play and App store. Which apps are more popular, have most daily users, and spend the most money.

## Gathering Data

In [1]:
from csv import reader

In [2]:
# Pulling in the csv files for both iOS and Google
# Dataset infomation for iOS
# https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps
apple_file = open('AppleStore.csv')
google_file = open('googleplaystore.csv')

# Reading in both of those files
# Dataset information for Google Play
# https://www.kaggle.com/lava18/google-play-store-apps
read_apple_file = reader(apple_file)
read_google_file = reader(google_file)

# Creating a list of lists of all data
apple_apps_data = list(read_apple_file)
google_apps_data = list(read_google_file)

In [3]:
# Checking to see if data read in correctly
apple_apps_data[:3]

[['id',
  'track_name',
  'size_bytes',
  'currency',
  'price',
  'rating_count_tot',
  'rating_count_ver',
  'user_rating',
  'user_rating_ver',
  'ver',
  'cont_rating',
  'prime_genre',
  'sup_devices.num',
  'ipadSc_urls.num',
  'lang.num',
  'vpp_lic'],
 ['284882215',
  'Facebook',
  '389879808',
  'USD',
  '0.0',
  '2974676',
  '212',
  '3.5',
  '3.5',
  '95.0',
  '4+',
  'Social Networking',
  '37',
  '1',
  '29',
  '1'],
 ['389801252',
  'Instagram',
  '113954816',
  'USD',
  '0.0',
  '2161558',
  '1289',
  '4.5',
  '4.0',
  '10.23',
  '12+',
  'Photo & Video',
  '37',
  '0',
  '29',
  '1']]

In [4]:
# Checking to see if data read in correctly
google_apps_data[:3]

[['App',
  'Category',
  'Rating',
  'Reviews',
  'Size',
  'Installs',
  'Type',
  'Price',
  'Content Rating',
  'Genres',
  'Last Updated',
  'Current Ver',
  'Android Ver'],
 ['Photo Editor & Candy Camera & Grid & ScrapBook',
  'ART_AND_DESIGN',
  '4.1',
  '159',
  '19M',
  '10,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design',
  'January 7, 2018',
  '1.0.0',
  '4.0.3 and up'],
 ['Coloring book moana',
  'ART_AND_DESIGN',
  '3.9',
  '967',
  '14M',
  '500,000+',
  'Free',
  '0',
  'Everyone',
  'Art & Design;Pretend Play',
  'January 15, 2018',
  '2.0.0',
  '4.0.3 and up']]

In [5]:
# Function to work with data (Created by Dataquest)
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]
    for row in dataset_slice:
        print(row)
        print('\n')
        
    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [6]:
explore_data(apple_apps_data, 1, 5, rows_and_columns=True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 7198
Number of columns: 16


In [7]:
explore_data(google_apps_data, 1, 5, rows_and_columns=True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 10842
Number of columns: 13


In [8]:
# Column names for iOS data
print(apple_apps_data[0])

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


In [9]:
# Column names for Google data
print(google_apps_data[0])

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


In [10]:
print(len(google_apps_data[10473]))
print(google_apps_data[10473])

12
['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [11]:
del google_apps_data[10473]

There are duplicate apps in the dataset. The code below is to discover how many duplicates there are and the names of the apps.

In [16]:
# Using code in lesson to check for duplicate Google Play apps

duplicate_google_apps = []
unique_google_apps = []

for app in google_apps_data:
    name = app[0]
    if name in unique_google_apps:
        duplicate_google_apps.append(name)
    else:
        unique_google_apps.append(name)
        
print('Number of duplicate Google Play apps:', len(duplicate_google_apps))
print('\n')
print('Examples of duplicate Google Play apps:', duplicate_google_apps[:15])

Number of duplicate Google Play apps: 1181


Examples of duplicate Google Play apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack', 'FreshBooks Classic', 'Insightly CRM', 'QuickBooks Accounting: Invoicing & Expenses', 'HipChat - Chat Built for Teams', 'Xero Accounting Software']


In [22]:
# Using code in lesson to check for duplicate iOS apps

duplicate_apple_apps = []
unique_apple_apps = []

for app in apple_apps_data:
    name = app[1] # track_name index 1
    if name in unique_apple_apps:
        duplicate_apple_apps.append(name)
    else:
        unique_apple_apps.append(name)
        
print('Number of duplicate iOS apps:', len(duplicate_apple_apps))
print('\n')
print('Examples of duplicate iOS apps:', duplicate_apple_apps[:15])

Number of duplicate iOS apps: 2


Examples of duplicate iOS apps: ['Mannequin Challenge', 'VR Roller Coaster']


The duplicate rows will need to be deleted based on which row has the highest reviews. It is assumed that the app with the highest reviews will be the most recent and updated information for the app.

In [30]:
reviews_max = {}

# Looping through Google Play apps, not inluding header row
for app in google_apps_data[1:]:
    name = app[0]
    n_reviews = float(app[3])
    
    # Checking to see which duplicate has more ratings
    # If not in the dict, appending to the dict
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews
print(len(reviews_max))

9659


In [31]:
# Using the dictionary as a source to clean out duplicates
# Creating an empty list to have only one of each app
google_play_clean = []
already_added = []

for app in google_apps_data[1:]:
    name = app[0]
    n_reviews = float(app[3])
    
    if n_reviews == reviews_max[name] and name not in already_added:
        google_play_clean.append(app)
        already_added.append(name)
        

In [33]:
# Checking to see if the data was cleaned properly
print(len(google_play_clean))
print(google_play_clean[:5])

9659
[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'], ['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']]


In [39]:
# Function to remove non-english apps
# Using ASCII range to check for english characters
def check_english_char(string):
    
    # Non-english char counter
    char_check = 0
    
    for char in string:
        if ord(char) > 127:
            char_check += 1
            
            # If more than 3 char are not engish then it returns false
            if ord(char) > 127 and char_check > 3:
                return False
    return True

In [40]:
check_english_char('Instagram')

True

In [41]:
check_english_char('爱奇艺PPS -《欢乐颂2》电视剧热播')

False

In [42]:
check_english_char('Docs To Go™ Free Office Suite')

True

In [43]:
check_english_char('Instachat 😜')

True

In [47]:
# Removing Non-Enlgish apps from Google Play dataset
android_clean = []

for app in google_play_clean:
    
    if check_english_char(app[0]):
        android_clean.append(app)

In [52]:
print(len(android_clean))
print(android_clean[:5])

9614
[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'], ['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']]


In [55]:
# Removing Non-Enlgish apps from iOS dataset
apple_clean = []

for app in apple_apps_data:
    if check_english_char(app[1]):
        apple_clean.append(app)

In [56]:
print(len(apple_clean))
print(apple_clean[:5])

6184
[['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'], ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'], ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'], ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1'], ['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']]
