# Mobile App Analysis Project
This is a project analyzing the difference in revenue between iOS and Android apps.

In [None]:
# import relevant file and data analysis module
from csv import reader

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms

try:
    # Location of scatter_matrix in current pandas releases.
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas releases only provide the legacy module path.
    from pandas.tools.plotting import scatter_matrix

### The Google Play data set ###
# `with` closes the file as soon as the rows are read. newline='' is what the
# csv module asks for so quoted fields containing line breaks parse correctly.
# NOTE(review): encoding='utf8' assumes both CSV files are UTF-8 encoded (app
# names contain non-ASCII text) — confirm against the actual files.
with open('googleplaystore.csv', encoding='utf8', newline='') as file_1:
    read_file = reader(file_1)
    android = list(read_file)
android_header = android[0]  # first row holds the column names
android = android[1:]        # remaining rows are the apps

### The App Store data set ###
with open('AppleStore.csv', encoding='utf8', newline='') as file_2:
    read_file = reader(file_2)
    ios = list(read_file)
ios_header = ios[0]
ios = ios[1:]




Use the `explore_data` function below to explore a data set.

In [None]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of rows and, optionally, the data set's dimensions.

    Args:
        dataset: list of rows, each row being a list of column values.
        start: index of the first row to print.
        end: index one past the last row to print.
        rows_and_columns: when True, also print the row and column counts.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # print() appends its own newline, so this leaves two blank lines

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        # An empty data set has no first row to measure, so report 0 columns
        # instead of failing on dataset[0].
        print('Number of columns:', len(dataset[0]) if len(dataset) else 0)

# Show the Android column names, then the first three rows and the table size.
print(android_header)
print('\n')
explore_data(android, start=0, end=3, rows_and_columns=True)

In [None]:
# Show the iOS column names, then the first three rows and the table size.
print(ios_header)
print('\n')
explore_data(ios, start=0, end=3, rows_and_columns=True)

In [None]:
# Put the suspect row next to the header and a well-formed row for comparison.
for shown in (android[10472], android_header):  # incorrect row, then header
    print(shown)
    print('\n')
print(android[0])      # correct row

In [None]:
# Drop the incorrect row and show the row count before and after.
rows_before = len(android)
print(rows_before)
del android[10472]  # don't run this more than once: a second run deletes a valid row
rows_after = len(android)
print(rows_after)

# Data Cleaning

In [None]:
# Print every Android row whose name (column 0) is exactly 'Instagram'.
for row in android:
    if row[0] == 'Instagram':
        print(row)

**Detect duplicated apps**

In [None]:
# Split the Android app names into first occurrences and repeats.
duplicate_apps = []  # one entry per repeated row, in file order
unique_apps = []     # each distinct name once, in first-seen order
seen_names = set()   # same names as unique_apps, kept for O(1) membership tests

for app in android:
    name = app[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print('Number of duplicate apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', duplicate_apps[:15])

There are 1181 duplicate rows here; however, we won't remove rows randomly, but rather we'll keep the rows that have the highest number of reviews because the higher the number of reviews, the more reliable the ratings.

In [None]:
# Map each app name to the largest review count (column 3) found among its rows.
reviews_max = {}

for row in android:
    app_name = row[0]
    review_count = float(row[3])
    best_so_far = reviews_max.get(app_name)

    # First sighting of the name, or a strictly larger count than recorded.
    if best_so_far is None or best_so_far < review_count:
        reviews_max[app_name] = review_count

In [None]:
# The dictionary should hold one entry per app once the 1181 duplicate rows are discounted.
expected_length = len(android) - 1181
actual_length = len(reviews_max)
print('Expected length:', expected_length)
print('Actual length:', actual_length)

Use the `reviews_max` dictionary to remove the duplicates.

In [None]:
# Keep exactly one row per app: the one whose review count equals the maximum
# recorded for that name in reviews_max.
android_clean = []
already_added = []   # names kept so far, in insertion order
added_names = set()  # same names, kept for O(1) membership tests

for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # Several rows can share the maximum review count, so the name check
    # makes sure only the first of them is kept.
    if reviews_max[name] == n_reviews and name not in added_names:
        android_clean.append(app)
        already_added.append(name)  # make sure this is inside the if block
        added_names.add(name)

In [None]:
# First three de-duplicated Android rows plus the resulting table size.
explore_data(android_clean, start=0, end=3, rows_and_columns=True)


# Detect wrong and Non-English Data


In [None]:
# Print the name field of a few hand-picked rows from each data set.
for row_index in (813, 6731):
    print(ios[row_index][1])

for row_index in (4412, 7940):
    print(android_clean[row_index][0])

In [None]:
def is_english(string):
    """Return True only when every character has a code point of 127 or below."""
    return all(ord(character) <= 127 for character in string)


print(is_english('Instagram'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))

**Use the following function to detect non-English apps**

In [None]:
def is_english(string):
    """Return True unless more than three characters have a code point above 127.

    Tolerating up to three such characters keeps names that contain an
    emoji or a trademark sign.
    """
    non_ascii = sum(1 for character in string if ord(character) > 127)
    return non_ascii <= 3


print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))

In [None]:

# Keep only the rows whose name passes is_english.
# NOTE(review): the iOS name is read from column 1 here, while other cells read
# the iOS price from column 5 and the rating count from column 6 — confirm that
# column 1 really is the app name in AppleStore.csv and not an id column.
android_english = [row for row in android_clean if is_english(row[0])]
ios_english = [row for row in ios if is_english(row[1])]

explore_data(android_english, 0, 3, True)
print('\n')
explore_data(ios_english, 0, 3, True)

**Isolate free apps in our data**

In [None]:
# Keep only the apps whose price field is exactly the string '0'
# (column 7 for Android, column 5 for iOS).
# NOTE(review): the comparison is textual, so a price written as '0.0' would
# not match — confirm the raw format in both files.
android_final = [row for row in android_english if row[7] == '0']
ios_final = [row for row in ios_english if row[5] == '0']

print(len(android_final))
print(len(ios_final))

We are left with 8864 free Android apps and 4056 free iOS apps.

# Identify the most common genre in the market

Use these two functions to analyze frequencies.

In [None]:
def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    Args:
        dataset: iterable of rows, each row indexable by position.
        index: position of the column to tabulate.

    Returns:
        A dict mapping every distinct value found at ``index`` to the
        percentage (0-100) of rows that hold it. An empty dataset gives
        an empty dict.
    """
    counts = {}
    total = 0
    for row in dataset:
        total += 1
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    # Each key is paired with its own count, so the shares add up to 100.
    return {key: (count / total) * 100 for key, count in counts.items()}

def display_table(dataset, index):
    """Print one column's frequency table, largest percentage first.

    Args:
        dataset: rows to tabulate, passed straight to freq_table.
        index: position of the column to tabulate.
    """
    percentages = freq_table(dataset, index)
    # Sorting (percentage, value) pairs in reverse puts the largest share
    # first; equal shares fall back to reverse order of the value itself.
    ranked = sorted(
        ((share, value) for value, share in percentages.items()),
        reverse=True,
    )
    for share, value in ranked:
        print(value, ':', share)
        

In [None]:
# Frequency table for Android column 1 (Category).
display_table(dataset=android_final, index=1)

In [None]:
# Frequency table for the iOS column five from the end, which other cells use as the genre.
display_table(dataset=ios_final, index=-5)


In [None]:
# Frequency table for the Android column four from the end
# (presumably the genre column — confirm against android_header).
display_table(dataset=android_final, index=-4)


Use the average number of ratings per iOS app as a proxy for the number of users in each genre.

In [None]:
# Distinct iOS genres (column -5) with their frequency table values.
genre_ios = freq_table(ios_final, -5)
print(genre_ios)

# For each genre, average column 6 (used here as the rating count) over its apps.
for genre in genre_ios:
    ratings_sum = 0
    app_count = 0
    for row in ios_final:
        if row[-5] != genre:
            continue
        ratings_sum += float(row[6])
        app_count += 1
    print(genre, ':', ratings_sum / app_count)

Check the average number of installs per app in each Android category.

In [None]:
# Distinct Android categories (column 1) with their frequency table values.
categories_android = freq_table(android_final, 1)

# For each category, average the install figure (column 5) over its apps.
for category in categories_android:
    installs_sum = 0
    app_count = 0
    for row in android_final:
        if row[1] != category:
            continue
        # Install figures are text with ',' and '+' in them; strip both before converting.
        installs_sum += float(row[5].replace(',', '').replace('+', ''))
        app_count += 1
    print(category, ':', installs_sum / app_count)


# Experiment with Regression

Read these two files again with the pandas module

In [None]:
# Load both CSV files into DataFrames for the regression experiments.
data_android = pd.read_csv('googleplaystore.csv')
data_ios = pd.read_csv('AppleStore.csv')

In [None]:
# Preview the first rows of the App Store DataFrame.
data_ios.head()

In [None]:
# Preview the first rows of the Google Play DataFrame.
data_android.head()

In [None]:
# Summarize the App Store DataFrame (columns, dtypes, non-null counts).
data_ios.info()

In [None]:
# Summarize the Google Play DataFrame (columns, dtypes, non-null counts).
data_android.info()

Using multivariate regression methods, we would like to see which variables contribute to the 'rating' score in each category.

In [None]:
# Ordinary least squares fit of user_rating on price and cont_rating for the App Store data.
ios_formula = 'user_rating ~ price + cont_rating'
lm_ios = smf.ols(formula=ios_formula, data=data_ios).fit()
print(lm_ios.params)

In [None]:
def _to_number(value):
    """Convert a raw store value such as '10,000+' to a float; NaN if it is not numeric."""
    cleaned = str(value).replace(',', '').replace('+', '').replace('$', '')
    try:
        return float(cleaned)
    except ValueError:
        return float('nan')


# The raw Installs values are text containing ',' and '+', and a text column in
# a formula is treated as categorical (one coefficient per distinct string)
# instead of as a numeric predictor. Convert the three predictors to numbers
# first; values that cannot be parsed become NaN.
# NOTE(review): stripping '$' assumes paid prices are stored like '$4.99' — confirm.
# NOTE(review): rows with NaN predictors are expected to be dropped by the
# formula machinery's default missing-data handling — confirm.
android_numeric = data_android.copy()
for column in ('Reviews', 'Installs', 'Price'):
    android_numeric[column] = android_numeric[column].map(_to_number)

lm_android = smf.ols(formula='Rating ~ Reviews + Installs + Price', data=android_numeric).fit()
print(lm_android.params)