In [None]:
from csv import reader

# Read both CSV files fully into memory. Each file is opened in a `with` block
# so its handle is closed as soon as the rows have been read, and with
# newline="" as the csv module recommends for files passed to csv.reader.
# NOTE(review): app names printed later in this notebook look mis-decoded,
# which suggests the files may really be UTF-8 — confirm before changing
# the encoding, since the non-ASCII character counts depend on it.
with open("googleplaystore.csv", encoding="Latin1", newline="") as play_store_file:
    play_store = list(reader(play_store_file))
play_store_header = play_store[0]   # first row holds the column names
play_store_data = play_store[1:]    # remaining rows are the app entries

with open("AppleStore.csv", encoding="Latin1", newline="") as app_store_file:
    app_store = list(reader(app_store_file))
app_store_header = app_store[0]
app_store_data = app_store[1:]

In [None]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: Sequence of rows; it is sliced, and the first row is measured
            with ``len`` to obtain the column count.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When true, also print the total number of rows and
            the number of columns (taken from the first row).
    """
    for row in dataset[start:end]:
        print(row)
        print("\n")  # separates consecutive rows with blank lines

    if rows_and_columns:
        # An empty dataset has no first row to measure; report 0 columns
        # instead of raising IndexError.
        num_columns = len(dataset[0]) if len(dataset) > 0 else 0
        print(f"Number of rows: {len(dataset)}")
        print(f"Number of columns: {num_columns}")

In [None]:
# For each store: print the header row, then the first three data rows
# followed by the dataset's row and column counts.
for header, data in (
    (play_store_header, play_store_data),
    (app_store_header, app_store_data),
):
    print(header)
    print("\n")
    explore_data(data, 0, 3, True)

In [None]:
# Print the incorrect row (index 10472), then the header, then a correct row,
# so the three can be compared by eye.
for shown in (play_store_data[10472], play_store_header):
    print(shown)
    print('\n')
print(play_store_data[0])

In [None]:
# Delete the incorrect row at index 10472. The field-count check makes this
# cell safe to re-run: a bare `del` would remove a different, valid row every
# time the cell is executed again.
# NOTE(review): assumes the incorrect row can be recognised by having a
# different number of fields than the header — confirm against the data.
if len(play_store_data[10472]) != len(play_store_header):
    del play_store_data[10472]

In [None]:
# Print every Play Store row whose app name (column 0) is exactly "Instagram".
instagram_rows = [row for row in play_store_data if row[0] == "Instagram"]
for row in instagram_rows:
    print(row)

In [None]:
# Count duplicate and unique apps.
# The first row seen for a name counts as unique; every later row with the
# same name counts as a duplicate.
duplicate_apps = []
unique_apps = []
seen_names = set()  # constant-time membership; scanning unique_apps per row was quadratic overall

for row in play_store_data:
    name = row[0]
    if name in seen_names:
        duplicate_apps.append(name)
    else:
        seen_names.add(name)
        unique_apps.append(name)

print(len(unique_apps))
print(len(duplicate_apps))
print(duplicate_apps[:15])

In [None]:
# Map each app name to the largest review count (column 3) found among its
# rows; the entry with the most reviews is treated as the most recent and
# reliable one.
reviews_max = {}

for row in play_store_data:
    name, num_reviews = row[0], float(row[3])

    # Record the count for a new name, or when it beats the stored maximum.
    if name not in reviews_max or reviews_max[name] < num_reviews:
        reviews_max[name] = num_reviews

print(len(reviews_max))

In [None]:
# Remove duplicate entries: for each app keep the first row whose review count
# equals the maximum stored in reviews_max.
ps_data_clean = []
already_added = set()  # names already kept; a set keeps the per-row check constant-time

for row in play_store_data:
    name = row[0]
    num_reviews = float(row[3])

    # The already_added check breaks ties when several rows of the same app
    # share the maximum review count.
    if reviews_max[name] == num_reviews and name not in already_added:
        ps_data_clean.append(row)
        already_added.add(name)

explore_data(ps_data_clean, 0, 3, True)

In [None]:
# Remove non-English apps (only if name has > max_non_ascii non-ASCII characters; 3 by default)
def is_English(string, max_non_ascii=3):
    """Return whether ``string`` is considered English.

    A character is non-ASCII when its code point is above 127. A few such
    characters are tolerated so that names containing symbols like a
    trademark sign are still accepted.

    Args:
        string: The text to check.
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 3.

    Returns:
        False when ``string`` holds more than ``max_non_ascii`` non-ASCII
        characters, True otherwise.
    """
    non_ascii = sum(1 for character in string if ord(character) > 127)
    return non_ascii <= max_non_ascii

# Try is_English on two sample names and print each verdict.
for sample_name in (
    'Docs To Go™ Free Office Suite',
    'Instachat " src="https://s.w.org/images/core/emoji/11.2.0/svg/1f61c.svg">" src="https://s.w.org/images/core/emoji/11.2.0/svg/1f61c.svg">" src="https://s.w.org/images/core/emoji/11.2.0/svg/1f61c.svg">',
):
    print(is_English(sample_name))

In [None]:
# Keep only the rows whose app name passes is_English. The name is read from
# column 0 of the Play Store rows and column 2 of the App Store rows.
ps_eng = [row for row in ps_data_clean if is_English(row[0])]
as_eng = [row for row in app_store_data if is_English(row[2])]

explore_data(ps_eng, 0, 3, True)
print("\n")
explore_data(as_eng, 0, 3, True)

In [None]:
# Only keep free apps: rows whose price field is exactly the string "0"
# (column 7 of the Play Store rows, column 5 of the App Store rows).
# NOTE(review): a free price written any other way (e.g. "0.0") would not
# match this comparison — confirm the format used in both files.
ps_final = [row for row in ps_eng if row[7] == "0"]
as_final = [row for row in as_eng if row[5] == "0"]

print(len(ps_final))
print(len(as_final))

In [None]:
# Build frequency table
def freq_table(dataset, index):
    """Return the percentage share of each distinct value in one column.

    Args:
        dataset: Iterable of rows; each row must support ``row[index]``.
        index: Position of the column to tabulate.

    Returns:
        A dict mapping every distinct value found at ``index`` to the
        percentage (0-100) of rows that hold it. Empty for an empty dataset.
    """
    counts = {}
    row_count = 0

    for record in dataset:
        row_count += 1
        cell = record[index]
        counts[cell] = counts.get(cell, 0) + 1

    # Convert the raw counts into percentages of all rows.
    return {cell: (count / row_count) * 100 for cell, count in counts.items()}

# Display frequency percentages in descending order
def display_table(dataset, index):
    """Print one ``value: percentage%`` line per distinct value in a column.

    Lines are ordered by percentage, largest first; values with equal
    percentages are ordered by the value itself, also descending.

    Args:
        dataset: Rows to tabulate, passed straight to ``freq_table``.
        index: Position of the column to tabulate.
    """
    shares = freq_table(dataset, index)

    # Pair each value as (percentage, value) so sorting compares percentages first.
    ranked = sorted(((share, value) for value, share in shares.items()), reverse=True)

    for share, value in ranked:
        print(f"{value}: {share}%")


In [None]:
# Share of each value in the prime_genre column (index -5) of the App Store rows.
display_table(dataset=as_final, index=-5)

In [None]:
# Share of each value in the Category column (index 1) of the Play Store rows.
display_table(dataset=ps_final, index=1)

In [None]:
# Share of each value in the Genres column (index -4) of the Play Store rows.
display_table(dataset=ps_final, index=-4)

In [None]:
genres_ios = freq_table(as_final, -5)       # one key per genre present in as_final

# Average number of user ratings (column 6) for each genre.
for genre in genres_ios:
    genre_apps = [app for app in as_final if app[-5] == genre]

    total = 0
    for app in genre_apps:
        total += float(app[6])

    avg_num_ratings = total / len(genre_apps)
    print(f"{genre}: {avg_num_ratings}")


In [None]:
# List each Navigation app as "name: rating count" (columns 2 and 6).
navigation_apps = [app for app in as_final if app[-5] == "Navigation"]
for app in navigation_apps:
    print(f"{app[2]}: {app[6]}")

In [None]:
# List each Reference app as "name: rating count" (columns 2 and 6).
reference_apps = [app for app in as_final if app[-5] == "Reference"]
for app in reference_apps:
    print(f"{app[2]}: {app[6]}")

In [None]:
# Share of each install bracket (column 5) among the Play Store rows.
display_table(dataset=ps_final, index=5)

In [None]:
categories_android = freq_table(ps_final, 1)

# Average install count for each category. Column 5 holds strings such as
# "1,000,000+", so the commas and the plus sign are removed before converting.
for category in categories_android:
    category_apps = [app for app in ps_final if app[1] == category]

    total = 0
    for app in category_apps:
        total += float(app[5].replace(",", "").replace("+", ""))

    avg_num_installs = total / len(category_apps)
    print(f"{category}: {avg_num_installs}")

In [None]:
# COMMUNICATION apps in the three largest install brackets.
communication_brackets = ('1,000,000,000+', '500,000,000+', '100,000,000+')
for app in ps_final:
    if app[1] == 'COMMUNICATION' and app[5] in communication_brackets:
        print(f"{app[0]}: {app[5]}")

In [None]:
# Average install count over the apps with fewer than 100,000,000 installs.
# NOTE(review): this averages across every category in ps_final — confirm
# that no category filter was intended here.
install_counts = (
    float(app[5].replace(",", "").replace("+", "")) for app in ps_final
)
under_100_mil = [count for count in install_counts if count < 100000000]

sum(under_100_mil) / len(under_100_mil)

In [None]:
# List every BOOKS_AND_REFERENCE app as "name: installs" (columns 0 and 5).
books_apps = [app for app in ps_final if app[1] == "BOOKS_AND_REFERENCE"]
for app in books_apps:
    print(f"{app[0]}: {app[5]}")

In [25]:
# BOOKS_AND_REFERENCE apps in the three largest install brackets.
books_top_brackets = ('1,000,000,000+', '500,000,000+', '100,000,000+')
for app in ps_final:
    if app[1] == "BOOKS_AND_REFERENCE" and app[5] in books_top_brackets:
        print(f"{app[0]}: {app[5]}")

Google Play Books: 1,000,000,000+
Bible: 100,000,000+
Amazon Kindle: 100,000,000+
Audiobooks from Audible: 100,000,000+


In [26]:
# BOOKS_AND_REFERENCE apps in the 1,000,000+ to 50,000,000+ install brackets.
books_mid_brackets = ('1,000,000+', '5,000,000+', '10,000,000+', '50,000,000+')
for app in ps_final:
    if app[1] == "BOOKS_AND_REFERENCE" and app[5] in books_mid_brackets:
        print(f"{app[0]}: {app[5]}")

Wikipedia: 10,000,000+
Cool Reader: 10,000,000+
Book store: 1,000,000+
FBReader: Favorite Book Reader: 10,000,000+
Free Books - Spirit Fanfiction and Stories: 1,000,000+
AlReader -any text book reader: 5,000,000+
FamilySearch Tree: 1,000,000+
Cloud of Books: 1,000,000+
ReadEra â free ebook reader: 1,000,000+
Ebook Reader: 5,000,000+
Read books online: 5,000,000+
eBoox: book reader fb2 epub zip: 1,000,000+
All Maths Formulas: 1,000,000+
Ancestry: 5,000,000+
HTC Help: 10,000,000+
Moon+ Reader: 10,000,000+
English-Myanmar Dictionary: 1,000,000+
Golden Dictionary (EN-AR): 1,000,000+
All Language Translator Free: 1,000,000+
Aldiko Book Reader: 10,000,000+
Dictionary - WordWeb: 5,000,000+
50000 Free eBooks & Free AudioBooks: 5,000,000+
Al-Quran (Free): 10,000,000+
Al Quran Indonesia: 10,000,000+
Al'Quran Bahasa Indonesia: 10,000,000+
Al Quran Al karim: 1,000,000+
Al Quran : EAlim - Translations & MP3 Offline: 5,000,000+
Koran Read &MP3 30 Juz Offline: 1,000,000+
Hafizi Quran 15 lines per p