In [None]:
# Profitable App Profiles for the App Store and Google Play Markets

In this project, we will analyze mobile app data to help developers identify which types of apps are likely to attract more users on the App Store and Google Play. Since our company develops only free apps, the main source of revenue is through in-app advertisements. Therefore, increasing the user base for our apps directly correlates with generating more revenue.

The goal of this project is to provide data-driven insights into app categories that are popular and profitable. We will work with two datasets — one from the App Store and one from Google Play — and use our Python programming skills to analyze these datasets and inform future app development decisions.


In [11]:
android_clean = []  # This will store the cleaned data
already_added = []  # This will store the names of apps already added

for app in googleplay_data[1:]:  # Exclude the header
    name = app[0]
    n_reviews = float(app[3])

    # Keep only the row with the maximum number of reviews for each app
    if (n_reviews == reviews_max[name]) and (name not in already_added):
        android_clean.append(app)
        already_added.append(name)

# Check the number of rows in the cleaned dataset (should be 9,659)
print(f"Number of rows in the cleaned dataset: {len(android_clean)}")


ValueError: could not convert string to float: '3.0M'

In [5]:
import csv

# Open and read the uploaded Google Play dataset
with open('googleplaystore.csv', encoding='utf8') as file:
    googleplay_data = list(csv.reader(file))

# Open and read the uploaded Google Play User Reviews dataset
with open('googleplaystore_user_reviews.csv', encoding='utf8') as file:
    user_reviews_data = list(csv.reader(file))

# Explore the first few rows of both datasets to ensure they load correctly
print("Google Play Data (First 5 rows):")
for row in googleplay_data[:5]:
    print(row)

print("\nGoogle Play User Reviews Data (First 5 rows):")
for row in user_reviews_data[:5]:
    print(row)


Google Play Data (First 5 rows):
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']
['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']
['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']
['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']

Google Play User Reviews Data (First 5 rows):
['App', 'Translated_Review', 'Sentiment',

In [6]:
import os

# List all files in the current directory to check for the correct filenames
print(os.listdir())


['googleplaystore_user_reviews.csv', 'AppleStore.csv', '.ipynb_checkpoints', 'Basics.ipynb', 'googleplaystore.csv']


In [7]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n')  # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))


In [8]:
# Open and read the Apple Store dataset
with open('AppleStore.csv', encoding='utf8') as file:
    applestore_data = list(csv.reader(file))

# Explore the first few rows of each dataset

print("Google Play Data (First 5 rows):")
explore_data(googleplay_data, 0, 5, True)

print("\nGoogle Play User Reviews Data (First 5 rows):")
explore_data(user_reviews_data, 0, 5, True)

print("\nApple Store Data (First 5 rows):")
explore_data(applestore_data, 0, 5, True)


Google Play Data (First 5 rows):
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 10842
Number of columns: 13

Google Play User Reviews Data (Fi

In [9]:
# Print column names for Google Play dataset
print("Google Play Column Names:")
print(googleplay_data[0])

# Print column names for Google Play User Reviews dataset
print("Google Play User Reviews Column Names:")
print(user_reviews_data[0])

# Print column names for Apple Store dataset
print("Apple Store Column Names:")
print(applestore_data[0])


Google Play Column Names:
['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
Google Play User Reviews Column Names:
['App', 'Translated_Review', 'Sentiment', 'Sentiment_Polarity', 'Sentiment_Subjectivity']
Apple Store Column Names:
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


In [10]:
# Print the row at index 10472 in the Google Play dataset
print(googleplay_data[10472])


['Xposed Wi-Fi-Pwd', 'PERSONALIZATION', '3.5', '1042', '404k', '100,000+', 'Free', '0', 'Everyone', 'Personalization', 'August 5, 2014', '3.0.0', '4.0.3 and up']


In [12]:
# Create two lists: one for duplicate apps and one for unique apps
duplicate_apps = []
unique_apps = []

# Loop through the Google Play dataset (excluding the header)
for app in googleplay_data[1:]:
    name = app[0]  # App name is the first element
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)

# Print the number of duplicate apps
print('Number of duplicate apps:', len(duplicate_apps))

# Print a few duplicate app names to confirm
print('Examples of duplicate apps:', duplicate_apps[:10])

Number of duplicate apps: 1181
Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


In [1]:
# Remove the row at index 10472
del googleplay_data[10472]


NameError: name 'googleplay_data' is not defined

In [2]:
# Remove the row at index 10472
del googleplay_data[10472]


NameError: name 'googleplay_data' is not defined

In [4]:
import csv

# Open and read the Google Play dataset
with open('googleplaystore.csv', encoding='utf8') as file:
    googleplay_data = list(csv.reader(file))

    

In [5]:
# Remove the row at index 10472
del googleplay_data[10472]


In [6]:
# Print the row at index 10472 to confirm it has been removed
print(googleplay_data[10471:10473])


[['Jazz Wi-Fi', 'COMMUNICATION', '3.4', '49', '4.0M', '10,000+', 'Free', '0', 'Everyone', 'Communication', 'February 10, 2017', '0.1', '2.3 and up'], ['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']]


In [7]:
# Create two lists: one for duplicate apps and one for unique apps
duplicate_apps = []
unique_apps = []

# Loop through the Google Play dataset (excluding the header)
for app in googleplay_data[1:]:  # Start from index 1 to skip the header
    name = app[0]  # App name is the first element in each row
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)

# Print the number of duplicate apps
print('Number of duplicate apps:', len(duplicate_apps))

# Print a few duplicate app names to confirm
print('Examples of duplicate apps:', duplicate_apps[:10])


Number of duplicate apps: 1181
Examples of duplicate apps: ['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings', 'Box', 'Zenefits', 'Google Ads', 'Google My Business', 'Slack']


In [8]:
# Create a dictionary to store the app with the highest number of reviews
reviews_max = {}

# Loop through the dataset to populate the dictionary
for app in googleplay_data[1:]:
    name = app[0]
    n_reviews = float(app[3])  # Number of reviews (convert to float)

    # If the app is already in the dictionary, keep the entry with the higher number of reviews
    if name in reviews_max:
        if reviews_max[name] < n_reviews:
            reviews_max[name] = n_reviews
    else:
        reviews_max[name] = n_reviews

# Create a new dataset with only unique apps
googleplay_clean = []
already_added = []

# Loop through the dataset again and keep only the rows with the highest number of reviews
for app in googleplay_data[1:]:
    name = app[0]
    n_reviews = float(app[3])
    
    # If this row has the highest number of reviews and the app hasn't been added yet, keep it
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        googleplay_clean.append(app)
        already_added.append(name)

# Print the number of rows in the cleaned dataset to verify
print('Expected rows after removing duplicates:', len(googleplay_clean))


ValueError: could not convert string to float: '3.0M'

In [9]:
# Function to clean and convert the 'Reviews' column to a float
def convert_reviews(reviews_str):
    if 'M' in reviews_str:
        return float(reviews_str.replace('M', '')) * 1_000_000  # Convert 'M' to millions
    elif 'k' in reviews_str:
        return float(reviews_str.replace('k', '')) * 1_000  # Convert 'k' to thousands
    else:
        return float(reviews_str)  # If no 'M' or 'k', convert directly to float

# Create a dictionary to store the app with the highest number of reviews
reviews_max = {}

# Loop through the dataset to populate the dictionary
for app in googleplay_data[1:]:
    name = app[0]
    try:
        n_reviews = convert_reviews(app[3])  # Clean and convert the 'Reviews' column
    except ValueError:
        continue  # Skip the row if there's an issue with conversion

    # If the app is already in the dictionary, keep the entry with the higher number of reviews
    if name in reviews_max:
        if reviews_max[name] < n_reviews:
            reviews_max[name] = n_reviews
    else:
        reviews_max[name] = n_reviews

# Create a new dataset with only unique apps
googleplay_clean = []
already_added = []

# Loop through the dataset again and keep only the rows with the highest number of reviews
for app in googleplay_data[1:]:
    name = app[0]
    n_reviews = convert_reviews(app[3])

    # If this row has the highest number of reviews and the app hasn't been added yet, keep it
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        googleplay_clean.append(app)
        already_added.append(name)

# Print the number of rows in the cleaned dataset to verify
print('Expected rows after removing duplicates:', len(googleplay_clean))



Expected rows after removing duplicates: 9659


In [10]:
# Function to check if the app name is in English
def is_english(string):
    count_non_english = 0
    for char in string:
        if ord(char) > 127:  # Non-ASCII characters have an ord value greater than 127
            count_non_english += 1
        if count_non_english > 3:  # Allowing up to 3 non-English characters (for things like emojis)
            return False
    return True

# Filter out non-English apps
googleplay_clean_english = [app for app in googleplay_clean if is_english(app[0])]

# Print the number of apps remaining after removing non-English apps
print('Number of English apps:', len(googleplay_clean_english))


Number of English apps: 9614


In [11]:
# Filter out non-free apps
googleplay_final = [app for app in googleplay_clean_english if app[6] == 'Free']

# Print the number of free apps remaining
print('Number of free apps:', len(googleplay_final))


Number of free apps: 8862


In [13]:
# Function to clean and convert the 'Reviews' column to a float
def convert_reviews(reviews_str):
    if 'M' in reviews_str:
        return float(reviews_str.replace('M', '')) * 1_000_000  # Convert 'M' to millions
    elif 'k' in reviews_str:
        return float(reviews_str.replace('k', '')) * 1_000  # Convert 'k' to thousands
    else:
        return float(reviews_str)  # If no 'M' or 'k', convert directly to float


In [14]:
# Create a dictionary to store the app with the highest number of reviews
reviews_max = {}

# Loop through the dataset (excluding the header)
for app in googleplay_data[1:]:
    name = app[0]  # App name is the first element
    try:
        n_reviews = convert_reviews(app[3])  # Clean and convert the 'Reviews' column
    except ValueError:
        continue  # Skip rows with invalid data

    # If the app is already in the dictionary, update it if the current entry has more reviews
    if name in reviews_max:
        if reviews_max[name] < n_reviews:
            reviews_max[name] = n_reviews
    else:
        reviews_max[name] = n_reviews

# Print the number of unique apps (should be 9,659)
print('Expected number of unique apps:', len(reviews_max))


Expected number of unique apps: 9659


In [15]:
# Create two empty lists: one for the cleaned dataset and one to track added app names
android_clean = []
already_added = []

# Loop through the dataset again to remove duplicates
for app in googleplay_data[1:]:
    name = app[0]
    n_reviews = convert_reviews(app[3])  # Convert the number of reviews

    # If the current entry matches the max reviews and hasn't been added yet, keep it
    if (reviews_max[name] == n_reviews) and (name not in already_added):
        android_clean.append(app)  # Add the app to the clean list
        already_added.append(name)  # Track added apps

# Print the number of rows in the cleaned dataset (should be 9,659)
print('Number of rows in the cleaned dataset:', len(android_clean))


Number of rows in the cleaned dataset: 9659


In [16]:
# Function to detect if a string is in English
def is_english(string):
    for char in string:
        if ord(char) > 127:  # If the character's ASCII value is greater than 127
            return False  # It's likely a non-English app name
    return True  # If all characters are within the ASCII range for English


In [17]:
# Test the function with the given app names
print(is_english('Instagram'))  # Expected output: True
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))  # Expected output: False
print(is_english('Docs To Go™ Free Office Suite'))  # Expected output: False (™ symbol)
print(is_english('Instachat 😜'))  # Expected output: False (emoji)


True
False
False
False


In [18]:
# Filter out non-English apps from the cleaned dataset
googleplay_english = [app for app in android_clean if is_english(app[0])]

# Print the number of remaining apps after removing non-English apps
print('Number of English apps:', len(googleplay_english))

Number of English apps: 9117


In [19]:
# Function to detect if a string is in English, allowing up to 3 non-ASCII characters
def is_english(string):
    count_non_english = 0
    for char in string:
        if ord(char) > 127:
            count_non_english += 1
        if count_non_english > 3:  # Allow up to 3 non-ASCII characters
            return False
    return True


In [20]:
# Test the updated function with app names
print(is_english('Docs To Go™ Free Office Suite'))  # Expected output: True
print(is_english('Instachat 😜'))  # Expected output: True
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))  # Expected output: False


True
True
False


In [21]:
# Filter out non-English apps from the cleaned Google Play dataset
googleplay_english = [app for app in android_clean if is_english(app[0])]

# Print the number of remaining apps after filtering non-English apps
print('Number of English apps:', len(googleplay_english))


Number of English apps: 9614


In [22]:
# Explore the first few rows of the filtered datasets
print(googleplay_english[:5])
print(applestore_english[:5])


[['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up'], ['Pixel Draw - Number Art Coloring Book', 'ART_AND_DESIGN', '4.3', '967', '2.8M', '100,000+', 'Free', '0', 'Everyone', 'Art & Design;Creativity', 'June 20, 2018', '1.1', '4.4 and up'], ['Paper flowers instructions', 'ART_AND_DESIGN', '4.4', '167', '5.6M', '50,000+', 'Free', '0', 'Everyone', 'Art & Design', 'March 26, 2017', '1.0', '2.3 and up']]


NameError: name 'applestore_english' is not defined

In [23]:
# Isolate free apps from the Google Play dataset
googleplay_free = [app for app in googleplay_english if app[7] == '0']

# Check how many free apps remain
print('Number of free apps (Google Play):', len(googleplay_free))


Number of free apps (Google Play): 8863


In [1]:
# Isolate free apps from the App Store dataset
applestore_free = [app for app in applestore_english if app[4] == '0.0']  # Assuming price is in column 4

# Check how many free apps remain
print('Number of free apps (App Store):', len(applestore_free))


NameError: name 'applestore_english' is not defined

In [2]:
import csv

# Open and read the Apple Store dataset (make sure to use the correct file path)
with open('AppleStore.csv', encoding='utf8') as file:
    applestore_data = list(csv.reader(file))

# Check the first few rows to confirm the data is loaded correctly
print(applestore_data[:5])


[['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'], ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1'], ['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1'], ['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1'], ['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']]


In [3]:
# Function to check if the app name is in English
def is_english(string):
    count_non_english = 0
    for char in string:
        if ord(char) > 127:
            count_non_english += 1
        if count_non_english > 3:  # Allow up to 3 non-English characters
            return False
    return True

# Filter out non-English apps from the App Store dataset
applestore_english = [app for app in applestore_data[1:] if is_english(app[1])]  # Assuming app name is in column 1

# Check the number of English apps
print('Number of English apps (App Store):', len(applestore_english))


Number of English apps (App Store): 6183


In [4]:
# Isolate free apps from the App Store dataset
applestore_free = [app for app in applestore_english if app[4] == '0.0']  # Assuming price is in column 4

# Check how many free apps remain
print('Number of free apps (App Store):', len(applestore_free))


Number of free apps (App Store): 3222


In [5]:
# Function to generate frequency tables
def freq_table(dataset, index):
    table = {}
    total = 0

    # Loop through the dataset and count occurrences
    for app in dataset:
        total += 1
        genre = app[index]
        if genre in table:
            table[genre] += 1
        else:
            table[genre] = 1

    # Convert counts to percentages
    table_percentages = {key: (value / total) * 100 for key, value in table.items()}
    return table_percentages


In [2]:
import csv

# Open and read the Google Play dataset
with open('googleplaystore.csv', encoding='utf8') as file:
    googleplay_data = list(csv.reader(file))

# Open and read the Google Play User Reviews dataset
with open('googleplaystore_user_reviews.csv', encoding='utf8') as file:
    user_reviews_data = list(csv.reader(file))

# Open and read the Apple Store dataset
with open('AppleStore.csv', encoding='utf8') as file:
    applestore_data = list(csv.reader(file))

# Confirm the data is loaded correctly by printing the first few rows of each dataset
print(googleplay_data[:5])
print(user_reviews_data[:5])
print(applestore_data[:5])

[['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'], ['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up'], ['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up'], ['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up'], ['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']]
[['App', 'Translated_Review', 'Sentiment', 'Sentiment_Polarity', 'Sentiment_Subjectivity'], ['10 Best Foods for You

In [3]:
# Filter out non-free apps (assuming the price column is index 7)
googleplay_free = [app for app in googleplay_data[1:] if app[7] == '0']

# Check how many free apps remain
print('Number of free apps (Google Play):', len(googleplay_free))

Number of free apps (Google Play): 10040


In [4]:
# Filter out non-free apps (assuming the price column is index 4)
applestore_free = [app for app in applestore_data[1:] if app[4] == '0.0']

# Check how many free apps remain
print('Number of free apps (Apple Store):', len(applestore_free))

Number of free apps (Apple Store): 4056


In [5]:
# Function to generate frequency tables
def freq_table(dataset, index):
    table = {}
    total = 0

    # Loop through the dataset and count occurrences
    for app in dataset:
        total += 1
        genre = app[index]
        if genre in table:
            table[genre] += 1
        else:
            table[genre] = 1

    # Convert counts to percentages
    table_percentages = {key: (value / total) * 100 for key, value in table.items()}
    return table_percentages

# Function to display the frequency table in descending order
def display_table(dataset, index):
    table = freq_table(dataset, index)  # Generate the frequency table
    table_display = []
    
    # Transform into list of tuples (value, key) for sorting
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)
    
    # Sort the list in descending order by percentage
    table_sorted = sorted(table_display, reverse=True)
    
    # Display the sorted table
    for entry in table_sorted:
        print(entry[1], ":", entry[0])

# Display frequency tables for prime_genre in the Apple Store dataset
print("App Store - Prime Genre Frequency Table:")
display_table(applestore_free, 11)  # Assuming prime_genre is at index 11

# Display frequency tables for Genres and Category in the Google Play dataset
print("\nGoogle Play - Genres Frequency Table:")
display_table(googleplay_free, 9)  # Assuming Genres is at index 9

print("\nGoogle Play - Category Frequency Table:")
display_table(googleplay_free, 1)  # Assuming Category is at index 1

App Store - Prime Genre Frequency Table:
Games : 55.64595660749507
Entertainment : 8.234714003944774
Photo & Video : 4.117357001972387
Social Networking : 3.5256410256410255
Education : 3.2544378698224854
Shopping : 2.983234714003945
Utilities : 2.687376725838264
Lifestyle : 2.3175542406311638
Finance : 2.0710059171597637
Sports : 1.947731755424063
Health & Fitness : 1.8737672583826428
Music : 1.6518737672583828
Book : 1.6272189349112427
Productivity : 1.5285996055226825
News : 1.4299802761341223
Travel : 1.3806706114398422
Food & Drink : 1.0601577909270217
Weather : 0.7642998027613412
Reference : 0.4930966469428008
Navigation : 0.4930966469428008
Business : 0.4930966469428008
Catalogs : 0.22189349112426035
Medical : 0.19723865877712032

Google Play - Genres Frequency Table:
Tools : 7.609561752988048
Entertainment : 6.01593625498008
Education : 5.169322709163347
Business : 4.442231075697211
Productivity : 3.944223107569721
Sports : 3.7250996015936253
Lifestyle : 3.6055776892430282
Comm

In [6]:
# Generate a frequency table for the prime_genre column to get unique genres
genres_frequency = freq_table(applestore_data, 11)  # prime_genre column is at index 11

# Loop through each unique genre
for genre in genres_frequency:
    total = 0  # This variable will store the sum of user ratings
    len_genre = 0  # This will store the number of apps in that genre
    
    # Loop through the App Store dataset
    for app in applestore_data[1:]:  # Skipping the header row
        genre_app = app[11]  # prime_genre column is at index 11
        
        if genre_app == genre:
            n_ratings = float(app[5])  # rating_count_tot column is at index 5
            total += n_ratings
            len_genre += 1
    
    # Calculate the average number of user ratings per genre
    if len_genre > 0:  # Avoid division by zero
        avg_ratings = total / len_genre
    else:
        avg_ratings = 0
    
    # Print the genre and its average user rating
    print(f"{genre} : {avg_ratings:.2f}")

prime_genre : 0.00
Social Networking : 45498.90
Photo & Video : 14352.28
Games : 13692.00
Music : 28842.02
Reference : 22410.84
Health & Fitness : 9913.17
Weather : 22181.03
Utilities : 6863.82
Travel : 14129.44
Shopping : 18615.33
News : 13015.07
Navigation : 11853.96
Lifestyle : 6161.76
Entertainment : 7533.68
Food & Drink : 13938.62
Sports : 14026.93
Book : 5125.44
Finance : 11047.65
Education : 2239.23
Productivity : 8051.33
Business : 4788.09
Catalogs : 1732.50
Medical : 592.78


In [7]:
# Function to generate frequency table for the Category column
genres_frequency = freq_table(googleplay_data, 1)  # Category column is at index 1

# Loop through each unique genre in the Category column
for category in genres_frequency:
    total = 0  # Variable to store the sum of installs
    len_category = 0  # Variable to store the number of apps in this category
    
    # Loop through the Google Play dataset
    for app in googleplay_data[1:]:  # Skip the header row
        category_app = app[1]  # Category is at index 1
        
        if category_app == category:
            # Clean the 'Installs' column by removing commas and the '+' sign
            installs = app[5].replace(',', '').replace('+', '')  # Installs is at index 5
            installs = float(installs)  # Convert the cleaned string to float
            
            total += installs  # Add the number of installs to total
            len_category += 1  # Increment the app count for this category
    
    # Calculate the average installs per category
    if len_category > 0:  # Avoid division by zero
        avg_installs = total / len_category
    else:
        avg_installs = 0
    
    # Print the category and its average installs
    print(f"{category} : {avg_installs:.2f}")


Category : 0.00
ART_AND_DESIGN : 1912893.85
AUTO_AND_VEHICLES : 625061.31
BEAUTY : 513151.89
BOOKS_AND_REFERENCE : 8318050.11
BUSINESS : 2178075.79
COMICS : 934769.17
COMMUNICATION : 84359886.95
DATING : 1129533.36
EDUCATION : 5586230.77
ENTERTAINMENT : 19256107.38
EVENTS : 249580.64
FINANCE : 2395215.12
FOOD_AND_DRINK : 2156683.08
HEALTH_AND_FITNESS : 4642441.38
HOUSE_AND_HOME : 1917187.06
LIBRARIES_AND_DEMO : 741128.35
LIFESTYLE : 1407443.82
GAME : 30669601.76
FAMILY : 5201959.18
MEDICAL : 115026.86
SOCIAL : 47694467.46
SHOPPING : 12491726.10
PHOTOGRAPHY : 30114172.10
SPORTS : 4560350.26
TRAVEL_AND_LOCAL : 26623593.59
TOOLS : 13585731.81
PERSONALIZATION : 5932384.65
PRODUCTIVITY : 33434177.76
PARENTING : 525351.83
WEATHER : 5196347.80
VIDEO_PLAYERS : 35554301.26
NEWS_AND_MAGAZINES : 26488755.34
MAPS_AND_NAVIGATION : 5286729.12


ValueError: could not convert string to float: 'Free'