# App Profile Recommender

This is an mini-project based on *AppleStore* dataset from **Kaggle**

In [1]:
def open_data_set(file_name):
    file = open(file_name, encoding='utf8')
    from csv import reader
    file_data = reader(file)
    data = list(file_data)
    return data

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [5]:
apps_data = open_data_set('AppleStore.csv')
plays_data = open_data_set('googleplaystore.csv')

In [6]:
explore_data(apps_data,0,5)

['', 'id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['1', '281656475', 'PAC-MAN Premium', '100788224', 'USD', '3.99', '21292', '26', '4', '4.5', '6.3.5', '4+', 'Games', '38', '5', '10', '1']


['2', '281796108', 'Evernote - stay organized', '158578688', 'USD', '0', '161065', '26', '4', '3.5', '8.2.2', '4+', 'Productivity', '37', '5', '23', '1']


['3', '281940292', 'WeatherBug - Local Weather, Radar, Maps, Alerts', '100524032', 'USD', '0', '188583', '2822', '3.5', '4.5', '5.0.0', '4+', 'Weather', '37', '5', '3', '1']


['4', '282614216', 'eBay: Best App to Buy, Sell, Save! Online Shopping', '128512000', 'USD', '0', '262241', '649', '4', '4.5', '5.10.0', '12+', 'Shopping', '37', '5', '9', '1']




In [7]:
explore_data(plays_data,0,2)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']




In [10]:
explore_data(plays_data,10473,10474)

['osmino Wi-Fi: free WiFi', 'TOOLS', '4.2', '134203', '4.1M', '10,000,000+', 'Free', '0', 'Everyone', 'Tools', 'August 7, 2018', '6.06.14', '4.4 and up']




In [9]:
del plays_data[10473]

# Redundant Data

    
    
   Further analysis of the datasets gives us the overview that  multiple applications in the dataset have possible duplicate entries. Hence the duplicate entries must be removed. Instead of following a random procedure, we use the  __reviews__ column to determine the app with the most numbers of reviews, making sure we have the latest data for the app.

In [11]:
duplicate_apps = []
unique_apps=[]

In [12]:
for apps in plays_data:
    name = apps[0]
    if name in unique_apps:
        duplicate_apps.append(name)
    else:
        unique_apps.append(name)

In [13]:
print(len(duplicate_apps))

1181


In [14]:
print("Some duplicate apps are:",*duplicate_apps[0:4],sep = "\n")

Some duplicate apps are:
Quick PDF Scanner + OCR FREE
Box
Google My Business
ZOOM Cloud Meetings


# Removing the Duplicate Entries
   
   The duplicate entries of the apps in the duplicate_apps list can be removed use the below code:

In [15]:
reviews_max = {}
for apps in plays_data[1:]:
    name = apps[0]
    reviews_count = float(apps[3])
    if name in reviews_max and reviews_count > reviews_max[name]:
        reviews_max[name] = reviews_count
    else:
        reviews_max[name] = reviews_count
        

In [16]:
print(len(reviews_max))

9659


In [17]:
android_clean = []
already_added = []

In [18]:
for apps in plays_data[1:]:
    name = apps[0]
    reviews = float(apps[3])
    if (reviews == reviews_max[name]) and (name not in already_added):
        android_clean.append(apps)
        already_added.append(name)

In [19]:
print(len(android_clean))

9659


# Removing Non-English Apps
   The play store / app store will have multiple apps which use different languages other than english. Hence, we filter out those apps and keep the remaining to perform further analysis.

In [37]:
def english_or_not(words):
    flag = True
    count = 0
    for letters in words:
        if ord(letters) > 127:
            count += 1
        if count > 3:
            flag = False
            break
    return flag

In [38]:
print(english_or_not("Instagram"))

True


In [39]:
print(english_or_not("爱奇艺《欢乐颂2"))

False


In [40]:
print(english_or_not("Instagram😜"))

True


In [65]:
android_english = []
ios_english = []

In [66]:
for apps in android_clean:
    name = apps[0]
    if english_or_not(name):
        android_english.append(apps)

In [67]:
for apps in apps_data[1:]:
    name = apps[2]
    if english_or_not(name):
        ios_english.append(apps)

In [68]:
print(len(android_english))

9614


In [69]:
print(len(ios_english))

6183


# Isolating the Free Apps

In [70]:
android_free = []
ios_free = []

In [71]:
for apps in android_english:
    price = apps[7]
    if price == '0':
        android_free.append(apps)

In [75]:
for apps in ios_english:
    price = apps[5]
    if price == '0':
        ios_free.append(apps)

In [76]:
print(len(android_free))

8864


In [77]:
print(len(ios_free))

3222


# Filtering Most Common Apps by Genre

It is important for an organiztion that develops applications to understand the market. For this case,  it is important to identify the  no. of apps under each category, the application that attracts the most users under each category etc. 

In [78]:
def freq_table(dataset, index):
    table = {}
    total = 0
    for items in dataset:
        total += 1
        data = items[index]
        if data in table:
            table[data] += 1
        else:
            table[data] = 1
            
    table_percentages = {}
    for key in table:
        percentage = (table[key] / total) * 100
        table_percentages[key] = percentage 
    
    return table_percentages

In [79]:
print(freq_table(ios_free, -5))

{'Productivity': 1.7380509000620732, 'Weather': 0.8690254500310366, 'Shopping': 2.60707635009311, 'Reference': 0.5586592178770949, 'Finance': 1.1173184357541899, 'Music': 2.0484171322160147, 'Utilities': 2.5139664804469275, 'Travel': 1.2414649286157666, 'Social Networking': 3.2898820608317814, 'Sports': 2.1415270018621975, 'Health & Fitness': 2.0173805090006205, 'Games': 58.16263190564867, 'Food & Drink': 0.8069522036002483, 'News': 1.3345747982619491, 'Book': 0.4345127250155183, 'Photo & Video': 4.9658597144630665, 'Entertainment': 7.883302296710118, 'Business': 0.5276225946617008, 'Lifestyle': 1.5828677839851024, 'Education': 3.662321539416512, 'Navigation': 0.186219739292365, 'Medical': 0.186219739292365, 'Catalogs': 0.12414649286157665}


In [80]:
def display_data(dictionary):
    data = []
    for key, value in dictionary.items():
        data.append((value,key))
    sorted_data = sorted(data, reverse = True)
    for items in sorted_data:
        print(items[1],":",items[0])

In [81]:
display_data(freq_table(ios_free, -5))

Games : 58.16263190564867
Entertainment : 7.883302296710118
Photo & Video : 4.9658597144630665
Education : 3.662321539416512
Social Networking : 3.2898820608317814
Shopping : 2.60707635009311
Utilities : 2.5139664804469275
Sports : 2.1415270018621975
Music : 2.0484171322160147
Health & Fitness : 2.0173805090006205
Productivity : 1.7380509000620732
Lifestyle : 1.5828677839851024
News : 1.3345747982619491
Travel : 1.2414649286157666
Finance : 1.1173184357541899
Weather : 0.8690254500310366
Food & Drink : 0.8069522036002483
Reference : 0.5586592178770949
Business : 0.5276225946617008
Book : 0.4345127250155183
Navigation : 0.186219739292365
Medical : 0.186219739292365
Catalogs : 0.12414649286157665


In [82]:
display_data(freq_table(android_free, 1))

FAMILY : 19.223826714801444
GAME : 9.510379061371841
TOOLS : 8.461191335740072
BUSINESS : 4.580324909747293
LIFESTYLE : 3.9034296028880866
PRODUCTIVITY : 3.892148014440433
FINANCE : 3.7003610108303246
MEDICAL : 3.5424187725631766
SPORTS : 3.4183212996389893
PERSONALIZATION : 3.3167870036101084
COMMUNICATION : 3.2490974729241873
HEALTH_AND_FITNESS : 3.068592057761733
PHOTOGRAPHY : 2.944494584837545
NEWS_AND_MAGAZINES : 2.7978339350180503
SOCIAL : 2.6624548736462095
TRAVEL_AND_LOCAL : 2.33528880866426
SHOPPING : 2.2450361010830324
BOOKS_AND_REFERENCE : 2.1435018050541514
DATING : 1.861462093862816
VIDEO_PLAYERS : 1.782490974729242
MAPS_AND_NAVIGATION : 1.3989169675090252
FOOD_AND_DRINK : 1.2409747292418771
EDUCATION : 1.128158844765343
LIBRARIES_AND_DEMO : 0.9363718411552346
AUTO_AND_VEHICLES : 0.9250902527075812
ENTERTAINMENT : 0.8799638989169676
HOUSE_AND_HOME : 0.8235559566787004
WEATHER : 0.8009927797833934
EVENTS : 0.7107400722021661
PARENTING : 0.6543321299638989
ART_AND_DESIGN : 0

In [83]:
display_data(freq_table(android_free, -4))

Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.580324909747293
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.5424187725631766
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2490974729241873
Action : 3.1024368231046933
Health & Fitness : 3.068592057761733
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.861462093862816
Video Players & Editors : 1.782490974729242
Casual : 1.7486462093862816
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.925090252707581

## Calculating Average Number of Ratings per App in Each Genre

In [84]:
genres_ios = freq_table(ios_free, -5)

In [85]:
for genre in genres_ios:
    total_ratings = 0
    no_of_apps = 0
    for apps in ios_free:
        app_genre = apps[-5]
        if genre == app_genre:
            total_ratings += float(apps[6])
            no_of_apps += 1
    avg_ratings = total_ratings/no_of_apps
    print(genre,":",avg_ratings)

Productivity : 21028.410714285714
Weather : 52279.892857142855
Shopping : 26919.690476190477
Reference : 74942.11111111111
Finance : 31467.944444444445
Music : 57326.530303030304
Utilities : 18684.456790123455
Travel : 28243.8
Social Networking : 71548.34905660378
Sports : 23008.898550724636
Health & Fitness : 23298.015384615384
Games : 22788.6696905016
Food & Drink : 33333.92307692308
News : 21248.023255813954
Book : 39758.5
Photo & Video : 28441.54375
Entertainment : 14029.830708661417
Business : 7491.117647058823
Lifestyle : 16485.764705882353
Education : 7003.983050847458
Navigation : 86090.33333333333
Medical : 612.0
Catalogs : 4004.0


In [86]:
category_android = freq_table(android_free,1)

In [88]:
for category in category_android:
    total_installs = 0
    no_of_apps = 0
    for apps in android_free:
        app_category = apps[1]
        if category == app_category:
            installs = apps[5].replace("+","")
            installs = installs.replace(",","")
            total_installs += float(installs)
            no_of_apps += 1
    avg_ratings = total_installs/no_of_apps
    print(category,":",avg_ratings)

ART_AND_DESIGN : 1986335.0877192982
AUTO_AND_VEHICLES : 647317.8170731707
BEAUTY : 513151.88679245283
BOOKS_AND_REFERENCE : 8767811.894736841
BUSINESS : 1704192.3399014778
COMICS : 817657.2727272727
COMMUNICATION : 38326063.197916664
DATING : 854028.8303030303
EDUCATION : 1768500.0
ENTERTAINMENT : 9146923.076923076
EVENTS : 253542.22222222222
FINANCE : 1387692.475609756
FOOD_AND_DRINK : 1924897.7363636363
HEALTH_AND_FITNESS : 4167457.3602941176
HOUSE_AND_HOME : 1331540.5616438356
LIBRARIES_AND_DEMO : 638503.734939759
LIFESTYLE : 1437816.2687861272
GAME : 12914435.883748516
FAMILY : 5180161.789906103
MEDICAL : 123064.7898089172
SOCIAL : 23253652.127118643
SHOPPING : 7036877.311557789
PHOTOGRAPHY : 17840110.40229885
SPORTS : 4274688.722772277
TRAVEL_AND_LOCAL : 13984077.710144928
TOOLS : 10801391.298666667
PERSONALIZATION : 5201482.6122448975
PRODUCTIVITY : 16772838.591304347
PARENTING : 542603.6206896552
WEATHER : 5074486.197183099
VIDEO_PLAYERS : 24790074.17721519
NEWS_AND_MAGAZINES : 