# Program to get the content of the page for Apps in iTunes  

# Load libraries...

In [1]:
import json
import requests
import yaml # to parse json file without unicode. Refer http://stackoverflow.com/questions/956867/how-to-get-string-objects-instead-of-unicode-ones-from-json-in-python
import numpy as np
import pymongo
import pprint

# Connect to MongoDB, create a DB and a Collection

In [None]:
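# Optional reset: uncomment to drop the 'apple' database (only works once `client` has been created in the connection cell below)
#client.drop_database('apple')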

In [2]:
from pymongo import MongoClient

# Open a client against the MongoDB server on localhost, port 27017.
client = MongoClient(host='localhost', port=27017)
# Database "apple" and, inside it, collection "apps".
db = client['apple']
collection = db['apps']

# List all the app Ids for which we want to fetch the content

In [3]:
# App ids to look up, kept as strings in a NumPy array (order is preserved)
app_id = np.array([
    '1044202386', '298844386', '542511686', '756904853', '317469184',
    '302920553', '331177714', '321327691', '284035177', '324684580',
    '512939461', '642821482', '403858572', '434613896', '389781154',
    '441813332', '1017492454', '407558537', '283646709', '445170859',
    '296581815', '505911015', '348177453', '333177961', '288429040',
    '284882215', '372513032', '310633997', '456034437', '588647136',
])

# Start with an empty list for collected JSON objects
json_list = list()

# Loop through the four steps below for each App Id
 1) Get the app's page content.
 
 2) Parse it into a JSON object.
 
 3) Remove some unwanted data (for example, URL fields).
 
 4) Insert the JSON object into MongoDB.

In [4]:
# Keys stripped from every record before it is stored (for example some URLs).
# Defined once, outside the loop, because it never changes between iterations.
key_to_remove = ('ipadScreenshotUrls','description','artistViewUrl','artworkUrl100','artworkUrl512',
                 'screenshotUrls','artworkUrl60','isGameCenterEnabled','isVppDeviceBasedLicensingEnabled',
                 'releaseNotes','supportedDevices','languageCodesISO2A','trackViewUrl','bundleId')

for ii in app_id:
    # Plain string form of the id, used for the request and for the stored 'Id' field
    app = str(ii)

    # 1) Fetch the lookup page. The timeout keeps one stalled request from
    #    hanging the cell; raise_for_status() surfaces HTTP errors immediately
    #    instead of failing later on a malformed body.
    resp = requests.get('https://itunes.apple.com/lookup', params={'id': app}, timeout=30)
    resp.raise_for_status()

    # 2) The response body is JSON, so decode it with the response's JSON decoder
    d = resp.json()

    # A lookup can come back with an empty "results" list; report it and move on
    # rather than failing on results[0].
    results = d.get("results") or []
    if not results:
        print('No results returned for app id %s; skipping' % app)
        continue
    record = results[0]

    # 3) Drop unnecessary content. pop(..., None) tolerates records that do not
    #    carry every one of these optional keys.
    for key in key_to_remove:
        record.pop(key, None)

    # The lookup id is not stored under 'Id' in the payload, so add it
    record['Id'] = app

    # 4) Store the record. Upserting on 'Id' means re-running this cell refreshes
    #    an existing document instead of inserting a duplicate.
    collection.replace_one({'Id': app}, record, upsert=True)

In [None]:
# Display json_list. It is initialised empty and, as far as this notebook shows,
# nothing in the fetch loop appends to it, so the output is an empty list.
# Disabled export of the collected items to a file, one item per line:
#output_file = open("C:/ADMS/Project/apps.json","w")
#output_file.writelines( "%s\n" % item for item in json_list)
#output_file.close()
json_list

# Let's explore...

In [8]:
# Number of documents in the collection.
# Collection.count() is deprecated since PyMongo 3.7 and removed in PyMongo 4.0;
# count_documents() with an empty filter counts every document.
collection.count_documents({})

30

In [9]:
# Aggregation: number of documents per primaryGenreName
genre_group_stage = {
    "$group": {
        "_id": "$primaryGenreName",
        "count": {"$sum": 1},
    }
}
pipe = [genre_group_stage]
list(collection.aggregate(pipe))

[{u'_id': u'Social Networking', u'count': 5},
 {u'_id': u'Games', u'count': 4},
 {u'_id': u'Business', u'count': 3},
 {u'_id': u'Food & Drink', u'count': 5},
 {u'_id': u'Sports', u'count': 4},
 {u'_id': u'Music', u'count': 4},
 {u'_id': u'Finance', u'count': 5}]

In [10]:
# Fetch a single stored document to inspect its fields
sample_doc = collection.find_one()
sample_doc

{u'Id': u'1044202386',
 u'_id': ObjectId('58430c2f4d992d1aec031ba5'),
 u'advisories': [],
 u'appletvScreenshotUrls': [],
 u'artistId': 344048912,
 u'artistName': u'Games2win',
 u'averageUserRating': 5.0,
 u'averageUserRatingForCurrentVersion': 4.5,
 u'contentAdvisoryRating': u'4+',
 u'currency': u'USD',
 u'currentVersionReleaseDate': u'2016-04-28T02:29:29Z',
 u'features': [u'iosUniversal'],
 u'fileSizeBytes': u'360538112',
 u'formattedPrice': u'Free',
 u'genreIds': [u'6014', u'6016', u'7009', u'7012'],
 u'genres': [u'Games', u'Entertainment', u'Family', u'Puzzle'],
 u'kind': u'software',
 u'minimumOsVersion': u'7.0',
 u'price': 0.0,
 u'primaryGenreId': 6014,
 u'primaryGenreName': u'Games',
 u'releaseDate': u'2015-12-03T08:14:08Z',
 u'sellerName': u'Games2win India',
 u'sellerUrl': u'http://www.games2win.com/itunes/index.asp',
 u'trackCensoredName': u'Miranda Sings vs Haters',
 u'trackContentRating': u'4+',
 u'trackId': 1044202386,
 u'trackName': u'Miranda Sings vs Haters',
 u'userRatin

In [None]:
# Projection: artist name and primary genre only, sorted by genre (ascending)
genre_projection = {"artistName": 1, "primaryGenreName": 1, "_id": 0}
by_genre = collection.find({}, genre_projection).sort("primaryGenreName", 1)
pprint.pprint(list(by_genre))

In [7]:
# Projection: artist name and average user rating only, sorted by rating (descending)
rating_projection = {"artistName": 1, "averageUserRating": 1, "_id": 0}
by_rating = collection.find({}, rating_projection).sort("averageUserRating", -1)
pprint.pprint(list(by_rating))

[{u'artistName': u'Games2win', u'averageUserRating': 5.0},
 {u'artistName': u'GrubHub.com', u'averageUserRating': 4.5},
 {u'artistName': u'Spotify Ltd.', u'averageUserRating': 4.5},
 {u'artistName': u'Kiloo', u'averageUserRating': 4.5},
 {u'artistName': u'Halfbrick Studios', u'averageUserRating': 4.5},
 {u'artistName': u'Saavn', u'averageUserRating': 4.5},
 {u'artistName': u'Capital One', u'averageUserRating': 4.5},
 {u'artistName': u'OpenTable, Inc.', u'averageUserRating': 4.5},
 {u'artistName': u'TangoMe, Inc.', u'averageUserRating': 4.5},
 {u'artistName': u'Quora, Inc.', u'averageUserRating': 4.5},
 {u'artistName': u'Cisco', u'averageUserRating': 4.0},
 {u'artistName': u'FIFA', u'averageUserRating': 4.0},
 {u'artistName': u'Starbucks Coffee Company', u'averageUserRating': 4.0},
 {u'artistName': u'Pandora Media, Inc.', u'averageUserRating': 4.0},
 {u'artistName': u'Rovio Entertainment Ltd', u'averageUserRating': 4.0},
 {u'artistName': u'PayPal, Inc.', u'averageUserRating': 4.0},
 {u'

In [None]:
# Query on a single key: documents whose Id equals "1017492454"
single_key_filter = {"Id": "1017492454"}
pprint.pprint(list(collection.find(single_key_filter)))

In [None]:
# Query on multiple keys: genre "Sports" together with an average rating of 4.0,
# returning only the artist name
sports_rated_four = {"primaryGenreName": "Sports", "averageUserRating": 4.0}
artist_only = {"artistName": 1, "_id": 0}
pprint.pprint(list(collection.find(sports_rated_four, artist_only)))

In [None]:
# Query with an inequality: average rating greater than or equal to 4.5,
# returning only the artist name
rating_at_least = {"averageUserRating": {"$gte": 4.5}}
name_projection = {"artistName": 1, "_id": 0}
pprint.pprint(list(collection.find(rating_at_least, name_projection)))

In [11]:
# Query with $or: genre "Sports", or an average rating of 4.5 (full documents returned)
sports_or_rated = {
    "$or": [
        {"primaryGenreName": "Sports"},
        {"averageUserRating": 4.5},
    ]
}
pprint.pprint(list(collection.find(sports_or_rated)))

[{u'Id': u'542511686',
  u'_id': ObjectId('58430c2f4d992d1aec031ba7'),
  u'advisories': [],
  u'appletvScreenshotUrls': [],
  u'artistId': 303544359,
  u'artistName': u'NBCUniversal Media, LLC',
  u'averageUserRating': 2.5,
  u'averageUserRatingForCurrentVersion': 2.5,
  u'contentAdvisoryRating': u'4+',
  u'currency': u'USD',
  u'currentVersionReleaseDate': u'2016-10-10T20:55:29Z',
  u'features': [u'iosUniversal'],
  u'fileSizeBytes': u'111559680',
  u'formattedPrice': u'Free',
  u'genreIds': [u'6004', u'6016'],
  u'genres': [u'Sports', u'Entertainment'],
  u'kind': u'software',
  u'minimumOsVersion': u'7.0',
  u'price': 0.0,
  u'primaryGenreId': 6004,
  u'primaryGenreName': u'Sports',
  u'releaseDate': u'2012-07-12T12:43:45Z',
  u'sellerName': u'NBCUniversal Media, LLC',
  u'trackCensoredName': u'NBC Sports',
  u'trackContentRating': u'4+',
  u'trackId': 542511686,
  u'trackName': u'NBC Sports',
  u'userRatingCount': 44950,
  u'userRatingCountForCurrentVersion': 917,
  u'version': u'4

In [None]:
# Artist name and userRatingCount for every document, largest count first
count_projection = {"artistName": 1, "userRatingCount": 1, "_id": 0}
by_rating_count = collection.find({}, count_projection).sort("userRatingCount", -1)
pprint.pprint(list(by_rating_count))

In [None]:
# App with the maximum userRatingCount: sort descending on that field and
# take the first document, keeping only the artist name
descending_by_count = [("userRatingCount", -1)]
artist_name_only = {"artistName": 1, "_id": 0}
collection.find_one({}, artist_name_only, sort=descending_by_count)