# Info Crawler

This IPython module crawls the genre, genre IDs, rating count, rating average, rating percentage for each star rating, and explicit status of podcasts from the iTunes website. It reads the URLs of the pages to crawl from the CSV file named by `urls_path`.

In [6]:
import logging
import threading
import time
import concurrent.futures
import pandas as pd 
import requests
from pyquery import PyQuery as pq
import datetime
import sys
import traceback

# Input CSV holding the podcast URLs, and the CSV the crawled data is written to.
urls_path = 'popular_urls.csv'
output_name = r'./popular_info.csv'

# Work partitioning used by thread_function: worker k handles rows
# k * _range up to (k + 1) * _range, and no row at index max_len or above
# is visited.
_range = 83
max_len = 8285
thread_num = 102

# Result buffers, one slot per row index, written in place by the workers.
names = [None] * max_len
genres = [None] * max_len
rating_count = [None] * max_len
explicit = [None] * max_len
rating_average = [None] * max_len
# Five independent buffers, one per star-bar element read from a page.
rating_stars = [[None] * max_len for _ in range(5)]
# Row indices that the workers record for pages without a rating.
deleted_podcasts = []

df = pd.read_csv(urls_path)

### get_url_content

Downloads an iTunes webpage and returns its content.

In [7]:
def get_url_content( URL, timeout=None ):
    """Download the page at ``URL`` and return its raw body.

    The HTTP status code is not checked, so the body of an error page is
    returned like any other.

    Args:
        URL: Address of the page to fetch.
        timeout: Optional timeout in seconds (or a ``(connect, read)``
            tuple) passed to ``requests``. ``None``, the default, waits
            indefinitely, which matches the behaviour before this
            parameter existed.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.RequestException: If the request itself fails.
    """
    # A fresh session is created per call; the ``with`` block closes it so
    # its pooled connections are released instead of leaking.
    with requests.Session() as session:
        response = session.get(url=URL, timeout=timeout)
        return response.content

### extract_podcast_name

Extracts the podcast name from the provided HTML markup.

In [8]:
def extract_podcast_name(html):
    """Return the first ``aria-label`` value found on a ``<span>`` element.

    Args:
        html: Parsed document that can be called with a selector (the
            caller passes a PyQuery object). The selection it returns must
            support ``len()`` and ``.eq(i).attr(name)``.

    Returns:
        The ``aria-label`` of the first ``<span>``, in document order, that
        has one.

    Raises:
        IndexError: If no ``<span>`` carries an ``aria-label`` attribute.
    """
    # Run the selector once; re-evaluating it for every index repeats the
    # whole document search on each iteration.
    spans = html("span")
    for position in range(len(spans)):
        label = spans.eq(position).attr('aria-label')
        if label is not None:
            # The first labelled span wins, so stop scanning here.
            return label
    # Same exception type as indexing an empty list, so existing callers
    # that rely on it keep working.
    raise IndexError("no <span> element with an aria-label attribute was found")

### is_explicit

Check to see if podcast is explicit or not.

In [9]:
def is_explicit(html):
    """Report whether the page carries an explicit-content badge.

    Args:
        html: Parsed document that can be called with a selector.

    Returns:
        ``1`` if at least one element matches ``.icon-explicit-large``,
        otherwise ``0``.
    """
    badge_count = len(html(".icon-explicit-large"))
    return 1 if badge_count else 0

### Threads

To reduce the running time of the application, we distribute the work across multiple threads.

In [None]:
def thread_function(name):
    """Crawl one contiguous slice of rows into the module-level buffers.

    For every row in the slice the page at ``df['url'][i]`` is downloaded
    and parsed. Name, genre, explicit flag, rating count, rating average
    and the star-bar values are stored at index ``i`` of the matching
    buffers. A row whose processing raises is reported and skipped, so its
    buffer slots keep whatever was written before the failure.

    Args:
        name: Zero-based worker index. The worker handles rows
            ``name * _range`` up to, but not including,
            ``(name + 1) * _range``, capped at ``max_len``.
    """
    _from = name * _range
    _to = min((name + 1) * _range, max_len)

    print("from: " + str(_from) + " to: " + str(_to))

    for i in range(_from, _to):
        # Captured once so the error report below never has to index ``df``
        # again. A second failure inside the handler would end the thread.
        url = None
        try:
            # Progress line on every tenth row index.
            if i % 10 == 0:
                print(str(_from) + " --> " + str(i))
            url = df['url'][i]
            content = get_url_content(url)
            d = pq(content)
            names[i] = extract_podcast_name(d)
            genres[i] = d("li.inline-list__item--bulleted.inline-list__item--bulleted").text()
            explicit[i] = is_explicit(d)
            rating = d('div.we-customer-ratings')
            # Drops the last 8 characters of the paragraph text, presumably
            # a " Ratings" suffix.
            # NOTE(review): a shorter suffix such as " Rating" would also
            # lose a digit here. Confirm against a live page.
            rating_count[i] = rating('p').text()[:-8]
            rating_average[i] = rating('span.we-customer-ratings__averages__display').text()
            if rating_count[i] != '':
                # At most five bars are read, in page order.
                for k, j in zip(rating('div.we-star-bar-graph__bar__foreground-bar'), range(0, 5)):
                    # Strips 7 leading and 2 trailing characters from the
                    # style attribute, presumably "width: " and "%;".
                    # TODO confirm.
                    rating_stars[j][i] = k.attrib.get('style')[7:-2]
            else:
                print('\033[91m: Page: ' + url + ' don\'t have rating.\033[0m')
                # NOTE(review): despite its name, this list records pages
                # without a rating, not the failures handled below.
                deleted_podcasts.append(i)
                for j in range(0, 5):
                    rating_stars[j][i] = ''
        except Exception:
            # ``Exception`` rather than a bare ``except`` lets interpreter
            # exit signals propagate while still skipping any failing row.
            page = url if url is not None else 'row ' + str(i)
            print('\033[91m' + str(len(deleted_podcasts)) + ': Page: ' + str(page) + ' was deleted.\033[0m')
            do_something_with_exception()

def do_something_with_exception():
    """Print a one-line report of the exception currently being handled.

    The line contains the exception class name, its message and the name of
    the thread that is running this call. It must be called from inside an
    ``except`` block, because the exception is read from ``sys.exc_info()``.
    """
    exc_type, exc_value, _ = sys.exc_info()
    thread_name = threading.current_thread().name
    report = 'Handling %s exception with message "%s" in %s' % (
        exc_type.__name__,
        exc_value,
        thread_name,
    )
    print(report)
    
# Print the wall-clock time before the crawl starts.
print(datetime.datetime.now().time())

# Start thread_num worker threads; worker `index` runs thread_function(index).
threads = list()
for index in range(thread_num):
    x = threading.Thread(target=thread_function, args=(index,))
    threads.append(x)
    x.start()

# Wait for every worker, in start order, before continuing.
# NOTE(review): no logging configuration is visible in this file. With
# Python's default WARNING level these info messages are not displayed.
for index, thread in enumerate(threads):
    logging.info("Main    : before joining thread %d.", index)
    thread.join()
    logging.info("Main    : thread %d done", index)

# Print the wall-clock time after all workers have finished.
print(datetime.datetime.now().time())

### Insert data to table

Inserts the crawled info from each podcast's web page into the dataframe and writes it to the file named by `output_name`.

In [11]:
# Crawled columns in their final left-to-right order, placed in front of the
# columns already present in df. rating_stars[0] is labelled "5 Star" and
# rating_stars[4] "1 Star".
column_layout = (
    ("Name", names),
    ("Genre", genres),
    ("Rating Count", rating_count),
    ("Rating Average", rating_average),
    ("Rating 5 Star", rating_stars[0]),
    ("Rating 4 Star", rating_stars[1]),
    ("Rating 3 Star", rating_stars[2]),
    ("Rating 2 Star", rating_stars[3]),
    ("Rating 1 Star", rating_stars[4]),
    ("is explicit", explicit),
)
for position, (label, values) in enumerate(column_layout):
    # The trailing True is allow_duplicates, passed positionally.
    df.insert(position, label, values, True)

# Write the enriched table, with a header row and without the index column.
df.to_csv(output_name, index=False, header=True)