# Problem Set #1
## Name: Sahithi Adari
### Date: 02/07/2021

In [63]:
import csv
import json
from urllib.parse import urljoin # For resolving scraped links into full URLs

import numpy as np
import pandas as pd
import requests # For downloading the website
from bs4 import BeautifulSoup # For parsing the website

## Part 1 

In [64]:
# REVIEW the website to be scraped
# Project Gutenberg landing page; this URL is downloaded and parsed in the cells that follow.
webpage = 'http://www.gutenberg.org/'

In [65]:
# ACCESS
# Downloading the page. A timeout is passed because requests otherwise waits
# indefinitely on an unresponsive server, which would hang the notebook.
server_response = requests.get(webpage, timeout=30)

In [66]:
# PARSE/Parsing the content
soup = BeautifulSoup(server_response.content, 'html.parser')

# Previewing the raw code of the downloaded website.
# Only the beginning of the prettified document is printed so that the
# notebook output is not flooded with the source of the entire page.
PREVIEW_CHARS = 2000
print(soup.prettify()[:PREVIEW_CHARS])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Free eBooks | Project Gutenberg
  </title>
  <link href="/gutenberg/style.css?v=1.1" rel="stylesheet"/>
  <link href="/gutenberg/collapsible.css?1.1" rel="stylesheet"/>
  <link href="/gutenberg/new_nav.css?v=1.321231" rel="stylesheet"/>
  <link href="/gutenberg/pg-desktop-one.css" rel="stylesheet"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="books, ebooks, free, kindle, android, iphone, ipad" name="keywords">
   <meta content="wucOEvSnj5kP3Ts_36OfP64laakK-1mVTg-ptrGC9io" name="google-site-verification"/>
   <meta content="4WNaCljsE-A82vP_ih2H_UqXZvM" name="alexaVerifyID"/>
   <link href="https://www.gnu.org/copyleft/fdl.html" rel="copyright">
    <link href="/gutenberg/favicon.ico?v=1.1" rel="shortcut icon">
     <meta content="Project Gutenberg" property="og:title"/>
     <meta content="website" property="og:type"/>
     <meta cont

### Question 1

If we take a look at the raw code, we can see that there are three different HTML tags right at the top of the output: `<head>`, `<title>`, and `<link>`. The `<head>` tag is used as a container for the metadata of the webpage. Since the metadata of the page refers to the HTML document itself, it is not displayed on the website. The `<title>` tag defines the title of the document and can usually be seen at the top of each new website/browser tab. Lastly, the `<link>` tag defines a relationship between the current HTML document and, usually, an external stylesheet. Because all of the `<link>` tags are within the `<head>` tag, these stylesheets are referenced from the metadata of the website and are not necessarily viewable to a user.

Definitions for the various HTML tags were liberally copied and modified from the [W3Schools website](https://www.w3schools.com/tags/).

### Question 2

In [67]:
# ACCESS
# Displaying the HTTP response code of the request that was already made above.
# No new request is sent here; the cell only reads the stored response.
server_response.status_code

200

The HTTP response code is `200`.

### Question 3

In [68]:
# PARSE/Collecting every "h2" element found on the webpage into a plain list (version 1)
list(soup.find_all('h2'))

[<h2 class="subtitle">Project Gutenberg is a library of over 60,000 free eBooks</h2>,
 <h2 id="find-free-ebooks">Find Free eBooks</h2>,
 <h2 id="get-help">Get Help</h2>,
 <h2 id="how-to-help">How to Help</h2>,
 <h2 id="special-areas">Special Areas</h2>,
 <h2 id="terms-of-use">Terms of Use</h2>,
 <h2 id="social-media">Social Media</h2>]

In [69]:
# PARSE/Finding all the instances of the "h2" tag on the webpage (version 2)
# The result of find_all is displayed directly, without wrapping it in a comprehension.
soup.find_all('h2')

[<h2 class="subtitle">Project Gutenberg is a library of over 60,000 free eBooks</h2>,
 <h2 id="find-free-ebooks">Find Free eBooks</h2>,
 <h2 id="get-help">Get Help</h2>,
 <h2 id="how-to-help">How to Help</h2>,
 <h2 id="special-areas">Special Areas</h2>,
 <h2 id="terms-of-use">Terms of Use</h2>,
 <h2 id="social-media">Social Media</h2>]

### Question 4

In [70]:
# PARSE/Collecting the text of every "li" tag on the webpage, in document order
li_list = [item.text for item in soup.find_all('li')]

In [71]:
# Slicing li_list down to the entries that are shown as bullet points on the website.
# NOTE(review): the 25:44 bounds are hard-coded positions in the scraped list;
# presumably they only match the page layout at scrape time. Verify after re-running.
li_list[25:44]

['Search and browse. By author, title, subject, language, type, popularity, and more.',
 'Bookshelves of related eBooks.',
 'Frequently downloaded: Top 100, or ranked by popularity.',
 'Offline catalogs: handy eBook listings and metadata to consult offline.',
 'Recently added. The latest new and updated eBooks.',
 'Visit self.gutenberg.org for free eBooks by contemporary authors.',
 'Frequently Asked Questions about Project Gutenberg.',
 'Help, How-To and FAQs: In depth information about many topics.',
 'Tablets, phones and eReaders How-To: Using tablets, Kindle, Nook, cell phone, and other mobile devices and readers.',
 'Distributed Proofreaders welcomes new volunteers. This is the main source of new Project Gutenberg eBooks. Getting started is easy, and just a page a day will help!',
 'Fix and improve Project Gutenberg by reporting errors, bugs, typos, and suggesting changes.',
 'Record audiobooks with our affiliate, LibriVox.',
 'About Project Gutenberg.',
 'Donating to Project Gute

## Part 2 

### Question 1 

In [72]:
# REVIEW
# ============================
# Base URL of the iTunes search endpoint; the query string is appended to it in the next cell.
host = 'https://itunes.apple.com/search'

In [73]:
# ACCESS
# ============================
# Assembling the full URL for the API GET request.
# Query string: term is the artist being searched for, limit=200 asks for up to
# 200 results, and media=music restricts the results to music.
params = "?term=EPIK+HIGH&limit=200&media=music"
url = f"{host}{params}"

In [74]:
# Sending the GET request and checking the HTTP response code.
# The timeout keeps the cell from hanging forever if the API does not answer.
response = requests.get(url, timeout=30)
response.status_code

200

In [75]:
# PARSE
# ============================
# Decoding the JSON body of the API response into a Python dictionary
music = response.json()

In [76]:
# TRANSFORM
# ============================
# Keeping the list of per-track records for the steps that follow
data = music["results"]

# Displaying the result in an easy-to-read format.
# Only the reported result count and the first record are printed; dumping
# every record would flood the notebook output without adding information.
preview = {"resultCount": music.get("resultCount"), "results": data[:1]}
print(json.dumps(preview, indent=4, sort_keys=True))

{
    "resultCount": 200,
    "results": [
        {
            "artistId": 139334133,
            "artistName": "Epik High",
            "artistViewUrl": "https://music.apple.com/us/artist/epik-high/139334133?uo=4",
            "artworkUrl100": "https://is5-ssl.mzstatic.com/image/thumb/Music118/v4/82/48/4d/82484d36-358d-9e5a-f5b9-58004a4c895e/source/100x100bb.jpg",
            "artworkUrl30": "https://is5-ssl.mzstatic.com/image/thumb/Music118/v4/82/48/4d/82484d36-358d-9e5a-f5b9-58004a4c895e/source/30x30bb.jpg",
            "artworkUrl60": "https://is5-ssl.mzstatic.com/image/thumb/Music118/v4/82/48/4d/82484d36-358d-9e5a-f5b9-58004a4c895e/source/60x60bb.jpg",
            "collectionCensoredName": "SHOEBOX",
            "collectionExplicitness": "explicit",
            "collectionId": 1316881090,
            "collectionName": "SHOEBOX",
            "collectionPrice": 11.99,
            "collectionViewUrl": "https://music.apple.com/us/album/born-hater-feat-beenzino-verbal-jint-b-i-mino-b

In [77]:
# STORE
# ============================
# Building a dataframe from the list of track records (one row per record)
EH = pd.DataFrame(data)

### Question 2 

* *collectionId*
  * As *collectionId* appears to be a numerical ID for an album, I would guess that this is a qualitative, discrete, and nominal attribute. It isn't clear how the IDs are assigned, and when comparing the IDs of their most recent albums there doesn't appear to be an overarching pattern. Based on this very limited dataset, I feel most comfortable with claiming it as a nominal attribute rather than an ordinal one.
* *trackPrice*
  * This is a quantitative, continuous, ratio attribute. Both a difference (+ or -) and a ratio ($\times$ or $\div$) between two *trackPrice* values are meaningful.
* *trackTimeMillis*
  * This is a quantitative, continuous, ratio attribute. Because the values take the form of XXX milliseconds rather than xx:xx:xx, we can still arrive at a meaningful value through either a difference or a ratio.
* *primaryGenreName*
  * This is a qualitative, discrete, and nominal attribute. The *primaryGenreName* attribute, in and of itself, only provides enough information to distinguish one song from another.
* *contentAdvisoryRating* 
  * This is a qualitative, discrete, nominal, binary attribute as there are only two options: Explicit or NaN.
* *isStreamable*
  * This is a qualitative, discrete, nominal, binary attribute as it only consists of two options: True or False.

### Question 3

A situation where identification numbers (IDs) would be useful is one where the ID is an ordinal, interval, or ratio attribute. That is to say, if an ID took on a meaningful value in comparison to another ID, similar to the Dewey Decimal System, then we could make a prediction based on the ID itself.

### Question 4

#### Part A 

In [78]:
# Flagging every missing cell, then counting the flags per variable
missing_mask = EH.isna()
missing_mask.sum()

wrapperType                  0
kind                         0
artistId                     0
collectionId                 0
trackId                      0
artistName                   0
collectionName               0
trackName                    0
collectionCensoredName       0
trackCensoredName            0
artistViewUrl                0
collectionViewUrl            0
trackViewUrl                 0
previewUrl                   0
artworkUrl30                 0
artworkUrl60                 0
artworkUrl100                0
collectionPrice              0
trackPrice                   0
releaseDate                  0
collectionExplicitness       0
trackExplicitness            0
discCount                    0
discNumber                   0
trackCount                   0
trackNumber                  0
trackTimeMillis              0
country                      0
currency                     0
primaryGenreName             0
contentAdvisoryRating      179
isStreamable                 0
collecti

#### Part B

In [79]:
# Pulling out the trackPrice column and taking its average
track_prices = EH['trackPrice']
track_prices.mean()

1.2149999999999996

#### Part C

In [80]:
# Counting how many songs fall under each trackExplicitness label
explicitness = EH["trackExplicitness"]
explicitness.value_counts()

notExplicit    179
explicit        21
Name: trackExplicitness, dtype: int64

### Question 5

In [81]:
# Creating a function
def play_time(dta):
    '''
    Convert song lengths given in milliseconds into (minutes, seconds) tuples.

    Args:
        dta (pandas column): durations in milliseconds; any iterable of
            numbers works, and the values are handled one at a time in order

    Returns:
        list: one (minutes, seconds) tuple of ints per input value, with both
            parts rounded down
    '''
    ms_per_minute = 60000
    ms_per_second = 1000

    def split_duration(duration_ms):
        # Whole minutes first, then the whole seconds of whatever is left over
        whole_minutes = int(np.floor(duration_ms / ms_per_minute))
        leftover_ms = duration_ms - (whole_minutes * ms_per_minute)
        whole_seconds = int(np.floor(leftover_ms / ms_per_second))
        return (whole_minutes, whole_seconds)

    return [split_duration(duration_ms) for duration_ms in dta]

### Question 6

In [82]:
# Converting the durations of the first 5 songs into playtime tuples
first_five_durations = EH['trackTimeMillis'].head(5)
play_time(first_five_durations)

[(5, 27), (3, 33), (3, 43), (4, 12), (3, 50)]

## Bonus

In [83]:
# Reviewing the website
# NOTE: this cell rebinds webpage, server_response and soup, which Part 1 also
# uses; re-run the Part 1 cells before revisiting the Part 1 results.
webpage = 'http://jse.amstat.org/jse_data_archive.htm'

# Checking access (with a timeout so an unresponsive server cannot hang the cell)
server_response = requests.get(webpage, timeout=30)

# Parsing
soup = BeautifulSoup(server_response.text, features="html.parser")

In [84]:
# Creating a list of dictionaries that indicates if a file is missing or not
# The site root is defined once, under its own name, so that it is not rebuilt
# on every iteration and does not overwrite the iTunes `host` from Part 2.
JSE_BASE_URL = 'http://jse.amstat.org/'

links_list = []
for tag in soup.find_all('a', href=True): # href=True skips anchors that carry no link target
    link = tag['href']
    name = tag.text
    if name.endswith('txt'): # Only links whose visible text ends in "txt" are checked
        # urljoin resolves relative links against the site root and leaves
        # links that are already absolute untouched
        file_response = requests.get(urljoin(JSE_BASE_URL, link), timeout=30)
        # Any status other than 200 is recorded as a missing file
        missing = 'no' if file_response.status_code == 200 else 'yes'
        links_list.append({'link': link, 'name': name, 'missing a file?': missing})

In [85]:
# Storing list of links as csv file
# newline='' is how the csv module expects its output file to be opened;
# without it, extra blank rows can appear on platforms that translate line
# endings. The encoding is fixed so the file does not depend on the platform default.
with open('data_links.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['link', 'name', 'missing a file?']
    writer = csv.DictWriter(csvfile, fieldnames)
    writer.writeheader()
    writer.writerows(links_list)

# Reported once the file has been written and closed
print('Links added to file: ' + str(len(links_list)))

Links added to file: 209


In [86]:
# Reading the CSV back in and showing its first 5 rows
pd.read_csv('data_links.csv').head(5)

Unnamed: 0,link,name,missing a file?
0,v9n2/4cdata.txt,4cdata.txt,yes
1,v9n2/4c1data.txt,4c1data.txt,yes
2,v9n2/4c.txt,4c.txt,yes
3,datasets/93cars.dat.txt,93cars.dat.txt,no
4,datasets/93cars.txt,93cars.txt,no
