<a href="https://colab.research.google.com/github/prteek/data-science/blob/master/GettingData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting Data
*To write it, it took three months; to conceive it, three minutes; to collect the data in it, all my life. - F. Scott Fitzgerald*

In [0]:
# Colab-only setup: authorise and mount Google Drive so code and data files are reachable.
# When the notebook runs locally or in Binder this cell is not needed (ignore any error).

import os

project_folder = '/content/drive/My Drive/git_repos/data-science/'

# Everything below only happens when a '/content' directory exists
# (presumably only on Colab -- the google.colab import would fail elsewhere)
if os.path.isdir('/content'):
    from google.colab import drive
    drive.mount('/content/drive')

    # First run on this Drive: the project folder is missing, so create it
    if not os.path.isdir(project_folder):
        os.makedirs(project_folder)
        print("new project folder created")

    # Relative file names used later in the notebook resolve against this folder
    os.chdir(project_folder)

In [0]:
%%capture
# To suppress the output when calling another file
%run ./GradientDescent.ipynb
import csv
import re
from collections import Counter

### Reading Files
First step is to obtain a *file object* using open:
* 'r' means read-only
* 'w' is write -- will destroy the file if it already exists
* 'a' is append -- for adding to the end of the file
##### Remember to close the file

In [0]:
# Open a file object explicitly and close it by hand once we are done with it
file_for_reading = open("requirements.txt", "r")
file_for_reading.close()

# Always use a "with" block, which will automatically close the file at the end
with open("comma_delimited_stock_prices.txt", "r") as f:
    data = [line for line in f]

# At this point f has already been closed, so iterating over it raises ValueError.
# Catch exactly that error: a bare "except" would also hide unrelated failures
# (even KeyboardInterrupt) behind the same message.
try:
    data_again = [line for line in f]
except ValueError:
    print("ValueError: File is already closed")

# To read a whole text file, iterate over the lines of the file:
starts_with_slash = 0
with open("email_addresses.txt", "r") as f:
    for line in f:                     # look at each line in file
        if re.match("^/", line):       # use a regex to see if it starts with "/"
            starts_with_slash += 1     # if it does, add 1 to the count
# Note: f is closed at this point
print("number of lines starting with '/': ", starts_with_slash)

# every line read this way ends in a newline character and so we strip() it before doing anything with it

def get_domain(email_address):
    """Return the lower-cased text after the final "@" (the whole string if there is none)."""
    _, _, domain = email_address.lower().rpartition("@")
    return domain

# Tally how many addresses in the file belong to each e-mail domain.
# Counter is collections.Counter; it is imported explicitly rather than relying on
# a name leaked into the namespace by the %run of another notebook.
with open("email_addresses.txt", "r") as f:
    # Each line ends in a newline, so strip it; lines without an "@" are ignored
    list_of_domains = [get_domain(line.strip())
                       for line in f
                       if "@" in line]

# The file is only needed while building the list, so the counting happens after it is closed
domain_counts = Counter(list_of_domains)

print("domain counts: ", domain_counts)

### Delimited files


In [0]:
# Read a tab-delimited file without a header row: column 0 is the date, column 1 the
# symbol and column 2 the closing price (converted to float).
# newline="" is how the csv module asks for files to be opened, so that it can
# handle line endings (including newlines inside quoted fields) itself.
with open("tab_delimited_stock_prices.txt", newline="") as f:
    reader = csv.reader(f, delimiter='\t')
    date   = list()
    symbol = list()
    closing_price = list()
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line; indexing it would fail
            continue
        date.append(row[0])
        symbol.append(row[1])
        closing_price.append(float(row[2]))

print("dates:", date, "symbols:", symbol, "closing prices", closing_price)

# If the file has headers, either skip the header row with next(reader)
# or use csv.DictReader, which uses the header row as the keys of each row dict.

# newline="" is how the csv module asks for files to be opened, so that it can
# handle line endings itself.
with open("colon_delimited_stock_prices.txt", "r", newline="") as f:
    reader = csv.DictReader(f, delimiter=":")
    date   = list()
    symbol = list()
    closing_price = list()
    for row in reader:
        # Columns are looked up by header name instead of by position
        date.append(row["date"])
        symbol.append(row["symbol"])
        closing_price.append(float(row["closing_price"]))

print("dates:", date, "symbols:", symbol, "closing prices", closing_price)

# writing out data using csv.writer (suppressed to not change file frequently)

# today_prices = {"AAPL":90.91, "MSFT":41.68, "FB":64.5}
# with open("comma_delimited_stock_prices.txt","w") as f:
#     writer = csv.writer(f, delimiter=",")
#     for stock, price in today_prices.items():
#         writer.writerow([stock, price])
        


### Scraping the web
#### HTML and Parsing thereof

In [0]:
from bs4 import BeautifulSoup
import requests

# The requests library makes HTTP requests better than anything that's built into Python.
# To use BeautifulSoup we need to pass some HTML into the BeautifulSoup() function.
# That HTML is the body of the response returned by requests.get.

# Without a timeout requests waits indefinitely for an unresponsive server, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get("https://jupyter.org", timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html5lib")
# the html5lib parser is better than Python's built-in one at coping with badly formatted HTML

# We'll work with tag objects, which correspond to tags representing the structure of an HTML page
first_paragraph       = soup.p # or soup.find("p"); p is the tag for paragraph, None if the page has none
# Guard against a page without any <p> tag instead of failing with AttributeError
first_paragraph_text  = first_paragraph.text if first_paragraph is not None else ""
first_paragraph_words = first_paragraph_text.split()

# and we can extract a tag's attributes by treating it like a dict:
first_paragraph_id    = first_paragraph.get("id") if first_paragraph is not None else None # None if no ID

# get multiple tags at once:
all_paragraphs        = soup("p") # or just soup.find_all("p")
paragraphs_with_ids   = [p for p in all_paragraphs if p.get("id")]

# if we want tags with specific class:
important_paragraphs = soup("p", {"class":"Notebook"})