# Standard Library Review

Realizing the power of Python's standard library

---

In [None]:
# system libraries
import os
import sys

# check system information
print('Python Information', sys.version)
print('This is your current directory', os.getcwd())

In [None]:
# date and time handling
import datetime

# capture today's calendar date and the current local timestamp
currentDate = datetime.date.today()
currentTime = datetime.datetime.now()

# show the date in its default string form, then as month/day/year,
# followed by the clock time
print('Today is {}'.format(currentDate))
print('Today is', currentDate.strftime('%m/%d/%Y'))
print('The time is', currentTime.strftime('%H:%M:%S'))

In [None]:
import csv
import glob
import gzip
import heapq
import random
import re
import shutil
import sqlite3
import zipfile

import collections as clt

from collections import Counter
from pprint import pprint
from urllib import request
from urllib.parse import urlsplit

1. [Alternative Fuel Stations](#Alternative-Fuel-Stations)
1. [Loans](#Loans)
1. [Baby Names](#Baby-Names)
1. [Crappy Gifts](#Crappy-Gifts)
1. [Solar Power](#Solar-Power)
1. [USA](#USA)
1. [Census Estimates](#Census-Estimates)
1. [IMDB](#IMDB)
1. [Yahoo Finance](#Yahoo-Finance)
1. [Gutenburg](#Gutenburg)

## Alternative Fuel Stations

In [None]:
# Dataset location; the local file name is the last segment of the URL path.
URL = 'https://data.ny.gov/api/views/bpkx-gmh7/rows.csv?accessType=DOWNLOAD&sorting=true'
# urlsplit is taken from urllib.parse, where it is documented, instead of
# relying on the name happening to be importable from urllib.request.
FILE = urlsplit(URL).path.split("/")[-1]
print(URL)
print(FILE)

In [None]:
request.urlretrieve(URL, FILE)

### Using `open`

In [None]:
with open(FILE, "r") as f:
    header = f.readline()
    data = f.readlines()

In [None]:
header.split(",")

In [None]:
len(data)

In [None]:
tesla = [x for x in data if "tesla" in x.lower()]
len(tesla)

In [None]:
tesla[:5]

In [None]:
tesla[-5:]

In [None]:
with open(file="tesla_list.csv", mode="w") as f:
    f.write(header)
    f.writelines(tesla)

### Using `csv.DictReader`

In [None]:
# newline="" lets the csv module handle line endings itself (including
# newlines embedded in quoted fields); utf-8 matches the encoding used
# when the filtered rows are written back out with csv.DictWriter.
with open(file=FILE, mode="r", newline="", encoding="utf-8") as csvfile:
    data = list(csv.DictReader(csvfile))

In [None]:
len(data)

In [None]:
data[1]

In [None]:
tesla = [row for row in data if 'tesla' in row['Station Name'].lower()]

len(tesla)

In [None]:
header = list(tesla[0].keys())
header

In [None]:
with open('tesla_dict.csv', 'w', newline="", encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=header)
    writer.writeheader()
    writer.writerows(tesla)

In [None]:
source_file = "tesla_list.csv"
target_file = "tesla_dict.csv"


with open(source_file, mode='r') as f1:
    data1 = f1.readlines()[1:]

with open(target_file, mode='r') as f2:
    data2 = f2.readlines()[1:]

data1 == data2

In [None]:
set(data2).symmetric_difference(set(data1))

## Loans

In [None]:
def loan_func(present_value, number_periods, interest_rate):
    """Return the fixed payment per period for an amortizing loan.

    Args:
        present_value: Amount borrowed.
        number_periods: Number of monthly payments.
        interest_rate: Annual interest rate as a percentage (e.g. 3.19);
            it is divided by 100 and by 12 to get the monthly rate.

    Returns:
        The payment amount rounded to 2 decimal places.
    """
    monthly_rate = interest_rate / 100 / 12
    if monthly_rate == 0:
        # The annuity formula divides by zero at a 0% rate; the payment is
        # then just the principal spread evenly over the periods.
        return round(present_value / number_periods, 2)
    discount = 1 - (1 + monthly_rate) ** -number_periods
    return round(monthly_rate * present_value / discount, 2)

In [None]:
# Auto
print(loan_func(25515, 72, 3.19))

# Mortgage
print(loan_func(398153, 360, 4.5))

In [None]:
ir = 3.7

current_date = datetime.datetime.now().strftime("%m/%d/%Y")

auto = [
    {'Present Value':pv,
     'Number of Periods':np,
     'Interest Rate':ir,
     'Monthly Payment':loan_func(pv, np, ir),
     'Document Date': current_date,
    } for pv in range(22000, 30000, 1000) for np in range(12,84,12)
]

print(len(auto))

In [None]:
for row in random.choices(auto, k=4):
    pprint(row)

In [None]:
ir = 2.6

current_date = datetime.datetime.now().strftime("%m/%d/%Y")

home = [
    {'Present Value':pv,
     'Number of Periods':np,
     'Interest Rate':ir,
     'Monthly Payment':loan_func(pv, np, ir),
     'Document Date': current_date,
    } for pv in range(450000, 510000, 10000) for np in [180, 360]
]

print(len(home))

In [None]:
for row in random.choices(home, k=4):
    pprint(row)

In [None]:
 loans = auto + home

In [None]:
with open(file='loans.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = loans[0].keys())
    writer.writeheader()
    writer.writerows(loans)

In [None]:
del auto
del home
del loans

## Baby Names

In [None]:
# show the current working directory (nothing is changed here); files
# opened or downloaded with a relative name are resolved against it
print('This is your current directory', os.getcwd())

In [None]:
# Archive location; the local file name is the last segment of the URL path.
URL = 'https://www.ssa.gov/oact/babynames/state/namesbystate.zip'
# urlsplit is taken from urllib.parse, where it is documented, instead of
# relying on the name happening to be importable from urllib.request.
FILE = urlsplit(URL).path.split("/")[-1]
print(URL)
print(FILE)

In [None]:
request.urlretrieve(URL, FILE)

In [None]:
with zipfile.ZipFile(FILE, mode='r') as z:
    z.extractall(path='BABY_DATA')

In [None]:
files = glob.glob('BABY_DATA/*.TXT')
pprint(files, compact=True, width=80)
print(len(files))

In [None]:
# Read every extracted file into one flat list of comma-split rows.
# Each file is opened in a `with` block so its handle is closed as soon as
# its lines have been consumed, rather than being left open.
baby_names = []
for file in files:
    with open(file, 'r') as f:
        baby_names.extend(line.strip().split(',') for line in f)
print(f"Total records = {len(baby_names):,}")

In [None]:
baby_names[0:5]

In [None]:
Counter(row[1] for row in baby_names)

In [None]:
baby_names[0]

In [None]:
baby_names[0][3]

In [None]:
baby_names[0][3][0]

In [None]:
for row in baby_names:
    row.extend([row[3][0]])
    
baby_names[0:5]

In [None]:
Counter(row[5] for row in baby_names)

In [None]:
len(baby_names[0][3])

In [None]:
for row in baby_names:
    row.extend([len(row[3])])
    
baby_names[0:5]

In [None]:
Counter(row[-1] for row in baby_names)

In [None]:
names = [row[3] for row in baby_names if row[1] == 'F']

Counter(names).most_common(25)

In [None]:
idx = 3  # column holding the name
idy = 0  # column holding the state abbreviation
val = 4  # column holding the count

# every distinct name containing 'Greg', across all states
pivot_data = sorted({x[idx] for x in baby_names if 'Greg' in x[idx]})

# Accumulate the WI totals in a single pass over the data instead of
# rescanning the whole list once per name; `and` replaces the bitwise `&`
# so the conditions are evaluated as booleans and short-circuit.
pivot_totals = Counter()
for row in baby_names:
    if row[idy] == 'WI' and 'Greg' in row[idx]:
        pivot_totals[row[idx]] += int(row[val])

# names with no WI records still print, with a total of 0
for z in pivot_data:
    print("{} = {:,}".format(z, pivot_totals[z]))

In [None]:
del baby_names

In [None]:
header = ['state_abbr', 'gender', 'birth_year', 'birth_name', 'total']
row_dict_list = []

# Turn each line of each file into a dict keyed by `header`, then add two
# derived fields: the first letter of the name and the name's length.
for file in files:
    with open(file, mode='r') as f:
        for line in f:
            row_dict = dict(zip(header, line.strip().split(',')))
            birth_name = row_dict['birth_name']
            row_dict['initial'] = birth_name[0]
            row_dict['length'] = len(birth_name)
            row_dict_list.append(row_dict)

print("Number of records in collection:", len(row_dict_list))

In [None]:
for row in (random.choices(row_dict_list, k=6)):
    pprint(row)

In [None]:
with open(file='baby_names.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = row_dict_list[0].keys())
    writer.writeheader()
    writer.writerows(row_dict_list)

In [None]:
del row_dict_list

## Crappy Gifts

In [None]:
products = ['Blinking Robot', '27in televsion', 'Laptop', '8 x 10 Rug',
            '14pc Cutlery Set', 'Stuffed Alien - Grey', 'Mint Creme Cookies',
            'Kale Chips', 'Baseball Cap', 'Shoes', 'XL Hoodie']

employees = ['Hattie', 'Jes', 'Kira']

locations = ['NY', 'TX', 'CA', 'OH', 'MI', 'PR']

clients = ['ULTA', 'ALK', 'TM', 'BUD', 'CXO', 'ACN', 'MA', 'WHR']

In [None]:
start_date = datetime.date(2021,1,1)

transactions = [
    {'product':random.choice(products),
     'employee':e,
     'location':random.choice(locations),
     'client':random.choice(clients),
     'quantity':random.randrange(0,1000),
     'sales_rate':round(random.random(),2),
     'sales_date':(start_date + datetime.timedelta(d)).strftime(format="%Y-%m-%d"),
     'sales_total':0,
     'check':False,
    } for e in employees for d in range(0, 720)
]

for row in transactions:
    row['sales_total'] = round((row['quantity'] * row['sales_rate']),2)
    if row['quantity'] > 975:
        row['check'] = True            

print(len(transactions))

In [None]:
transactions[:3]

In [None]:
for row in random.choices(transactions, k=3):
    pprint(row)

In [None]:
transactions[-3:]

In [None]:
with open(file='crappy_gifts.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = transactions[0].keys())
    writer.writeheader()
    writer.writerows(transactions)

In [None]:
del transactions

## Solar Power

In [None]:
# Dataset location; the local file name is the last segment of the URL path.
# NOTE: this resolves to the same local name (rows.csv) as the Alternative
# Fuel Stations download, so downloading it overwrites that file.
URL = 'https://data.ny.gov/api/views/3x8r-34rs/rows.csv?accessType=DOWNLOAD&sorting=true'
# urlsplit is taken from urllib.parse, where it is documented, instead of
# relying on the name happening to be importable from urllib.request.
FILE = urlsplit(URL).path.split("/")[-1]
print(URL)
print(FILE)

In [None]:
request.urlretrieve(URL, FILE)

In [None]:
with open(FILE, newline="", encoding='utf-8') as f:
    solar_data = list(csv.reader(f, delimiter = ","))
    solar_header = solar_data.pop(0)

In [None]:
def nums(x):
    """Convert x to a float, falling back to 0 when it cannot be converted.

    Args:
        x: A number or a numeric string such as '5' or '5e3'.

    Returns:
        float(x), or 0 when x is non-numeric text or an object float()
        does not accept (for example None).
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None and other
        # objects float() rejects outright
        return 0

print(list(map(nums, [5,.5,0.5,'5',5e3,'5e3','five'])))

In [None]:
for i, x in enumerate(random.choice(solar_data)):
    print(i, solar_header[i], ">>>", x)

In [None]:
for e, z in enumerate(solar_header):
    data_sets = [x[e] for x in solar_data]
    print("Index # {} for {} has {:,} record(s)".format(e,z,len(set(data_sets))))

In [None]:
pprint(Counter(x[3] for x in solar_data))

In [None]:
pprint(Counter(x[9] for x in solar_data))

In [None]:
pprint(Counter(x[9] for x in solar_data if x[3] == 'Oswego'))

In [None]:
del solar_header
del solar_data

In [None]:
solar_json = []

# Two signed decimal numbers separated by one whitespace character,
# e.g. "-73.75 42.65". Raw string so the backslashes reach the regex engine.
coord_pattern = re.compile(r"([-+]?\d+\.\d+\s[-+]?\d+\.\d+)")

numeric_cols = ['Total Inverter Quantity', 'Total Nameplate kW DC', 'Total PV Module Quantity',
                'Expected KWh Annual Production', 'Project Cost', '$Incentive']

# Opened the same way as the csv.reader pass over this file
# (newline="" for the csv module, explicit utf-8).
with open(FILE, 'r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter=',')
    header = reader.fieldnames
    for row in reader:

        # keep completed projects only
        if row['Project Status'] != 'Complete':
            continue

        # split into a list
        row['Program Type'] = row['Program Type'].split("/")

        # Pull the coordinate pair out of the Georeference text. All three
        # values default to None so a row without coordinates never reuses
        # the previous row's values or hits an undefined name.
        addr = lat = lon = None
        found = coord_pattern.search(row['Georeference'] or '')
        if found:
            addr = found.group(1)
            first, second = addr.split()
            # float() instead of eval(): this text comes from a downloaded
            # file and must not be executed as code.
            lat, lon = float(first), float(second)
        # NOTE(review): the first number is stored as Latitude and the second
        # as Longitude, as before. If the source writes "POINT (x y)"
        # (longitude first), these two are swapped -- confirm against the data.
        row['Georeference'] = {'Address': addr, 'Latitude': lat, 'Longitude': lon}

        # convert numeric fields
        for col in numeric_cols:
            row[col] = nums(row[col])

        # rename '$Incentive' to 'Incentive' (the new key lands at the end)
        row['Incentive'] = row.pop('$Incentive')

        solar_json.append(row)

In [None]:
pprint(random.choice(solar_json))

In [None]:
with open(file='solar.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = solar_json[0].keys())
    writer.writeheader()
    writer.writerows(solar_json)

In [None]:
del solar_json

## USA

In [None]:
fips = (
    1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
    25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
    44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 72, 66, 78, 60, 69,
)

stname = (
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
    'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
    'Kentucky', 'Louisiana',  'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
    'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
    'Puerto Rico', 'Guam', 'U.S. Virgin Islands', 'American Samoa', 'Northern Mariana Islands',
)

stabbr = (
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI',
    'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI',
    'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC',
    'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT',
    'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'PR', 'GU', 'VI', 'AS', 'MP',
)

region = (
    'South', 'West', 'West', 'South', 'West', 'West', 'Northeast', 'South', 'South', 'South',
    'West', 'West', 'Midwest', 'Midwest', 'Midwest', 'Midwest', 'South', 'South', 'Northeast',
    'South', 'Northeast', 'Midwest', 'Midwest', 'South', 'Midwest', 'West', 'Midwest', 'West',
    'Northeast', 'Northeast', 'West', 'Northeast', 'South', 'Midwest', 'Midwest', 'South',
    'West', 'Northeast', 'Northeast', 'South', 'Midwest', 'South', 'South', 'West', 'Northeast',
    'South', 'West', 'South', 'Midwest', 'West', 'South', 'South', 'South', 'South', 'South',
)

division = (
    'East South Central', 'Pacific', 'Mountain', 'West South Central', 'Pacific',
    'Mountain', 'New England', 'South Atlantic', 'South Atlantic', 'South Atlantic',
    'Pacific', 'Mountain', 'East North Central', 'East North Central', 'West North Central',
    'West North Central', 'East South Central', 'West South Central', 'New England',
    'South Atlantic', 'New England', 'East North Central', 'West North Central', 
    'East South Central', 'West North Central', 'Mountain', 'West North Central', 'Mountain',
    'New England', 'Middle Atlantic', 'Mountain', 'Middle Atlantic', 'South Atlantic',
    'West North Central', 'East North Central', 'West South Central', 'Pacific',
    'Middle Atlantic', 'New England', 'South Atlantic', 'West North Central',
    'East South Central', 'West South Central', 'Mountain', 'New England',
    'South Atlantic', 'Pacific', 'South Atlantic', 'East North Central', 'Mountain',
    'South Atlantic', 'Pacific','South Atlantic', 'Pacific', 'Pacific',
)

seats = (
    7,1,9,4,53,7,5,1,27,14,2,2,18,9,4,4,6,6,2,8,9,14,
    8,4,8,1,3,4,2,12,3,27,13,1,16,5,5,18,2,7,1,9,36,4,
    1,11,10,3,8,1,0,0,0,0,0,
)

### Using `Collections`

In [None]:
columns = ('st_fips', 'st_name', 'st_abbr', 'st_regs', 'st_divs', 'st_seats')
states_data = []

for values in zip(fips, stname, stabbr, region, division, seats):
    states_data.append(values)
    
for row in states_data[0:5]:
    pprint(clt.OrderedDict(zip(columns, row)))

### Using `SQLite`

In [None]:
#cnx = sqlite3.connect("usa.db") # create a database file
cnx = sqlite3.connect(":memory:") # create in memory
cnx.isolation_level=None
cursor = cnx.cursor()

In [None]:
cursor.execute('DROP TABLE IF EXISTS states_tbl')

In [None]:
stmt = \
"""
CREATE table states_tbl (
    st_fips int,
    st_name varchar(40),
    st_abbr char(2),
    st_regs varchar(30),
    st_divs varchar(40),
    st_seats int
)
"""

cursor.execute(stmt)
cnx.commit()

In [None]:
fields = "?" * len(states_data[0])
fields = ','.join(fields)
fields

In [None]:
table = 'states_tbl'

# Build the column list explicitly. Formatting the `columns` tuple itself
# would place its Python repr (names wrapped in quotes) into the SQL text.
column_list = ", ".join(columns)
insert_stmt = "INSERT INTO {} ({}) VALUES ({})".format(table, column_list, fields)

# the row values themselves are bound through the ? placeholders
cursor.executemany(insert_stmt, states_data)
cnx.commit()

In [None]:

stmt = \
"""
SELECT *
FROM states_tbl
WHERE st_divs LIKE 'West%'
"""

results = cursor.execute(stmt)

for row in results:
    print(row)

In [None]:
stmt = \
"""
SELECT st_regs, st_divs, count(st_seats) totals
FROM states_tbl
GROUP BY st_regs, st_divs
ORDER BY 3 DESC
"""

results = cursor.execute(stmt)

for row in results:
    print(row)

In [None]:
cursor.close()
cnx.close()

In [None]:
del states_data

## Census Estimates

In [None]:
# Vintage year; it appears in both the directory and the file name of the URL.
year = 2020
URL = f"http://www2.census.gov/programs-surveys/popest/datasets/2010-{year}/national/totals/nst-est{year}-alldata.csv"

# The local file name is the last segment of the URL path. urlsplit is taken
# from urllib.parse, where it is documented, instead of relying on the name
# happening to be importable from urllib.request.
FILE = urlsplit(URL).path.split("/")[-1]
print(URL)
print(FILE)

In [None]:
request.urlretrieve(URL, FILE)

In [None]:
with open(FILE, newline="", encoding='latin-1') as f:
    estimate_data = list(csv.reader(f, delimiter = ","))
    estimate_header = estimate_data.pop(0)

In [None]:
print(len(estimate_data))

In [None]:
for i, x in enumerate(estimate_header):
    print(i, x)

In [None]:
pprint(dict(zip(estimate_header, random.choice(estimate_data))))

In [None]:
for e, z in enumerate(estimate_header):
    data_sets = [x[e] for x in estimate_data]
    print("Index # {} for {} has {:,} records".format(e,z,len(set(data_sets))))

In [None]:
def nums(x):
    """Convert x to a float, falling back to 0 when it cannot be converted.

    Args:
        x: A number or a numeric string such as '5' or '5e3'.

    Returns:
        float(x), or 0 when x is non-numeric text or an object float()
        does not accept (for example None).
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None and other
        # objects float() rejects outright
        return 0

print(list(map(nums, [5,.5,0.5,'5',5e3,'5e3','five'])))

In [None]:
estimate_header.index("POPESTIMATE2020")

In [None]:
index = 0
value = 17

pivot_data = sorted(set(x[index] for x in estimate_data))

for z in pivot_data:
    pivot_totals = []
    for x in estimate_data:
        if x[index] == z:
            pivot_totals.append(nums(x[value]))
    print("{} = {:,.2f}".format(z, sum(pivot_totals)))

In [None]:
pivot_cols = estimate_header[7:18]

for p in pivot_cols:
    print(p)

In [None]:
pivot_cols = [x for x in estimate_header if x.startswith("POPEST")]

for p in pivot_cols:
    print(p)

In [None]:
print('POPESTIMATE2020'[-4:])

In [None]:
#periods = {p: pivot_cols[i][-4:]+"-12-31" for i, p in enumerate(pivot_cols)}

periods = {p: p[-4:]+"-12-31" for p in pivot_cols}

pprint(periods)

In [None]:
estimates_json = []

# Opened with the same newline/encoding settings as the csv.reader pass over
# this file, so both reads decode it identically on every platform.
with open(FILE, newline="", encoding='latin-1') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # keep only rows whose SUMLEV is '040'
        if row['SUMLEV'] != '040':
            continue
        # One output record per population-estimate column. The loop variable
        # is `col` so it does not overwrite the `columns` tuple defined for
        # the states table.
        for col in pivot_cols:
            estimates_json.append(
                dict(
                    scenario=col,
                    year_end=periods[col],
                    summary_level=row['SUMLEV'],
                    location=row['NAME'],
                    region=row['REGION'],
                    division=row['DIVISION'],
                    population=row[col],
                )
            )

In [None]:
for row in random.choices(estimates_json, k=4):
    pprint(row)

In [None]:
with open(file='estimates.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = estimates_json[0].keys())
    writer.writeheader()
    writer.writerows(estimates_json)

In [None]:
del estimate_data
del estimates_json

## IMDB

In [None]:
URL = "https://datasets.imdbws.com/"

# fetch the index page and decode it to text
with request.urlopen(URL) as f:
    web_data = f.read().decode('utf-8')

# Collect every https...gz link that is immediately followed by '>'.
# Raw string: `\.` is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape-sequence warning.
data_sets = re.findall(pattern=r"(https.*\.gz)>", string=web_data)

for i, d in enumerate(data_sets):
    print(i, d)

In [None]:
# Dataset location; the local file name is the last segment of the URL path.
URL = 'https://datasets.imdbws.com/title.basics.tsv.gz'
# urlsplit is taken from urllib.parse, where it is documented, instead of
# relying on the name happening to be importable from urllib.request.
FILE = urlsplit(URL).path.split("/")[-1]
print(URL)
print(FILE)

In [None]:
request.urlretrieve(URL, FILE)

In [None]:
with gzip.open(FILE, mode='r') as f_in, open(FILE.replace('.gz',''), mode='wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
uncompressed_file = FILE.replace('.gz','')

print(uncompressed_file)

In [None]:
with open(uncompressed_file, mode='r', newline='', encoding='utf-8') as f:
    imdb_data = list(csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE, ))
    imdb_header = imdb_data.pop(0)
    
print(len(imdb_data))

In [None]:
pprint(imdb_data[0])

In [None]:
# Report the number of distinct values in each column.
for e, z in enumerate(imdb_header):
    try:
        data_sets = [x[e] for x in imdb_data]
    except IndexError:
        # at least one row has fewer fields than the header; only this case
        # is caught so that interrupts and real bugs are not swallowed
        print("Index #{} has a error".format(e))
    else:
        print("Index #{} for {} has {:,} records".format(e, z, len(set(data_sets))))

In [None]:
pprint(Counter(x[4] for x in imdb_data))

In [None]:
pprint(Counter(x[1] for x in imdb_data))

In [None]:
star_wars = [title for title in imdb_data if 'Star Wars' in title[2]]
print(len(star_wars))

In [None]:
for head, row in zip(imdb_header, random.choice(star_wars)):
    print(head, '>>>', row)

In [None]:
del imdb_data

In [None]:
URL = 'https://m.imdb.com/chart/top'

with request.urlopen(URL) as f:
    web_data = f.read().decode('utf-8')

In [None]:
# Raw string: `\d` is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape-sequence warning.
movie_titles = re.findall(pattern=r"(tt\d+)", string=web_data)

# de-duplicate; a set also makes the later membership tests cheap
movie_titles = set(movie_titles)

pprint(movie_titles, compact=True, width=132)
print()
print(len(movie_titles))

In [None]:
# Stream the uncompressed TSV and copy the matching rows to imdb_titles.csv.
with open(uncompressed_file, mode='r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

    with open('imdb_titles.csv', mode='w', newline='', encoding='utf-8') as fw:
        fieldnames = reader.fieldnames
        writer = csv.DictWriter(fw, dialect='excel', fieldnames=fieldnames)

        writer.writeheader()

        for row in reader:
            # `and` (not bitwise `&`) so the conditions are evaluated as
            # booleans and the set lookup is skipped when isAdult is not '0'
            if row['isAdult'] == '0' and row['tconst'] in movie_titles:
                writer.writerow(row)

In [None]:
del movie_titles

## Yahoo Finance

In [None]:
URL="https://query1.finance.yahoo.com/v7/finance/download/GTEC?period1=1605828807&period2=1637364807&interval=1d&events=history&includeAdjustedClose=true"

In [None]:
# Extract the digits following "period<digit>=" in the URL.
# Raw string: `\d` is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape-sequence warning.
data_dates = re.findall(pattern=r"period[\d]=(\d+)", string=URL)
data_dates

In [None]:
datetime.datetime.fromtimestamp(int(data_dates[0])).strftime("%Y-%m-%d")

In [None]:
datetime.datetime.fromtimestamp(int(data_dates[1])).strftime("%Y-%m-%d")

In [None]:
datetime.datetime.now()

In [None]:
datetime.datetime.now() - datetime.datetime(1970,1,1)

In [None]:
(datetime.datetime.now() - datetime.datetime(1970,1,1)).total_seconds()

In [None]:
# Epoch seconds for "now". Subtracting a naive 1970-01-01 from the naive
# local now() is off by the machine's UTC offset, and fromtimestamp() then
# applies that offset a second time; .timestamp() returns true epoch seconds.
d1 = datetime.datetime.now().timestamp()
datetime.datetime.fromtimestamp(d1).strftime("%Y-%m-%d %H:%M:%S")

In [None]:
# Epoch seconds for exactly 90 days ago. A timezone-aware UTC "now" is used
# so the result is true epoch seconds; subtracting a naive 1970-01-01 from
# the naive local now() would be off by the machine's UTC offset.
d2 = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=90)).timestamp()
datetime.datetime.fromtimestamp(d2).strftime("%Y-%m-%d %H:%M:%S")

In [None]:
# Query window as whole epoch seconds: period2 is now, period1 is 90 days
# earlier. "Now" is read once, as a timezone-aware UTC value, so both ends
# come from the same instant and are true epoch seconds (naive local now()
# minus a naive 1970-01-01 is off by the machine's UTC offset).
now_utc = datetime.datetime.now(datetime.timezone.utc)
period2 = int(now_utc.timestamp())
period1 = period2 - int(datetime.timedelta(days=90).total_seconds())

print(period1, period2)

In [None]:
symbol = 'AMD'
FILE = f"{symbol}.csv"
URL = f"https://query1.finance.yahoo.com/v7/finance/download/{symbol}?period1={period1}&period2={period2}&interval=1d&events=history&includeAdjustedClose=true"

print(FILE)
print(URL)

In [None]:
request.urlretrieve(URL, FILE)

In [None]:
stock = list()

price_cols = ['Open', 'Close', 'High', 'Low', 'Adj Close']

# Parse the downloaded CSV with the csv module rather than splitting on
# commas by hand.
with open(FILE, "r", newline="") as f:
    reader = csv.DictReader(f)
    headers = reader.fieldnames
    for row_dict in reader:
        row_dict['Volume'] = int(row_dict['Volume'])
        for c in price_cols:
            # float() instead of eval(): the text comes from a downloaded
            # file and must not be executed as code. A non-numeric value
            # raises ValueError here.
            row_dict[c] = float(row_dict[c])
        stock.append(row_dict)

In [None]:
pprint(stock[:3], width=80, compact=True)

In [None]:
pprint(stock[-3:], width=80, compact=True)

In [None]:
prices = [x['Adj Close'] for x in stock]

In [None]:
best = heapq.nlargest(5, prices)
best

In [None]:
best_days = [x for x in stock if x['Adj Close'] in best]
best_days

In [None]:
worst = heapq.nsmallest(5, prices)
worst

In [None]:
worst_days = [x for x in stock if x['Adj Close'] in worst]
worst_days

In [None]:
del stock

## Gutenburg

What are some interesting books to read?

In [None]:
# Book location; the local file name is the last segment of the URL path.
URL = 'http://www.gutenberg.org/cache/epub/5061/pg5061.txt'
# urlsplit is taken from urllib.parse, where it is documented, instead of
# relying on the name happening to be importable from urllib.request.
FILE = urlsplit(URL).path.split("/")[-1]
print(URL)
print(FILE)

In [None]:
request.urlretrieve(URL, FILE)

In [None]:
with open(FILE, encoding='utf8') as f:
    data = f.readlines()
    
print(type(data))

In [None]:
print(len(data))

In [None]:
for i, line in enumerate(data[0:20]):
    print(i, line.strip())

In [None]:
for i, line in enumerate(data):
    if "Tiny Tim" in line:
        print(i, "->", line.strip())

In [None]:
with open(FILE, encoding='utf8') as f:
    data = f.read()

print(type(data))

In [None]:
pprint(data[0:1000])

In [None]:
# Runs of 10 or more word characters in the lower-cased text, most frequent
# first. Raw string: `\w` is not a valid Python string escape, so the
# non-raw form triggers an invalid-escape-sequence warning.
words = re.findall(r'\w{10,}', data.lower())
Counter(words).most_common(25)

In [None]:
def regex(pattern, string):
    """Return all non-overlapping matches of pattern in string as a list.

    Args:
        pattern: Regular expression to search for.
        string: Text to scan.

    Returns:
        A list of match objects in the order they occur; empty when
        nothing matches.
    """
    return list(re.finditer(pattern, string))

In [None]:
# Tiny Tim?
for d in regex(r'Tiny Tim', data):
    print(d)

In [None]:
# Get sentences with "Tiny Tim"
get_sentence = re.findall(r"[^.]*Tiny Tim[^.]*\.", data)

print(len(get_sentence))

In [None]:
for sentence in get_sentence:
    print(sentence.strip())
    print("-"*80)

In [None]:
del data