In [None]:
# system libraries
import os
import sys

# check system information
print('Python Information', sys.version)
print('This is your current directory', os.getcwd())

In [None]:
# datetime libraries
import datetime
from datetime import time

# assign current date and time
currentDate = datetime.date.today()
currentTime = datetime.datetime.now()

# check datetime information (each value formats itself via its own strftime)
print(f'Today is {currentDate}')
print('Today is', currentDate.strftime('%m/%d/%Y'))
print('The time is', currentTime.strftime('%H:%M:%S'))

In [None]:
import zipfile
import glob
import csv

import re
import random

from collections import Counter
from pprint import pprint

In [None]:
!pip install wget

In [None]:
import wget

1. [Loans](#Loans)
1. [Baby Names](#Baby-Names)
1. [Crappy Gifts](#Crappy-Gifts)
1. [Solar Power](#Solar-Power)
1. [USA](#USA)
1. [Census Estimates](#Census-Estimates)
1. [IMDB](#IMDB)
1. [Gutenburg](#Gutenburg)

## Loans

In [None]:
def loan_func(present_value, number_periods, interest_rate):
    """Return the fixed per-period payment of a fully amortizing loan.

    Args:
        present_value: Amount borrowed (principal).
        number_periods: Number of monthly payments.
        interest_rate: Annual rate expressed in percent (3.19 means 3.19%).

    Returns:
        The payment per period, rounded to 2 decimal places.

    Raises:
        ZeroDivisionError: If number_periods is 0.
    """
    # Convert the annual percentage into a monthly decimal rate.
    interest_rate = interest_rate/100/12
    if interest_rate == 0:
        # The annuity formula below divides by zero at 0% interest; with no interest the
        # payment is simply the principal spread evenly over the periods.
        return round(present_value / number_periods, 2)
    return round(interest_rate * present_value / (1-(1+interest_rate)** -number_periods),2)

In [None]:
# Auto
print(loan_func(25515, 72, 3.19))

# Mortgage
print(loan_func(403700, 360, 4.5))

In [None]:
# A single annual rate (percent) is applied to every scenario in the grid.
ir = 3.19

# Every record is stamped with the date the table was generated (MM/DD/YYYY).
current_date = datetime.datetime.now().strftime("%m/%d/%Y")

# Scenario grid: each principal in range(22000, 30000, 1000) is paired with
# each term in range(12, 84, 12).
loans = []
for principal in range(22000, 30000, 1000):
    for months in range(12, 84, 12):
        loans.append({
            'Present Value': principal,
            'Number of Periods': months,
            'Interest Rate': ir,
            'Monthly Payment': loan_func(principal, months, ir),
            'Document Date': current_date,
        })

print(len(loans))

In [None]:
for row in random.choices(loans, k=4):
    pprint(row)

In [None]:
with open(file='loans.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = loans[0].keys())
    writer.writeheader()
    writer.writerows(loans)

## Baby Names

In [None]:
# set path to save files
print('This is your current directory', os.getcwd())

In [None]:
# Create OUTPUT in the current working directory; re-running the cell only prints a notice.
# NOTE(review): no cell visible in this notebook writes into OUTPUT - confirm it is still needed.
try:
    os.mkdir('OUTPUT')
except FileExistsError:
    print('The directory already exists')

In [None]:
URL = 'https://www.ssa.gov/oact/babynames/state/namesbystate.zip'
filename = wget.download(URL)

In [None]:
print(filename)

In [None]:
# Create BABY_DATA in the current working directory; re-running the cell only prints a notice.
try:
    os.mkdir('BABY_DATA')
except FileExistsError:
    print('The directory already exists')

In [None]:
with zipfile.ZipFile(filename, mode='r') as z:
    z.extractall(path='BABY_DATA')

In [None]:
files = glob.glob('BABY_DATA/*.TXT')
pprint(files, compact=True, width=80)
print(len(files))

In [None]:
# Column names applied positionally to each comma-separated line.
header = ['state_abbr', 'gender', 'birth_year', 'birth_name', 'total']
row_dict_list = list()

for file in files:
    with open(file, mode='r') as f:
        # Iterate the file lazily instead of materialising every line with readlines().
        for line in f:
            line = line.strip()
            if not line:
                # A blank or whitespace-only line carries no record and would otherwise
                # fail the 'birth_year' lookup below with a KeyError.
                continue
            row = line.split(',')
            row_dict = dict(zip(header, row))
            # Derived columns: year-end date string, first letter and length of the name.
            row_dict['year'] = row_dict['birth_year'] + "-12-31"
            row_dict['initial'] = row_dict['birth_name'][0]
            row_dict['length'] = len(row_dict['birth_name'])

            row_dict_list.append(row_dict)

print("Number of records in collection:", len(row_dict_list))

In [None]:
for row in (random.choices(row_dict_list, k=6)):
    pprint(row)

In [None]:
with open(file='babies.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = row_dict_list[0].keys())
    writer.writeheader()
    writer.writerows(row_dict_list)

In [None]:
del row_dict_list

## Crappy Gifts

In [None]:
# Value pools sampled when generating the synthetic transactions.
products = [
    'Blinking Robot',
    '27in televsion',
    'Laptop',
    '8 x 10 Rug',
    '14pc Cutlery Set',
    'Stuffed Alien - Grey',
    'Mint Creme Cookies',
    'Kale Chips',
    'Baseball Cap',
    'Shoes',
    'XL Hoodie',
]

employees = [
    'Hattie',
    'Jes',
    'Kira',
]

locations = [
    'NY',
    'TX',
    'CA',
    'OH',
    'MI',
    'PR',
]

clients = [
    'ULTA',
    'ALK',
    'TM',
    'BUD',
    'CXO',
    'ACN',
    'MA',
    'WHR',
]

In [None]:
# First calendar day of the synthetic sales history.
start_date = datetime.date(2020,1,1)

# One transaction per employee per day, for 366 + 365 consecutive days from start_date.
transactions = []
for e in employees:
    for d in range(0,(366+365)):
        # The random draws happen in key order: product, location, client, quantity, rate.
        row = {
            'product': random.choice(products),
            'employee': e,
            'location': random.choice(locations),
            'client': random.choice(clients),
            'quantity': random.randrange(0,1000),
            'sales_rate': round(random.random(),2),
            'sales_date': (start_date + datetime.timedelta(d)).strftime(format="%Y-%m-%d"),
        }
        # Derived fields: the extended total, and a flag for quantities above 975.
        row['sales_total'] = round((row['quantity'] * row['sales_rate']),2)
        row['check'] = row['quantity'] > 975
        transactions.append(row)

print(len(transactions))

In [None]:
for row in random.choices(transactions, k=3):
    pprint(row)

In [None]:
with open(file='crappy_gifts.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = transactions[0].keys())
    writer.writeheader()
    writer.writerows(transactions)

## Solar Power

In [None]:
URL = 'https://data.ny.gov/api/views/3x8r-34rs/rows.csv?accessType=DOWNLOAD&sorting=true'

filename = wget.download(URL)

In [None]:
print(filename)

In [None]:
with open(filename, newline="", encoding='utf-8') as f:
    solar_data = list(csv.reader(f, delimiter = ","))
    solar_header = solar_data.pop(0)

In [None]:
def nums(x):
    """Convert x to a float, falling back to 0 when it cannot be converted.

    Args:
        x: A number, a numeric string such as '5' or '5e3', or any other value.

    Returns:
        float(x) when the conversion succeeds, otherwise 0.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # ValueError: text that is not a number, such as 'five' or ''.
        # TypeError: values float() does not accept at all, such as None
        # (csv.DictReader fills missing trailing fields with None).
        return 0
    
print(list(map(nums, [5,.5,0.5,'5',5e3,'5e3','five'])))

In [None]:
for i, x in enumerate(random.choice(solar_data)):
    print(i, solar_header[i], ">>>", x)

In [None]:
for e, z in enumerate(solar_header):
    data_sets = [x[e] for x in solar_data]
    print("Index # {} for {} has {:,} records".format(e,z,len(set(data_sets))))

In [None]:
pprint(Counter(x[13] for x in solar_data))

In [None]:
pprint(Counter(x[9] for x in solar_data))

In [None]:
solar_json = []

# Open with the same encoding/newline settings used when this file was first read, so
# non-ASCII text and quoted multi-line fields are parsed the same way on every platform.
with open(filename, mode='r', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter=',')
    header = reader.fieldnames  # not used further in this cell
    for row in reader:

        # Keep only finished projects.
        if row['Project Status'] == 'Complete':

            # split into a list
            row['Program Type'] = row['Program Type'].split("/")

            # 'Location 1' is a multi-line field: the first line is used as the address and
            # the last line is parsed as a "(lat, lon)" pair. With newline='' the reader may
            # hand back '\r\n' line endings untouched, so strip any stray '\r'.
            app_list = [part.rstrip("\r") for part in row['Location 1'].split("\n")]
            addr = app_list[0]
            lat = nums(app_list[-1].split(",")[0].replace("(",""))
            lon = nums(app_list[-1].split(",")[-1].strip().replace(")",""))
            row['Location 1'] = dict(zip(['Address', 'Latitude', 'Longitude'], [addr, lat, lon]))

            # convert numeric fields (unparseable values become 0 via nums)
            for col in ['Total Inverter Quantity', 'Total Nameplate kW DC', 'Total PV Module Quantity',
                        'Expected KWh Annual Production', 'Project Cost', '$Incentive']:
                row[col] = nums(row[col])

            # rename '$Incentive' to 'Incentive' (the new key lands at the end of the row)
            row['Incentive'] = row.pop('$Incentive')

            solar_json.append(row)

In [None]:
pprint(random.choice(solar_json))

In [None]:
# Write the completed-project records to solar.csv, one column per key of the first record.
# The source file is read as UTF-8, so write UTF-8 explicitly instead of relying on the
# platform default encoding.
# NOTE(review): 'Program Type' (a list) and 'Location 1' (a dict) are not flat values, so
# the csv writer stores their str() form in the cell - confirm that layout is intended.
with open(file='solar.csv', mode='w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames = solar_json[0].keys())
    writer.writeheader()
    writer.writerows(solar_json)

In [None]:
del solar_json

## USA

In [None]:
# Parallel lookup tuples: element i of every tuple describes the same place, and the
# tuples are combined positionally with zip() when the state records are built.
# Keep all six the same length and in the same order when editing.

# Numeric code per place (stored as 'st_fips') - presumably FIPS state codes; verify.
fips = (
    1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
    25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
    44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 72, 66, 78, 60, 69,
)

# Full place name (stored as 'st_name'); the last five entries are territories.
stname = (
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut',
    'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
    'Kentucky', 'Louisiana',  'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
    'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
    'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas',
    'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
    'Puerto Rico', 'Guam', 'U.S. Virgin Islands', 'American Samoa', 'Northern Mariana Islands',
)

# Two-letter abbreviation (stored as 'st_abbr').
stabbr = (
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI',
    'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI',
    'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC',
    'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT',
    'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'PR', 'GU', 'VI', 'AS', 'MP',
)

# Region label (stored as 'st_regs').
# NOTE(review): the five territories are labelled 'South' - confirm this is intentional.
region = (
    'South', 'West', 'West', 'South', 'West', 'West', 'Northeast', 'South', 'South', 'South',
    'West', 'West', 'Midwest', 'Midwest', 'Midwest', 'Midwest', 'South', 'South', 'Northeast',
    'South', 'Northeast', 'Midwest', 'Midwest', 'South', 'Midwest', 'West', 'Midwest', 'West',
    'Northeast', 'Northeast', 'West', 'Northeast', 'South', 'Midwest', 'Midwest', 'South',
    'West', 'Northeast', 'Northeast', 'South', 'Midwest', 'South', 'South', 'West', 'Northeast',
    'South', 'West', 'South', 'Midwest', 'West', 'South', 'South', 'South', 'South', 'South',
)

# Division label (stored as 'st_divs'), a finer grouping than region.
division = (
    'East South Central', 'Pacific', 'Mountain', 'West South Central', 'Pacific',
    'Mountain', 'New England', 'South Atlantic', 'South Atlantic', 'South Atlantic',
    'Pacific', 'Mountain', 'East North Central', 'East North Central', 'West North Central',
    'West North Central', 'East South Central', 'West South Central', 'New England',
    'South Atlantic', 'New England', 'East North Central', 'West North Central', 
    'East South Central', 'West North Central', 'Mountain', 'West North Central', 'Mountain',
    'New England', 'Middle Atlantic', 'Mountain', 'Middle Atlantic', 'South Atlantic',
    'West North Central', 'East North Central', 'West South Central', 'Pacific',
    'Middle Atlantic', 'New England', 'South Atlantic', 'West North Central',
    'East South Central', 'West South Central', 'Mountain', 'New England',
    'South Atlantic', 'Pacific', 'South Atlantic', 'East North Central', 'Mountain',
    'South Atlantic', 'Pacific','South Atlantic', 'Pacific', 'Pacific',
)

# Seat count per place (stored as 'st_seats'); the five territories are 0.
# NOTE(review): presumably U.S. House seats for one apportionment - confirm source and year.
seats = (
    7,1,9,4,53,7,5,1,27,14,2,2,18,9,4,4,6,6,2,8,9,14,
    8,4,8,1,3,4,2,12,3,27,13,1,16,5,5,18,2,7,1,9,36,4,
    1,11,10,3,8,1,0,0,0,0,0,
)

In [None]:
# Output field names, in the same order as the tuples zipped below.
keys = ('st_fips', 'st_name', 'st_abbr', 'st_regs', 'st_divs', 'st_seats')

# zip() silently stops at the shortest input, so one missing entry in any tuple would drop
# trailing places without warning. Fail loudly instead.
state_columns = (fips, stname, stabbr, region, division, seats)
assert len({len(col) for col in state_columns}) == 1, "state attribute tuples differ in length"

# One dict per place, built positionally from the parallel tuples.
states_json = [dict(zip(keys, values)) for values in zip(*state_columns)]

# Preview the first five records.
for row in states_json[0:5]:
    pprint(row)

In [None]:
with open(file='united_states.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = states_json[0].keys())
    writer.writeheader()
    writer.writerows(states_json)

## Census Estimates

In [None]:
URL = 'http://www2.census.gov/programs-surveys/popest/datasets/2010-{0}/national/totals/nst-est{0}-alldata.csv'

year = '2019'

print(URL.format(year))

In [None]:
filename = wget.download(URL.format(year))

In [None]:
print(filename)

In [None]:
with open(filename, newline="", encoding='latin-1') as f:
    estimate_data = list(csv.reader(f, delimiter = ","))
    estimate_header = estimate_data.pop(0)

In [None]:
print(len(estimate_data))

In [None]:
for i, x in enumerate(estimate_header):
    print(i, x)

In [None]:
pprint(dict(zip(estimate_header, random.choice(estimate_data))))

In [None]:
for e, z in enumerate(estimate_header):
    data_sets = [x[e] for x in estimate_data]
    print("Index # {} for {} has {:,} records".format(e,z,len(set(data_sets))))

In [None]:
def nums(x):
    """Convert x to a float, falling back to 0 when it cannot be converted.

    This re-defines the helper of the same name from the Solar Power section so the
    Census cells can be run on their own.

    Args:
        x: A number, a numeric string such as '5' or '5e3', or any other value.

    Returns:
        float(x) when the conversion succeeds, otherwise 0.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # ValueError: text that is not a number, such as 'five' or ''.
        # TypeError: values float() does not accept at all, such as None.
        return 0
    
print(list(map(nums, [5,.5,0.5,'5',5e3,'5e3','five'])))

In [None]:
estimate_header.index("POPESTIMATE2019")

In [None]:
# Column positions in estimate_data rows: `index` is the grouping column and `value` is
# the column that gets summed.
index = 0
value = 16

# Collect the values of each group in a single pass over the data, instead of rescanning
# every row once per distinct key. Values stay in row order, so the sums are unchanged.
pivot_groups = {}
for x in estimate_data:
    pivot_groups.setdefault(x[index], []).append(nums(x[value]))

pivot_data = sorted(pivot_groups)

for z in pivot_data:
    pivot_totals = pivot_groups[z]
    print("{} = {:,.2f}".format(z, sum(pivot_totals)))

In [None]:
pivot_cols = estimate_header[7:17]

for p in pivot_cols:
    print(p)

In [None]:
pivot_cols = [x for x in estimate_header if x.startswith("POPEST")]

for p in pivot_cols:
    print(p)

In [None]:
print('POPESTIMATE2019'[-4:])

In [None]:
# Map each estimate column name to a year-end date string built from its last 4 characters.
periods = {}
for col in pivot_cols:
    periods[col] = col[-4:] + "-12-31"

pprint(periods)

In [None]:
estimates_json = []
# Reopen with the latin-1 decoding (and the csv-recommended newline='') that this file was
# first loaded with; the platform default encoding is not guaranteed to decode it.
with open(filename, newline='', encoding='latin-1') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Keep only rows whose summary level is '040' (presumably state-level - TODO confirm).
        if row['SUMLEV'] == '040':
            # Unpivot: emit one long-format record per estimate column.
            for columns in pivot_cols:
                estimates_json.append(
                    dict(
                        scenario=columns,
                        year_end=periods[columns],
                        summary_level=row['SUMLEV'],
                        location=row['NAME'],
                        region=row['REGION'],
                        division=row['DIVISION'],
                        population=row[columns],
                    )
                )

In [None]:
for row in random.choices(estimates_json, k=4):
    pprint(row)

In [None]:
with open(file='estimates.csv', mode='w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames = estimates_json[0].keys())
    writer.writeheader()
    writer.writerows(estimates_json)

In [None]:
del estimates_json

## IMDB

In [None]:
import gzip
import shutil
import urllib.request

In [None]:
URL = 'https://datasets.imdbws.com/title.basics.tsv.gz'

filename = wget.download(URL)

In [None]:
print(filename)

In [None]:
with gzip.open(filename, mode='r') as f_in, open(filename.replace('.gz',''), mode='wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

In [None]:
uncompressed_file = filename.replace('.gz','')

print(uncompressed_file)

In [None]:
with open(uncompressed_file, mode='r', newline='', encoding='utf-8') as f:
    imdb_data = list(csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE, ))
    imdb_header = imdb_data.pop(0)
    
print(len(imdb_data))

In [None]:
pprint(imdb_data[0])

In [None]:
# Report how many distinct values each column holds.
for e, z in enumerate(imdb_header):
    try:
        data_sets = [x[e] for x in imdb_data]
        print("Index #{} for {} has {:,} records".format(e, z, len(set(data_sets))))
    except IndexError:
        # Some row has fewer fields than the header, so column e does not exist for it.
        # Catch only that case, so interrupts and real bugs are not swallowed.
        print("Index #{} has a error".format(e))

In [None]:
pprint(Counter(x[4] for x in imdb_data))

In [None]:
pprint(Counter(x[1] for x in imdb_data))

In [None]:
star_wars = [title for title in imdb_data if 'Star Wars' in title[2]]
print(len(star_wars))

In [None]:
for head, row in zip(imdb_header, random.choice(star_wars)):
    print(head, '>>>', row)

In [None]:
del imdb_data

In [None]:
URL = 'https://m.imdb.com/chart/top'

with urllib.request.urlopen(URL) as f:
    web_data = f.read().decode('utf-8')

In [None]:
# Pull every IMDb title id ("tt" followed by digits) out of the page source.
# Raw string: "\d" in a normal string literal is an invalid escape sequence and triggers
# a warning on newer Python versions.
movie_titles = re.findall(pattern=r"(tt\d+)", string=web_data)

# De-duplicate; the set is also what the later membership tests run against.
movie_titles = set(movie_titles)

pprint(movie_titles, compact=True, width=132)
print()
print(len(movie_titles))

In [None]:
# Stream the full title file and copy to imdb_titles.csv only the non-adult rows whose id
# was found on the chart page. Rows are written as they are read, so nothing is held in memory.
with open(uncompressed_file, mode='r', newline='', encoding='utf-8') as f, \
        open('imdb_titles.csv', mode='w', newline='', encoding='utf-8') as fw:
    reader = csv.DictReader(f, delimiter='\t', quoting=csv.QUOTE_NONE)

    fieldnames = reader.fieldnames
    writer = csv.DictWriter(fw, dialect='excel', fieldnames=fieldnames)

    writer.writeheader()

    for row in reader:
        # Logical `and` (not bitwise `&`) short-circuits and states the intent directly.
        if row['isAdult'] == '0' and row['tconst'] in movie_titles:
            writer.writerow(row)

## Gutenburg

http://www.gutenberg.org/

In [None]:
URL = 'http://www.gutenberg.org/cache/epub/5061/pg5061.txt'

book = wget.download(URL)

In [None]:
print(book)

In [None]:
with open(book, encoding='utf8') as f:
    data = f.readlines()
    
print(type(data))

In [None]:
print(len(data))

In [None]:
for i, line in enumerate(data[0:20]):
    print(i, line.strip())

In [None]:
for i, line in enumerate(data):
    if "Tiny Tim" in line:
        print(i, "->", line.strip())

In [None]:
with open(book, encoding='utf8') as f:
    data = f.read()

print(type(data))

In [None]:
pprint(data[0:1000])

https://regex101.com/

In [None]:
def regex(pattern, string):
    """Return all non-overlapping matches of pattern in string as a list of match objects."""
    return list(re.finditer(pattern, string))

In [None]:
# Tiny Tim?
for d in regex(r'Tiny Tim', data):
    print(d)

In [None]:
# Get sentences with "Tiny Tim"
get_sentence = re.findall(r"[^.]*Tiny Tim[^.]*\.", data)

print(len(get_sentence))

In [None]:
for sentence in get_sentence:
    print(sentence.strip())
    print("-"*80)