In [637]:
!pip install gspread oauth2client



# Sales Data (resource - CSVs)
## Extracting data

In [638]:
import pandas as pd
import gspread 
from oauth2client.service_account import ServiceAccountCredentials
import os
from datetime import datetime

In [639]:
file_location = 'Data Resources/ZHVI/'
files = os.listdir(file_location)
files

file_list = list()

for data_file in files:
    file_dict={}
    file_dict['file'] = f"{file_location}{data_file}"
    file_dict['Bedroom Count'] = int((data_file.split('Zip_zhvi_bdrmcnt_')[1]).split('.')[0])
    file_list.append(file_dict)
file_list

['Zip_zhvi_bdrmcnt_1.csv',
 'Zip_zhvi_bdrmcnt_2.csv',
 'Zip_zhvi_bdrmcnt_3.csv',
 'Zip_zhvi_bdrmcnt_4.csv',
 'Zip_zhvi_bdrmcnt_5.csv']

[{'file': 'Data Resources/ZHVI/Zip_zhvi_bdrmcnt_1.csv', 'Bedroom Count': 1},
 {'file': 'Data Resources/ZHVI/Zip_zhvi_bdrmcnt_2.csv', 'Bedroom Count': 2},
 {'file': 'Data Resources/ZHVI/Zip_zhvi_bdrmcnt_3.csv', 'Bedroom Count': 3},
 {'file': 'Data Resources/ZHVI/Zip_zhvi_bdrmcnt_4.csv', 'Bedroom Count': 4},
 {'file': 'Data Resources/ZHVI/Zip_zhvi_bdrmcnt_5.csv', 'Bedroom Count': 5}]

## Transforming data

In [923]:
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

zhvi_complete_df = pd.DataFrame()

number_of_record = 0

for data_file in file_list:
    
    print()
    
    zhvi_df = pd.read_csv(data_file['file'])

    print(f"File {data_file['file']} : {len(zhvi_df)} numbers of records")
          
    number_of_record += int(zhvi_df['RegionID'].count())
    zhvi_df.rename(columns={"RegionName":"Zip Code"}, inplace=True)

    zhvi_df.drop(columns=["RegionID","SizeRank","RegionType","StateName"], inplace=True)

    columns_to_drop = []
    columns_to_rename = {}
    
    for column in zhvi_df.columns[5:]:
#         print(column)
        if int(column.split('-')[0]) < 2015:
#             print(column)
            columns_to_drop.append(column)
        else:
            columns_to_rename[column] = datetime.strftime(datetime.strptime(column, "%Y-%m-%d"),'%m/%d/%Y')  
#     print(columns_to_rename)
    # columns_to_drop
    
    zhvi_df.drop(columns=columns_to_drop, inplace=True)
    
    zhvi_df.rename(columns=columns_to_rename, inplace=True)
    
    column_list=['Zip Code','City','CountyName','Metro','State']

    column_list_data = zhvi_df[column_list]

    zhvi_df.drop(columns=column_list, inplace = True)
    zhvi_df = zhvi_df.fillna(0)
        
    for column in column_list:
        zhvi_df.insert(column_list.index(column), column, column_list_data[column])

    zhvi_df.insert(0,'Bedroom Count',int(data_file['Bedroom Count']))
    
    zhvi_complete_df = zhvi_complete_df.append(zhvi_df,ignore_index=True,sort=False)

zhvi_complete_df


File Data Resources/ZHVI/Zip_zhvi_bdrmcnt_1.csv : 18548 numbers of records

File Data Resources/ZHVI/Zip_zhvi_bdrmcnt_2.csv : 26754 numbers of records

File Data Resources/ZHVI/Zip_zhvi_bdrmcnt_3.csv : 28799 numbers of records

File Data Resources/ZHVI/Zip_zhvi_bdrmcnt_4.csv : 26579 numbers of records

File Data Resources/ZHVI/Zip_zhvi_bdrmcnt_5.csv : 22073 numbers of records


Unnamed: 0,Bedroom Count,Zip Code,City,CountyName,Metro,State,01/31/2015,02/28/2015,03/31/2015,04/30/2015,...,04/30/2020,05/31/2020,06/30/2020,07/31/2020,08/31/2020,09/30/2020,10/31/2020,11/30/2020,12/31/2020,01/31/2021
0,1,10025,New York,New York County,New York-Newark-Jersey City,NY,687255.0,692013.0,692222.0,691676.0,...,689319.0,688401.0,687989.0,686916.0,685278.0,682705.0,681110.0,680490.0,681329.0,681740.0
1,1,60657,Chicago,Cook County,Chicago-Naperville-Elgin,IL,205053.0,204925.0,205317.0,206342.0,...,217151.0,217353.0,217807.0,218778.0,220062.0,221396.0,222713.0,224034.0,225383.0,226155.0
2,1,10023,New York,New York County,New York-Newark-Jersey City,NY,832919.0,834728.0,835364.0,840480.0,...,764479.0,762927.0,763024.0,762572.0,762335.0,762696.0,764745.0,766037.0,768289.0,769852.0
3,1,60614,Chicago,Cook County,Chicago-Naperville-Elgin,IL,233676.0,233681.0,234129.0,235117.0,...,246589.0,246965.0,247455.0,247958.0,248226.0,248181.0,248364.0,249036.0,250130.0,250644.0
4,1,77449,Katy,Harris County,Houston-The Woodlands-Sugar Land,TX,104257.0,105348.0,105860.0,107274.0,...,137762.0,138753.0,139456.0,140111.0,140617.0,142016.0,143976.0,146953.0,149562.0,151943.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122748,5,18348,Pocono Lake,Monroe County,East Stroudsburg,PA,0.0,0.0,0.0,239530.0,...,208013.0,208351.0,208001.0,209010.0,211422.0,214524.0,215693.0,214938.0,216432.0,219000.0
122749,5,21405,Annapolis,Anne Arundel County,Baltimore-Columbia-Towson,MD,956662.0,960987.0,960050.0,965402.0,...,1038744.0,1041542.0,1044743.0,1047632.0,1054339.0,1059607.0,1071507.0,1080129.0,1099575.0,1110337.0
122750,5,4109,Portland,Cumberland County,Portland-South Portland,ME,766902.0,761535.0,765015.0,762734.0,...,954064.0,963020.0,970516.0,975945.0,986262.0,995443.0,1010473.0,1027687.0,1050814.0,1077295.0
122751,5,89155,Las Vegas,Clark County,Las Vegas-Henderson-Paradise,NV,324790.0,326751.0,327544.0,328498.0,...,432288.0,434705.0,436505.0,440215.0,444924.0,450580.0,454860.0,458650.0,461560.0,465385.0


In [924]:
zhvi_complete_df.set_index(['Bedroom Count','Zip Code'], inplace=True)
zhvi_complete_df

Unnamed: 0_level_0,Unnamed: 1_level_0,City,CountyName,Metro,State,01/31/2015,02/28/2015,03/31/2015,04/30/2015,05/31/2015,06/30/2015,...,04/30/2020,05/31/2020,06/30/2020,07/31/2020,08/31/2020,09/30/2020,10/31/2020,11/30/2020,12/31/2020,01/31/2021
Bedroom Count,Zip Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,10025,New York,New York County,New York-Newark-Jersey City,NY,687255.0,692013.0,692222.0,691676.0,690212.0,691754.0,...,689319.0,688401.0,687989.0,686916.0,685278.0,682705.0,681110.0,680490.0,681329.0,681740.0
1,60657,Chicago,Cook County,Chicago-Naperville-Elgin,IL,205053.0,204925.0,205317.0,206342.0,206843.0,205887.0,...,217151.0,217353.0,217807.0,218778.0,220062.0,221396.0,222713.0,224034.0,225383.0,226155.0
1,10023,New York,New York County,New York-Newark-Jersey City,NY,832919.0,834728.0,835364.0,840480.0,840354.0,844932.0,...,764479.0,762927.0,763024.0,762572.0,762335.0,762696.0,764745.0,766037.0,768289.0,769852.0
1,60614,Chicago,Cook County,Chicago-Naperville-Elgin,IL,233676.0,233681.0,234129.0,235117.0,236302.0,236381.0,...,246589.0,246965.0,247455.0,247958.0,248226.0,248181.0,248364.0,249036.0,250130.0,250644.0
1,77449,Katy,Harris County,Houston-The Woodlands-Sugar Land,TX,104257.0,105348.0,105860.0,107274.0,108457.0,109481.0,...,137762.0,138753.0,139456.0,140111.0,140617.0,142016.0,143976.0,146953.0,149562.0,151943.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,18348,Pocono Lake,Monroe County,East Stroudsburg,PA,0.0,0.0,0.0,239530.0,239882.0,238308.0,...,208013.0,208351.0,208001.0,209010.0,211422.0,214524.0,215693.0,214938.0,216432.0,219000.0
5,21405,Annapolis,Anne Arundel County,Baltimore-Columbia-Towson,MD,956662.0,960987.0,960050.0,965402.0,970865.0,974424.0,...,1038744.0,1041542.0,1044743.0,1047632.0,1054339.0,1059607.0,1071507.0,1080129.0,1099575.0,1110337.0
5,4109,Portland,Cumberland County,Portland-South Portland,ME,766902.0,761535.0,765015.0,762734.0,763211.0,759332.0,...,954064.0,963020.0,970516.0,975945.0,986262.0,995443.0,1010473.0,1027687.0,1050814.0,1077295.0
5,89155,Las Vegas,Clark County,Las Vegas-Henderson-Paradise,NV,324790.0,326751.0,327544.0,328498.0,330283.0,331729.0,...,432288.0,434705.0,436505.0,440215.0,444924.0,450580.0,454860.0,458650.0,461560.0,465385.0


## Extracting States from csv

In [925]:
file_path="Data Resources/States.csv"
states_df = pd.read_csv(file_path)
states_df.set_index('state', inplace=True)
states_df

Unnamed: 0_level_0,state_name
state,Unnamed: 1_level_1
AL,Alabama
AK,Alaska
AZ,Arizona
AR,Arkansas
CA,California
CO,Colorado
CT,Connecticut
DE,Delaware
DC,District of Columbia
FL,Florida


# Inventory Data (resource - Google sheets)
https://drive.google.com/drive/folders/1SCwfsJ8WD_295HeEOx8iBrM8mtwEzM7y

## Extracting data

In [926]:
scope=["https://spreadsheets.google.com/feeds",
       "https://www.googleapis.com/auth/spreadsheets",
       "https://www.googleapis.com/auth/drive.file",
       "https://www.googleapis.com/auth/drive"]

creds = ServiceAccountCredentials.from_json_keyfile_name("credentials.json",scope)

client = gspread.authorize(creds)

inventory_sales_sheets = client.open("US_Sale_Inventory_Monthly")

print(inventory_sales_sheets.worksheets())

inventory_pending_sheets = client.open("US_Pending_Inventory_Monthly")

print(inventory_pending_sheets.worksheets())

zori_sheets = client.open("ZORI_AllHomesPlusMultifamily_ZIP")

print(zori_sheets.worksheets())



[<Worksheet 'All Homes' id:0>]
[<Worksheet 'All Homes' id:0>]
[<Worksheet 'All Homes' id:0>]


In [927]:
inventory_sales_ws = inventory_sales_sheets.worksheet("All Homes").get_all_records()
inventory_pending_ws = inventory_pending_sheets.worksheet("All Homes").get_all_records()
zori_ws = zori_sheets.worksheet("All Homes").get_all_records()

# print(ws)

inventory_sales_df = pd.DataFrame(inventory_sales_ws)
inventory_pending_df = pd.DataFrame(inventory_pending_ws)
zori_df = pd.DataFrame(zori_ws)

print("Extracting data successfully from google sheets.")
# data = sheet.get_all_records()

Extracting data successfully from google sheets.


## Transforming data

In [928]:
inventory_sales_df.drop(columns=["RegionID","SizeRank","RegionType"], inplace=True)

columns_to_drop = []

inventory_sales_df = inventory_sales_df.iloc[1:]

for column in inventory_sales_df.columns[2:]:
    if int(column.split('/')[2]) < 2018:
#         print(column)
        columns_to_drop.append(column)

inventory_sales_df.drop(columns=columns_to_drop, inplace=True)
inventory_sales_df.rename(columns={"StateName" : "State"}, inplace=True)

inventory_sales_df['RegionName'] = inventory_sales_df['RegionName'].str.split(',').str[0]

inventory_sales_df.set_index(['RegionName','State'], inplace=True)
inventory_sales_df

Unnamed: 0_level_0,Unnamed: 1_level_0,1/31/2018,2/28/2018,3/31/2018,4/30/2018,5/31/2018,6/30/2018,7/31/2018,8/31/2018,9/30/2018,10/31/2018,...,4/30/2020,5/31/2020,6/30/2020,7/31/2020,8/31/2020,9/30/2020,10/31/2020,11/30/2020,12/31/2020,1/31/2021
RegionName,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
New York,NY,65653,73487,79409,86394,91531,93057,92781,90108,89423,90475,...,65234,70110,78575,84264,82987,84009,82610,73791,64939,63891
Los Angeles-Long Beach-Anaheim,CA,19814,21856,24152,25576,27763,28955,30705,30999,30577,31360,...,20187,23468,25276,26358,26419,26438,26285,23218,19499,19748
Chicago,IL,40138,42746,51220,53709,56214,58257,59362,59119,57760,57150,...,44617,49279,51907,52229,51297,49407,48739,41987,35176,35034
Dallas-Fort Worth,TX,22856,23466,26728,30074,31840,33978,35962,35346,34127,33380,...,31380,35059,34971,34032,31525,28750,30876,27066,23330,24120
Philadelphia,PA,24308,25443,27976,31137,32347,31462,30287,29806,29246,29917,...,18261,21044,22839,22429,21525,20785,20871,18264,15508,16028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Jackson,MS,2826,2916,3174,3335,3554,3440,3378,3276,3062,3114,...,2563,2617,2432,2341,2186,2034,2034,1761,1537,1516
Augusta,GA,3827,3811,3999,4098,4144,4094,4124,4056,3903,3895,...,3271,3225,2890,2981,2715,2570,2542,2209,2110,2211
Harrisburg,PA,2680,2632,2897,3094,3189,3227,3237,3239,3067,3040,...,1616,2104,2293,2142,2030,1826,1805,1552,1300,1451
Provo,UT,1823,1614,1981,2548,3054,3292,3431,3440,3341,3678,...,3667,4047,3556,3174,2851,2520,2485,1807,1430,1428


In [929]:
inventory_pending_df.drop(columns=["RegionID","SizeRank","RegionType"], inplace=True)
inventory_pending_df = inventory_pending_df.iloc[1:]

inventory_pending_df.rename(columns={"StateName" : "State"}, inplace=True)

inventory_pending_df['RegionName'] = inventory_pending_df['RegionName'].str.split(',').str[0]

inventory_pending_df.set_index(['RegionName','State'], inplace=True)
inventory_pending_df

Unnamed: 0_level_0,Unnamed: 1_level_0,1/31/2018,2/28/2018,3/31/2018,4/30/2018,5/31/2018,6/30/2018,7/31/2018,8/31/2018,9/30/2018,10/31/2018,...,3/31/2020,4/30/2020,5/31/2020,6/30/2020,7/31/2020,8/31/2020,9/30/2020,10/31/2020,11/30/2020,12/31/2020
RegionName,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
New York,NY,71,36,33,38,36,42,47,51,56,53,...,43,46,54,29,27,29,31,29,33,41
Los Angeles-Long Beach-Anaheim,CA,19,14,15,14,15,16,19,21,25,26,...,13,17,15,15,13,13,12,13,13,14
Chicago,IL,48,19,12,14,14,17,21,25,29,33,...,18,24,18,15,14,12,13,13,17,24
Dallas-Fort Worth,TX,35,22,21,19,20,23,24,29,33,34,...,26,29,27,26,24,23,23,22,23,24
Philadelphia,PA,57,39,28,24,28,30,25,27,29,28,...,11,22,12,10,9,8,14,9,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Jackson,MS,53,33,38,32,28,30,25,33,36,37,...,23,25,18,23,17,14,18,9,11,18
Augusta,GA,73,8,22,21,20,18,25,29,33,36,...,21,19,16,11,10,10,8,10,10,12
Harrisburg,PA,50,43,14,10,16,13,13,22,24,22,...,8,20,10,7,6,6,22,6,6,8
Provo,UT,29,11,11,8,15,11,13,16,21,22,...,6,13,11,13,6,6,6,5,5,8


In [930]:
from calendar import monthrange

zori_df.drop(columns=["RegionID","SizeRank"], inplace=True)
# print(zori_df)
zori_df = zori_df.iloc[1:]

columns_to_drop = []
columns_to_rename = {}
new_date_column=""

for column in zori_df.columns[2:]:
    if int(column.split('-')[0]) < 2018:
#         print(column)
        columns_to_drop.append(column)
    else:
        new_date_column = f"{int(column.split('-')[1])}/{monthrange(int(column.split('-')[0]),int(column.split('-')[1]))[1]}/{int(column.split('-')[0])}"

        columns_to_rename[column] = new_date_column
    
# print(columns_to_drop)
# print(columns_to_rename)

zori_df.drop(columns = columns_to_drop, inplace=True)

columns_to_rename['RegionName'] = "Zip Code"

zori_df.rename(columns = columns_to_rename , inplace=True)

zori_df.insert(2,'State',zori_df['MsaName'].str.split(',').str[1])
zori_df['MsaName'] = zori_df['MsaName'].str.split(',').str[0]

zori_df.set_index(['Zip Code'], inplace=True)
zori_df

Unnamed: 0_level_0,MsaName,State,1/31/2018,2/28/2018,3/31/2018,4/30/2018,5/31/2018,6/30/2018,7/31/2018,8/31/2018,...,4/30/2020,5/31/2020,6/30/2020,7/31/2020,8/31/2020,9/30/2020,10/31/2020,11/30/2020,12/31/2020,1/31/2021
Zip Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60657,Chicago,IL,1566,1583,1597,1614,1627,1633,1631,1614,...,1672,1675,1668,1636,1628,1593,1575,1540,1545,1571
10023,New York,NY,3245,3259,3270,3317,3362,3373,3344,3349,...,3386,3349,3303,3192,3137,3038,2975,2915,2851,2872
77494,Houston,TX,1722,1726,1726,1731,1727,1728,1734,1729,...,1751,1749,1754,1764,1772,1778,1764,1789,1798,1795
60614,Chicago,IL,1820,1828,1868,1865,1877,1889,1878,1868,...,1957,1954,1936,1920,1878,1860,1815,1753,1743,1721
77449,Houston,TX,1426,1437,1438,1451,1456,1460,1465,1460,...,1497,1509,1515,1527,1534,1536,1537,1541,1545,1568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2108,Boston,MA,2474,2415,2462,2560,2494,2565,2495,2489,...,2789,2762,2674,2624,2480,2431,2545,2579,2416,2352
90067,Los Angeles-Long Beach-Anaheim,CA,6356,6673,6791,6603,6796,7075,6899,6336,...,6843,6616,7043,7199,7220,7301,7132,7243,7022,7075
2110,Boston,MA,3937,4007,3951,3690,3748,3674,3814,3925,...,,3689,3627,3591,3671,3624,3434,3558,3598,3502
20004,Washington,DC,2578,2560,2506,2530,2569,2584,2551,2583,...,2689,2650,2629,2644,2699,2642,2530,2563,2540,2472


# School Data (resource - www.greatschools.org)
## Extracting data

In [931]:
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup as bs
import requests

In [960]:
great_school_url = "https://www.greatschools.org/california/san-jose/schools/?gradeLevels%5B%5D=e&gradeLevels%5B%5D=m&gradeLevels%5B%5D=h&st%5B%5D=public_charter&st%5B%5D=public&st%5B%5D=charter&view=table"

executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

browser.visit(great_school_url)

html = browser.html
soup = bs(html, 'html.parser')

print("Data Extraction started...")
print('-'*30)
try:
    next_button = soup.findAll('a',class_='anchor-button')[-1]
    
    school_df = pd.DataFrame()

    print("Getting data from page 1.")
    
    while ('disabled' not in next_button.attrs['class']):
                
        next_button = soup.findAll('a',class_='anchor-button')[-1]
        
        school_section = soup.select("section.school-table")

        school_list = soup.find("tbody")


        for row in school_list:
            
            col = row.findAll('td')

            school_row = {} 

            if(col[0].select_one("a.name")):

                school_row['school_name'] = col[0].select_one("a.name").text

                rating = col[0].select_one("div div.circle-rating--small")
                if(rating):
                    school_row['rating'] = rating.text

                address = col[0].select_one("div.address")
                if(address):
                    school_row['zip_code'] = (address.text.split(',')[-1]).strip()

                    school_row['type'] = col[1].text
            school_row['grades'] = col[2].text
            school_row['total_students_enrolled'] = col[3].text
            school_row['students_per_teacher'] = col[4].text
            school_row['district'] = col[6].text

            school_df = school_df.append(school_row, ignore_index=True)
        if ('disabled' not in next_button.attrs['class']):
            browser.visit('https://www.greatschools.org'+ next_button['href'])    

            html = browser.html
            soup = bs(html, 'html.parser')
            
            print(f"Getting data from page {next_button['href'].split('&page=')[1]}.")
        else:
            break;
except:
    print("Something went wrong")
    
browser.quit()

print('-'*30)
print("Extraction completed...")
school_df

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [C:\Users\shahp\.wdm\drivers\chromedriver\win32\89.0.4389.23\chromedriver.exe] found in cache




Data Extraction started...
------------------------------
Getting data from page 1.
Getting data from page 2.
Getting data from page 3.
Getting data from page 4.
Getting data from page 5.
Getting data from page 6.
Getting data from page 7.
Getting data from page 8.
Getting data from page 9.
Getting data from page 10.
------------------------------
Extraction completed...


Unnamed: 0,district,grades,rating,school_name,students_per_teacher,total_students_enrolled,type,zip_code
0,Fremont Union High School District,9-12,10/10,Lynbrook High School,23:1,1880,Public district,95129
1,Santa Clara County Office Of Education School ...,7-12,9/10,University Preparatory Academy Charter,20:1,684,Public charter,95125
2,East Side Union High School District,9-12,9/10,Evergreen Valley High School,26:1,2961,Public district,95148
3,East Side Union High School District,9-12,9/10,KIPP San Jose Collegiate,20:1,530,Public charter,95133
4,Franklin-Mckinley Elementary School District,K-8,9/10,Cornerstone Academy Preparatory School,24:1,543,Public charter,95122
...,...,...,...,...,...,...,...,...
241,East Side Union High School District,11-12,,Foothill High School,17:1,274,Public district,95127
242,East Side Union High School District,11-12,,Pegasus High School,22:1,114,Public district,95133
243,East Side Union High School District,11-12,,Phoenix High School,17:1,78,Public district,95123
244,Oak Grove Elementary School District,K-6,,Glider Elementary School,26:1,620,Public district,95123


## Transforming data

In [991]:
# For currently unrated schools and N/A areas
school_df.fillna(0)

# school_df.loc[['Escuela Popular/Center For Training And Careers, Family Learning']]

school_df.set_index(['school_name','zip_code'], inplace=True)
school_df

Unnamed: 0_level_0,Unnamed: 1_level_0,district,grades,rating,students_per_teacher,total_students_enrolled,type
school_name,zip_code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Lynbrook High School,95129,Fremont Union High School District,9-12,10/10,23:1,1880,Public district
University Preparatory Academy Charter,95125,Santa Clara County Office Of Education School ...,7-12,9/10,20:1,684,Public charter
Evergreen Valley High School,95148,East Side Union High School District,9-12,9/10,26:1,2961,Public district
KIPP San Jose Collegiate,95133,East Side Union High School District,9-12,9/10,20:1,530,Public charter
Cornerstone Academy Preparatory School,95122,Franklin-Mckinley Elementary School District,K-8,9/10,24:1,543,Public charter
...,...,...,...,...,...,...,...
Foothill High School,95127,East Side Union High School District,11-12,0,17:1,274,Public district
Pegasus High School,95133,East Side Union High School District,11-12,0,22:1,114,Public district
Phoenix High School,95123,East Side Union High School District,11-12,0,17:1,78,Public district
Glider Elementary School,95123,Oak Grove Elementary School District,K-6,0,26:1,620,Public district


KeyError: "None of ['school_name', 'zip_code'] are in the columns"

# Loading Data to PostgreSQL

In [940]:
from sqlalchemy import create_engine
import pandas as pd
from db_conn import user_name
from db_conn import password
import psycopg2

conn = psycopg2.connect(
   database="postgres", user=f'{user_name}', password=f'{password}', host='127.0.0.1', port= '5432'
)

conn.autocommit = True

cursor = conn.cursor()

cursor.execute("SELECT datname FROM pg_database;")

list_database = cursor.fetchall()

dbname = "zillow_db"

try:
#     if (dbname,) in list_database:

#         #Preparing query to delete a database
#         cursor.execute(f'''DROP DATABASE {dbname}''')

#         cursor.close()
#     #     conn.close()

#         print("Database deleted successfully...")
#         print('-'*30)

    if (dbname,) not in list_database:
        
        cur = conn.cursor()
        cur.execute('CREATE DATABASE ' + dbname)

        cur.close()
        conn.close()

        print("Creating Database...")

        engine = create_engine(f'postgresql://{user_name}:{password}@localhost:5432/{dbname}')

        connection = engine.connect()

        print('-'*30)
        print("Creating Tables, Please wait...")
        print('-'*30)

        zhvi_complete_df.to_sql('sales',engine)
        print("Table sales created successfully")

        states_df.to_sql('states', engine)
        print("Table states created successfully")

        sales_inventory_df.to_sql('inventory_sales', engine)
        print("Table inventory_sales created successfully")

        inventory_pending_df.to_sql('inventory_pending', engine)
        print("Table inventory_pending created successfully")

        zori_df.to_sql('rentals', engine)
        print("Table rentals created successfully")

        school_df.to_sql('schools', engine)
        print("Table schools created successfully")

        connection.close()

        print('-'*30)
        print("Database is ready to use.")
    else:
        print("Database is already exists.")
except:
    print("Something went wrong.")

Creating Database...
------------------------------
Creating Tables, Please wait...
------------------------------
Table sales created successfully
Table states created successfully
Table inventory_sales created successfully
Table inventory_pending created successfully
Table rentals created successfully
Table schools created successfully
------------------------------
Database is ready to use.


In [None]:
# SELECT * FROM inventory_pending

# SELECT * FROM inventory_sales

# SELECT * FROM rentals

# SELECT * FROM sales

# SELECT * FROM states