Merging the Data and Storing in a Database/Visualizing Data
Now that you have cleaned and transformed your 3 datasets, you need to load them into a
database. You can choose what kind of database (SQLLite or MySQL, Postgre SQL are all free
options). You will want to load each dataset into SQL Lite as an individual table and then you
must join the datasets together in Python into 1 dataset.
Once all the data is merged together in your database, create 5 visualizations that demonstrate
the data you have cleansed. You should have at least 2 visualizations that have data from more
than one source (meaning, if you have 3 tables, you must have visualizations that span across 2
of the tables – you are also welcome to use your consolidated dataset that you created in the
previous step, if you do that, you have met this requirement).

Submit your code for merging and storing in the database, with your code for the visualizations
along with a 250-500-word summary of what you learned and had to do to complete the
project. You can submit a Jupyter Notebook or a PDF of your code. If you submit a .py file you
need to also include a PDF or attachment of your results.

In [28]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math
from bs4 import BeautifulSoup
import requests

In [23]:
# Loading the Financial history data
# reusing the earlier database db_dsc540 for the final project
# Opening a connection object to connect to the database
with sqlite3.connect('db_dsc540') as conn:

    yr = ['2014','2015','2016','2017','2018']
    for i in range(5):
        yr[i]
        filename = "data/" + yr[i] +"_Financial_Data.csv"

        # defining the list of columns to load to the database
        column = ['Unnamed: 0','Revenue','Gross Profit','Operating Expenses','Operating Income','Interest Expense',
                 'Earnings before Tax','Net Income','EPS','Dividend per Share','Gross Margin','EBITDA',
                 'Revenue per Share','Net Income per Share','Market Cap','PE ratio','Price to Sales Ratio',
                 'Debt to Equity','Debt to Assets','Sector']
        df_fin_hist = pd.read_csv(filename, usecols=column)
        # Changing / Replace a column name in the dataframe
        df_fin_hist.rename(columns={'Unnamed: 0': 'Ticker'},inplace = True)
        df_fin_hist.insert(0, 'Year', yr[i])
        df_fin_hist.columns = [column.replace(' ','_') for column in df_fin_hist.columns]
        # write to the existing database into a new table
        df_fin_hist.to_sql('FinancialIndicator',conn, if_exists='append',index=False)
    # df_fin_hist.shape
    # df_fin_hist.head()


In [38]:
list_of_ticker = ['AAPL','AMZN','LULU','BAC','T']
# for i in range(5):
TK = list_of_ticker[0]

url = "https://finance.yahoo.com/quote/" + TK + "/history?period1=1571443200&period2=1603065600&interval=1d&filter=history&frequency=1d&includeAdjustedClose=true"
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')
data = soup.findAll('tr')
headers = data[0]
data_without_header = data[1:]
col_headers = [th.getText() for th in headers.findAll('th')]
df_url_data = [[td.getText() for td in tr.findAll('td')] for tr in data_without_header]
df_eod = pd.DataFrame(df_url_data, columns=col_headers)
df_eod.insert(0,'Ticker', TK)
# remove the last row as it is the footer.
df_eod = df_eod[:-1]
df_eod[["Open","High","Low","Close*","Adj Close**"]] = df_eod[["Open","High","Low","Close*","Adj Close**"]].apply(pd.to_numeric) 
df_eod['Volume'] = df_eod['Volume'].str.replace(',', '').astype(float)

df_eod.columns = [column.replace('*','') for column in df_eod.columns]

# write to the existing database into a new table
# df_final.to_sql('eodstockprice',conn, if_exists='append',index=False)

ValueError: Unable to parse string "4:1 Stock Split" at position 34

In [58]:
# df_eod['Volume'] = df_eod['Volume'].str.replace(',', '').astype(float)
# df_eod[["Open","High","Low","Close*","Adj Close**"]] = df_eod[["Open","High","Low","Close*","Adj Close**"]].apply(pd.to_numeric) 
df_eod[["Open","High"]] = df_eod[["Open","High"]].apply(pd.to_numeric) 

# df_eod.dtypes

ValueError: Unable to parse string "4:1 Stock Split" at position 34

In [62]:
# Writing the SQL statements
cursor = conn.cursor()
sql_stmt = ''' select * from sqlite_master'''
rows = cursor.execute(sql_stmt)
for row in rows:
    print(row)

('table', 'customer', 'customer', 2, 'CREATE TABLE "customer" (\n\t"cust_name"\tTEXT NOT NULL,\n\t"cust_address"\tTEXT,\n\t"cust_city"\tTEXT,\n\t"cust_state"\tTEXT,\n\t"cust_zip"\tTEXT,\n\t"cust_phone"\tINTEGER\n)')
('table', 'FinancialIndicator', 'FinancialIndicator', 3, 'CREATE TABLE "FinancialIndicator" (\n"Ticker" TEXT,\n  "Revenue" REAL,\n  "Gross_Profit" REAL,\n  "Operating_Expenses" REAL,\n  "Operating_Income" REAL,\n  "Interest_Expense" REAL,\n  "Earnings_before_Tax" REAL,\n  "Net_Income" REAL,\n  "EPS" REAL,\n  "Dividend_per_Share" REAL,\n  "Gross_Margin" REAL,\n  "EBITDA" REAL,\n  "Revenue_per_Share" REAL,\n  "Net_Income_per_Share" REAL,\n  "Market_Cap" REAL,\n  "PE_ratio" REAL,\n  "Price_to_Sales_Ratio" REAL,\n  "Debt_to_Equity" REAL,\n  "Debt_to_Assets" REAL,\n  "Sector" TEXT\n)')
('table', 'eodstockprice', 'eodstockprice', 5, 'CREATE TABLE "eodstockprice" (\n"Ticker" TEXT,\n  "Date" TEXT,\n  "Open" REAL,\n  "High" REAL,\n  "Low" REAL,\n  "Close" REAL,\n  "Adj Close" REAL,\

In [89]:
# Reading from a sql query into a pandas dataframe 
pd.read_sql_query(sql_stmt,conn)

ProgrammingError: Cannot operate on a closed database.

In [85]:
# Closing the connection
conn.close()

In [87]:
# Dropping a table

drop_stmt = """drop table FinancialIndicator"""
cursor.execute(drop_stmt)
conn.commit()

OperationalError: database is locked

In [88]:
conn.close()

In [39]:
# Reading the data from the table
pd.read_sql_query('SELECT count(*) from FinancialIndicator',conn)

ProgrammingError: Cannot operate on a closed database.

In [10]:
# sample each Ticker from this table
pd.read_sql_query('''SELECT Ticker,count(*) from FinancialIndicator 
                  group by Ticker
                  ''',conn)

Unnamed: 0,Ticker,count(*)
0,A,1
1,AA,1
2,AABA,1
3,AAL,1
4,AAMC,1
...,...,...
4387,ZTR,1
4388,ZTS,1
4389,ZUMZ,1
4390,ZYME,1


In [12]:
# Show the Financial data for the following five companies by Ticker symbol
pd.read_sql_query('''SELECT * from FinancialIndicator 
                  where Ticker in ('AAPL','AMZN','LULU','BAC','T')
                  ''',conn)

Unnamed: 0,Ticker,Revenue,Gross_Profit,Operating_Expenses,Operating_Income,Interest_Expense,Earnings_before_Tax,Net_Income,EPS,Dividend_per_Share,Gross_Margin,EBITDA,Revenue_per_Share,Net_Income_per_Share,Market_Cap,PE_ratio,Price_to_Sales_Ratio,Debt_to_Equity,Debt_to_Assets,Sector
0,BAC,91247000000.0,91247000000.0,53381000000.0,37866000000.0,0.0,34584000000.0,28147000000.0,2.64,0.54,1.0,36647000000.0,9.0375,2.7878,241821800000.0,9.3333,2.6502,1.6454,0.1854,Financial Services
1,AAPL,265595000000.0,101839000000.0,30941000000.0,70898000000.0,0.0,72903000000.0,59531000000.0,12.01,2.72,0.3834,83806000000.0,53.5973,12.0134,1097649000000.0,18.9226,4.1328,1.0685,0.313,Technology
2,T,170756000000.0,91337000000.0,65241000000.0,26096000000.0,7957000000.0,24290000000.0,19370000000.0,2.85,2.0,0.5349,60677000000.0,23.4619,2.6614,207714100000.0,10.014,1.2164,0.9588,0.3319,Communication Services
3,AMZN,232887000000.0,93731000000.0,81310000000.0,12421000000.0,1417000000.0,11270000000.0,10073000000.0,20.68,0.0,0.4025,28028000000.0,478.2074,20.6838,734416200000.0,72.6291,3.1535,0.7611,0.2038,Technology
4,LULU,2649181000.0,1398790000.0,942789000.0,456001000.0,0.0,459998000.0,258662000.0,1.9,0.0,0.528,568233000.0,19.481,1.9021,10042900000.0,42.0842,3.7909,0.0,0.0,Consumer Cyclical


In [27]:
# showing the sum of Revenue by the Sector
pd.read_sql_query('''SELECT SECTOR,sum(Revenue)/(10000000) as Rev
                     from FinancialIndicator 
                     group by Sector
                     order by Rev
                     ''',conn)

Unnamed: 0,Sector,Rev
0,Real Estate,30078.197054
1,Utilities,57035.007279
2,Basic Materials,108313.885649
3,Communication Services,152409.924612
4,Consumer Defensive,232616.498036
5,Industrials,233905.50191
6,Financial Services,247620.126318
7,Healthcare,248401.957762
8,Technology,274650.377158
9,Consumer Cyclical,317851.13901


In [30]:
# Reading the data from the table
pd.read_sql_query('SELECT count(*) from eodstockprice',conn)

Unnamed: 0,count(*)
0,500


In [31]:
res=pd.read_sql_query('SELECT * from eodstockprice',conn)
res

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Adj Close,Volume
0,AAPL,"Oct 16, 2020",2.30,2.32,2.20,2.21,2.21,478100.0
1,AAPL,"Oct 15, 2020",2.28,2.32,2.20,2.31,2.31,766800.0
2,AAPL,"Oct 14, 2020",2.19,2.26,2.18,2.18,2.18,264200.0
3,AAPL,"Oct 13, 2020",2.22,2.24,2.15,2.20,2.20,252100.0
4,AAPL,"Oct 12, 2020",2.24,2.30,2.16,2.19,2.19,452800.0
...,...,...,...,...,...,...,...,...
495,T,"Jun 03, 2020",2.95,3.03,2.83,2.84,2.84,1139100.0
496,T,"Jun 02, 2020",2.92,2.95,2.79,2.91,2.91,1385500.0
497,T,"Jun 01, 2020",2.95,3.02,2.88,2.88,2.88,873500.0
498,T,"May 29, 2020",2.90,2.97,2.81,2.93,2.93,1055500.0


In [32]:
# these footers should be taken out from the database.
res=pd.read_sql_query('''SELECT * 
                            from eodstockprice
                            where open is null'''
                      ,conn)
res

Unnamed: 0,Ticker,Date,Open,High,Low,Close,Adj Close,Volume
