In [2]:


'''''''''''''''
Goal:  
- Produce the dataset used by the long-term model. The data produced by this pipeline will later be cleaned and used to predict new observations.  

The pipeline receives:  
- Fundamental Data:  
        - BalanceSheet  
        - IncomeStatement  
        - CashFlows  
        - FinancialRatios  
        - CompanyInfo  
        - Dividends  
        - Shares  
- Historical Data: 
        - Daily Prices  
        - S&P Daily Prices    
        
- The fundamental data will be merged into a single dataframe keyed on ticker and date (year and quarter combined).  
- The daily historical data will be resampled to a quarterly frequency in order to calculate the targets.

'''''''''''''''

def _read_bq_table(query, dedupe_on=None, rename=None):
    """Run ``query`` on BigQuery and return the result without duplicate rows.

    Args:
        query: SQL text passed to ``pd.read_gbq``.
        dedupe_on: Column labels that identify a duplicate row; ``None``
            compares whole rows.
        rename: Optional ``{old: new}`` column mapping applied *before*
            de-duplication, so ``dedupe_on`` can refer to the new names.

    Returns:
        The query result as a DataFrame with the first occurrence of each
        duplicate kept.
    """
    table = pd.read_gbq(query=query, project_id=project_id)
    if rename:
        table = table.rename(columns=rename)
    return table.drop_duplicates(subset=dedupe_on, keep='first')


def gen_quarterFundamentals():
    """Build the quarterly fundamentals dataset, one row per symbol and quarter.

    Reads the balance sheet, financial ratio, cash flow, income statement,
    company info, dividend and share tables from BigQuery (using the
    module-level ``project_id`` and ``dataset_name``), left-joins them onto
    the income statement and returns the combined frame.

    Returns:
        pandas.DataFrame with ``symbol`` and ``date`` as the leading columns,
        every numeric column cast to float, sorted by symbol and date.
    """
    #define queries
    balanceSheet_query = f""" 
                SELECT *
                FROM `{project_id}.{dataset_name}.balanceSheet` as df
                order by df.date asc
                
            """
    finRatios_query = f""" 
                SELECT *
                FROM `{project_id}.{dataset_name}.financialRatios` as df
                order by df.date asc
        
            """
    incomeStatement_query = f""" 
                SELECT *
                FROM `{project_id}.{dataset_name}.incomeStatement` as df
                order by df.date asc

            """
    cashFlow_query = f""" 
                SELECT *
                FROM `{project_id}.{dataset_name}.cashFlows` as df
                order by df.date asc

            """
    companyInfo_query = f""" 
                SELECT *
                FROM `{project_id}.{dataset_name}.companyInfo` as df
            """

    dividends_query = f""" 
                SELECT *
                FROM `{project_id}.{dataset_name}.historicalDividends` as df
            """

    shares_query = """ 
                SELECT *
                FROM `stockmarket-v0.stock_market.stock_shares` as df
            """

    statement_keys = ['symbol', 'date']

    ##### Import the statement tables (one row per symbol and report date)
    balancesheet = _read_bq_table(balanceSheet_query, dedupe_on=statement_keys)
    financialratios = _read_bq_table(finRatios_query, dedupe_on=statement_keys)
    cashflows = _read_bq_table(cashFlow_query, dedupe_on=statement_keys)
    income_statement = _read_bq_table(incomeStatement_query,
                                      dedupe_on=statement_keys)

    ##### Import company info and shares (exact duplicate rows removed)
    company_info = _read_bq_table(companyInfo_query)

    ##### Import dividends
    # 'ticker' must be renamed to 'symbol' before de-duplicating: the subset
    # refers to 'symbol', which would not exist yet on a 'ticker'-keyed table.
    dividend = _read_bq_table(dividends_query,
                              dedupe_on=statement_keys,
                              rename={'ticker': 'symbol'})

    shares = _read_bq_table(shares_query)

    # Inner join: dividends of symbols missing from company_info are dropped.
    dividend = pd.merge(dividend,
                        company_info[['sector', 'symbol']],
                        on='symbol')
    dividend['date'] = pd.to_datetime(dividend['date'])
    dividend['quarter'] = dividend['date'].dt.quarter
    dividend['year'] = dividend['date'].dt.year

    # The income statement is the base table everything else is joined onto.
    data = income_statement.drop(columns=['cik']).copy()
    data['date'] = pd.to_datetime(data['date'], utc=True)

    #rename cash-flow columns that would clash with balance sheet / income columns
    cashflows = cashflows.rename(columns={
        'netIncome': 'netIncome_cf',
        'inventory': 'inventory_cf',
        'depreciationAndAmortization': 'depreciationAndAmortization_cf',
    })

    #drop bookkeeping columns that the income statement already carries
    shared_columns = ['link', 'cik', 'calendarYear',
                      'finalLink', 'acceptedDate', 'reportedCurrency']
    balancesheet = balancesheet.drop(columns=shared_columns)
    cashflows = cashflows.drop(columns=shared_columns)

    #merge all statement tables on symbol and date
    for table in (balancesheet, cashflows, financialratios):
        table['date'] = pd.to_datetime(table['date'], utc=True)

        # 'period' is always dropped; 'fillingDate' only where the table has it.
        redundant = ['period']
        if 'fillingDate' in table.columns:
            redundant.append('fillingDate')

        data = pd.merge(data,
                        table.drop(columns=redundant),
                        on=statement_keys,
                        how='left')

    #create quarter & year columns
    data['quarter'] = data['date'].dt.quarter
    data['year'] = data['date'].dt.year

    #merge data with shares info (keyed on symbol only)
    data = pd.merge(data,
                    shares[['symbol', 'floatShares']],
                    on='symbol',
                    how='left')

    #merge data with dividend on symbol, year and quarter
    # NOTE: a symbol with several dividend rows in one quarter fans out here;
    # the de-duplication below then keeps only the first of those rows.
    data = pd.merge(data,
                    dividend[['symbol', 'year', 'quarter', 'dividend']],
                    on=['symbol', 'year', 'quarter'],
                    how='left')

    #merge data with company info on symbol
    data = pd.merge(data,
                    company_info[['symbol', 'sector', 'industry']],
                    on='symbol',
                    how='left')

    #keep one row per symbol and quarter
    data = data.drop_duplicates(['quarter', 'year', 'symbol'], keep='first')

    # The index round-trip moves symbol and date to the front of the column
    # order and leaves a fresh 0..n-1 index.
    data = data.set_index(['symbol', 'date']).reset_index()

    #cast every numeric column to float
    metric_features = data.select_dtypes(include=np.number).columns
    data = data.astype({col: float for col in metric_features})

    #sort values by symbol and date
    return data.sort_values(['symbol', 'date'])

import os
import sys
import time

import numpy as np
import pandas as pd

sys.path.insert(1, r'C:\Users\rafae\Personal\Github\GCP-Stock_Market\FMP_API')
sys.path.insert(2, r'C:\Users\rafae\Personal\Github\GCP-Stock_Market\InsertData')

import FMP_API as API
import insertBQ_Class as insert


# API key, presumably for the FMP_API client imported above (it is not used in
# this cell). Read it from the environment so the secret is not committed with
# the source; the key that used to be hard-coded here should be rotated.
key = os.environ.get('FMP_API_KEY')

start_time = time.time()

#set big query variables
project_id = 'stockmarket-v0'
table_name = 'QuarterlyFundamentals'
dataset_name = 'stockMarket_dev'

#set the fully qualified destination table path
destionation_table_name = f'{project_id}.{dataset_name}.{table_name}'

print(f'Project ID: {project_id}')
print(f'Dataset Name: {dataset_name}')
print(f'Inserting Data into: {table_name}')

#generate quarterly fundamentals
data = gen_quarterFundamentals()

#query returning the keys already stored in the destination table,
#handed to the inserter for duplicate checking
query =  f"""
    SELECT df.symbol,df.date,df.fillingDate
    FROM `{destionation_table_name}` as df 
    
    """
#columns that identify a row for duplicate checking
index_list = ['symbol','date']

#create the insert-data object
insertBQ = insert.insertData(projectID=project_id,
                             dataset_name=dataset_name,
                             duplicates=True)

#insert data
insertBQ.insertData(data=data.iloc[:],
                    table_name=table_name,
                    query=query,
                    index_list=index_list)

Project ID: stockmarket-v0
Dataset Name: stockMarket_dev
Inserting Data into: QuarterlyAlpha
