In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import json
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



# Outline

1. **Load data**
2. **Data Preprocessing**
3. **Correlation for features**
4. **Consider NaN**
5. **ML model**
6. **Conclusion**

## 1. Load data

**Setting class**

In [None]:
class LongTermStrategy:
    """Read price and financial-statement tables for a group of tickers.

    Every table is a JSON file located at
    ``<url>/data_origin/FS_<etfname>_<kind>.json`` and is parsed with
    ``pandas.read_json``.  The statement tables are stored in long format
    (one row per ticker and attribute); the ``get_*`` methods reshape them
    into frames indexed by ticker.
    """

    # Exponent appended to the numeric part of an abbreviated amount,
    # e.g. "2.29T" -> float("2.29e12").
    _MAGNITUDE_EXPONENTS = {'k': 'e3', 'K': 'e3', 'M': 'e6', 'B': 'e9', 'T': 'e12'}

    def __init__(self, url, etfname):
        self.url = url
        self.etfname = etfname

    ###################################################################################################
    ## Private helpers

    def _table_path(self, kind):
        """Return the JSON path of the table called ``kind`` (e.g. ``'stats'``)."""
        return self.url + '/data_origin/FS_' + self.etfname + '_' + kind + '.json'

    def _read_table(self, kind):
        """Load a statement table, parsing each file only once per instance.

        The metric getters all start from the same raw table, so the parsed
        frame is memoised by path and a copy is handed out to every caller.
        """
        path = self._table_path(kind)
        cache = self.__dict__.setdefault('_table_cache', {})
        if path not in cache:
            cache[path] = pd.read_json(path)
        return cache[path].copy()

    @staticmethod
    def _resolve_tickers(etf_list, default):
        """Return ``etf_list`` as a fresh list, or ``default`` when it is None."""
        return list(default if etf_list is None else etf_list)

    @classmethod
    def _parse_amount(cls, value):
        """Convert an abbreviated amount such as ``'2.29T'`` or ``'450B'`` to float.

        Non-string values are returned unchanged.  A string that is not a
        number (after removing commas and the magnitude suffix) raises
        ``ValueError``.
        """
        if not isinstance(value, str):
            return value
        text = value.strip().replace(',', '')
        exponent = cls._MAGNITUDE_EXPONENTS.get(text[-1:], '')
        if exponent:
            text = text[:-1]
        # Building "2.29e12" and parsing it once avoids the rounding error of
        # multiplying 2.29 by 1e12.
        return float(text + exponent)

    @staticmethod
    def _parse_percent(value):
        """Convert a percentage string such as ``'1,234.50%'`` to float.

        Non-string values are returned unchanged; the ``%`` sign is only
        removed when it is actually present.
        """
        if not isinstance(value, str):
            return value
        text = value.strip().replace(',', '')
        if text.endswith('%'):
            text = text[:-1]
        return float(text)

    @staticmethod
    def _metric_column(table, key_col, label, value_col, new_name, exact=False):
        """Select the rows of one metric and expose its value as ``new_name``.

        Rows are chosen where ``key_col`` equals ``label`` (``exact=True``) or
        contains it as a literal substring.  ``key_col`` and ``value_col`` are
        dropped, any remaining columns are kept, and the result is indexed by
        ``Ticker``.
        """
        if exact:
            mask = table[key_col] == label
        else:
            mask = table[key_col].str.contains(label, regex=False, na=False)
        picked = table[mask].copy()
        picked[new_name] = picked[value_col]
        return picked.drop([key_col, value_col], axis=1).set_index('Ticker')

    def _float_metric(self, table, key_col, label, value_col, new_name):
        """Metric frame cast to float (raises if a value is not numeric)."""
        return self._metric_column(table, key_col, label, value_col, new_name).astype(float)

    def _percent_metric(self, label, new_name):
        """Metric from the additional stats whose values are percentage strings."""
        frame = self._metric_column(self.get_addstats(), 'Attribute', label, 'Value', new_name)
        frame[new_name] = frame[new_name].map(self._parse_percent)
        return frame.astype(float)

    def _elements(self, table, key_col, value_col, etf_list):
        """Reshape a long table into one row per ticker and one column per key.

        Columns appear in the order the keys are first met while walking the
        tickers in ``etf_list`` order; tickers without data keep NaN in every
        column.  The requested tickers are also stored on ``self.etf_list``.
        """
        tickers = self._resolve_tickers(etf_list, default=('AAPL',))
        self.etf_list = tickers

        # Single pass over the relevant rows: ticker -> {key: value}.
        wanted = table[table['Ticker'].isin(tickers)]
        values_by_ticker = {}
        for ticker, key, value in zip(wanted['Ticker'], wanted[key_col], wanted[value_col]):
            values_by_ticker.setdefault(ticker, {})[key] = value

        ordered_keys = {}
        for ticker in tickers:
            ordered_keys.update(dict.fromkeys(values_by_ticker.get(ticker, ())))
        columns = list(ordered_keys)

        data = {
            col: [values_by_ticker.get(ticker, {}).get(col, np.nan) for ticker in tickers]
            for col in columns
        }
        return pd.DataFrame(data, index=tickers, columns=columns, dtype=object)

    ###################################################################################################
    ## Raw tables

    def get_ticker_list(self):
        """Return the sorted unique tickers found in the stats table."""
        return sorted(set(self.get_stats()['Ticker']))

    def get_price_data(self, etf_list=None, OnlyRecent=False):
        """Return the price table, or only the latest adjusted close per ticker.

        With ``OnlyRecent`` the result has one row per requested ticker and a
        single ``Recent_price`` column holding the last ``Adj Close`` row of
        exactly that ticker (NaN when the ticker has no rows).  Otherwise a
        copy of the whole price table is returned.
        """
        symbols = self._resolve_tickers(etf_list, default=())
        self.etf_list = symbols
        combined_price = pd.read_json(self._table_path('Value'))
        if not OnlyRecent:
            return combined_price.copy()

        unique_symbols = list(dict.fromkeys(symbols))
        latest = []
        for symbol in unique_symbols:
            # Exact match: a substring test would let "A" pick up "AAPL" rows.
            closes = combined_price.loc[combined_price['Ticker'] == symbol, 'Adj Close']
            latest.append(closes.iloc[-1] if len(closes) else np.nan)
        return pd.DataFrame({'Recent_price': latest}, index=unique_symbols)

    def get_stats(self, preprocessing=False):
        """Return the raw stats table, or the per-ticker valuation metrics."""
        df = self._read_table('stats')
        if preprocessing:
            df = pd.concat(
                [
                    self.get_PER(),     # Trailing P/E
                    self.get_PSR(),     # Price/Sales
                    self.get_PBR(),     # Price/Book
                    self.get_PEG(),     # PEG
                    self.get_FORPER(),  # Forward P/E
                    self.get_CAP(),     # Market cap
                ],
                axis=1,
            )
        return df

    def get_addstats(self, preprocessing=False):
        """Return the raw additional-stats table, or the per-ticker metrics."""
        df = self._read_table('addstats')
        if preprocessing:
            df = pd.concat(
                [
                    self.get_Beta(),
                    self.get_DivRate(),  # Trailing annual dividend rate
                    self.get_ROE(),      # Return on equity
                    self.get_ROA(),      # Return on assets
                    self.get_PM(),       # Profit margin
                    self.get_Cash(),     # Total cash per share
                    self.get_Debt(),     # Total debt/equity
                ],
                axis=1,
            )
        return df

    def get_balsheets(self, preprocessing=False):
        """Return the raw balance-sheet table, or only the total-assets column."""
        df = self._read_table('balsheets')
        if preprocessing:
            df = pd.concat([self.get_TA()], axis=1)
        return df

    def get_income(self, preprocessing=False):
        """Return the raw income table, or only the total-revenue column."""
        df = self._read_table('income')
        if preprocessing:
            df = pd.concat([self.get_TR()], axis=1)
        return df

    def get_flow(self, preprocessing=False):
        """Return the raw cash-flow table, or dividends paid plus issuance."""
        df = self._read_table('flow')
        if preprocessing:
            df = pd.concat([self.get_DIV(), self.get_ISS()], axis=1)
        return df

    ###################################################################################################
    ## Wide tables (one row per ticker, one column per attribute)

    def get_stats_element(self, etf_list=None):
        """Wide table of the ``Recent`` stats values (defaults to ``['AAPL']``)."""
        return self._elements(self.get_stats(), 'Attribute', 'Recent', etf_list)

    def get_addstats_element(self, etf_list=None):
        """Wide table of the additional-stats ``Value`` entries (defaults to ``['AAPL']``)."""
        return self._elements(self.get_addstats(), 'Attribute', 'Value', etf_list)

    def get_balsheets_element(self, etf_list=None):
        """Wide float table of the balance-sheet breakdown (defaults to ``['AAPL']``)."""
        return self._elements(self.get_balsheets(), 'Breakdown', 'Recent', etf_list).astype(float)

    def get_income_element(self, etf_list=None):
        """Wide float table of the income-statement breakdown (defaults to ``['AAPL']``)."""
        return self._elements(self.get_income(), 'Breakdown', 'Recent', etf_list).astype(float)

    def get_flow_element(self, etf_list=None):
        """Wide float table of the cash-flow breakdown (defaults to ``['AAPL']``)."""
        return self._elements(self.get_flow(), 'Breakdown', 'Recent', etf_list).astype(float)

    ###################################################################################################
    ## Metrics taken from the stats table

    def get_PER(self):
        """Trailing P/E as float column ``PER``; non-numeric entries become NaN."""
        df_per = self._metric_column(self.get_stats(), 'Attribute', 'Trailing P/E', 'Recent', 'PER')
        return df_per.apply(pd.to_numeric, errors='coerce').astype(float)

    def get_PSR(self):
        """Price/Sales as float column ``PSR``; non-numeric entries become NaN."""
        df_psr = self._metric_column(self.get_stats(), 'Attribute', 'Price/Sales', 'Recent', 'PSR')
        return df_psr.apply(pd.to_numeric, errors='coerce').astype(float)

    def get_PBR(self):
        """Price/Book as float column ``PBR``."""
        return self._float_metric(self.get_stats(), 'Attribute', 'Price/Book', 'Recent', 'PBR')

    def get_PEG(self):
        """PEG ratio as float column ``PEG``."""
        return self._float_metric(self.get_stats(), 'Attribute', 'PEG', 'Recent', 'PEG')

    def get_FORPER(self):
        """Forward P/E as numeric column ``forPER``; non-numeric entries become NaN."""
        df_forper = self._metric_column(self.get_stats(), 'Attribute', 'Forward P/E', 'Recent', 'forPER')
        return df_forper.apply(pd.to_numeric, errors='coerce')

    def get_CAP(self):
        """Market cap as float column ``marketCap``.

        Abbreviated strings ("2.29T", "450B", "1.5M", "12k") are expanded to
        their full value regardless of how many decimals they carry.
        """
        df_cap = self._metric_column(self.get_stats(), 'Attribute', 'Cap', 'Recent', 'marketCap')
        df_cap['marketCap'] = df_cap['marketCap'].map(self._parse_amount)
        return df_cap.astype(float)

    ###################################################################################################
    ## Metrics taken from the additional stats table

    def get_Beta(self):
        """Beta as float column ``Beta``."""
        return self._float_metric(self.get_addstats(), 'Attribute', 'Beta', 'Value', 'Beta')

    def get_DivRate(self):
        """Trailing annual dividend rate as float column ``AnnualDividendRate``."""
        return self._float_metric(
            self.get_addstats(), 'Attribute', 'Trailing Annual Dividend Rate', 'Value', 'AnnualDividendRate'
        )

    def get_ROE(self):
        """Return on equity, in percent, as float column ``ROE(%)``."""
        return self._percent_metric('Return on Equity', 'ROE(%)')

    def get_ROA(self):
        """Return on assets, in percent, as float column ``ROA(%)``."""
        return self._percent_metric('Return on Assets', 'ROA(%)')

    def get_PM(self):
        """Profit margin, in percent, as float column ``ProfitMargin(%)``."""
        return self._percent_metric('Profit Margin', 'ProfitMargin(%)')

    def get_Cash(self):
        """'Total Cash Per Share' rows exposed as float column ``TotalCash``."""
        return self._float_metric(self.get_addstats(), 'Attribute', 'Total Cash Per Share', 'Value', 'TotalCash')

    def get_Debt(self):
        """'Total Debt/Equity' rows exposed as float column ``TotalDebt``."""
        return self._float_metric(self.get_addstats(), 'Attribute', 'Total Debt/Equity', 'Value', 'TotalDebt')

    ###################################################################################################
    ## Metrics taken from the balance sheet, income statement and cash flow

    def get_TA(self):
        """Balance-sheet ``totalAssets`` rows as column ``TotalAssets``."""
        return self._metric_column(
            self.get_balsheets(), 'Breakdown', 'totalAssets', 'Recent', 'TotalAssets', exact=True
        )

    def get_TR(self):
        """Income-statement ``totalRevenue`` rows as column ``TotalRevenue``."""
        return self._metric_column(
            self.get_income(), 'Breakdown', 'totalRevenue', 'Recent', 'TotalRevenue', exact=True
        )

    def get_DIV(self):
        """Cash-flow ``dividendsPaid`` rows as column ``DividendsPaid``."""
        return self._metric_column(
            self.get_flow(), 'Breakdown', 'dividendsPaid', 'Recent', 'DividendsPaid', exact=True
        )

    def get_ISS(self):
        """Cash-flow ``issuanceOfStock`` rows as column ``Issuance``."""
        return self._metric_column(
            self.get_flow(), 'Breakdown', 'issuanceOfStock', 'Recent', 'Issuance', exact=True
        )


In [None]:
# Dataset location and the file-name stem shared by every loader call below.
filename = 'sp500'
url = '/kaggle/input/sp-500-stocks-value-with-financial-statement'

strategy = LongTermStrategy(url=url, etfname=filename)

# Sorted, de-duplicated tickers taken from the stats table.
sp500_list = strategy.get_ticker_list()
print(sp500_list)

## 2. Data Preprocessing

**2.1 price as time series**

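Extracting the recent price from the full price history takes a long time on a large dataset, so a pre-computed file of recent prices is loaded instead.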

In [None]:
# Alternative that derives the recent price from the full price history:
#   df_price = strategy.get_price_data(etf_list=sp500_list, OnlyRecent=True)
# Here the ready-made recent-price file is read directly.
recent_price_path = url + '/data_origin/FS_sp500_Recent_Value.json'
df_price = pd.read_json(recent_price_path)
df_price.head()

**2.2 Fundamental stats**

In [None]:
# Valuation ratios and market cap, indexed by ticker.
df_stats = strategy.get_stats(preprocessing=True)
df_stats.head()

**2.3 Additional stats**

In [None]:
# Beta, dividend rate, profitability, cash and debt metrics, indexed by ticker.
df_addstats = strategy.get_addstats(preprocessing=True)
df_addstats.head()

**2.4 Balance sheets**

In [None]:
# Wide balance-sheet table: one row per ticker, one column per breakdown item.
# (strategy.get_balsheets(True) would keep only the total-assets column.)
df_balsheets = strategy.get_balsheets_element(etf_list=sp500_list)
df_balsheets.head()

**2.5 Income statement**

In [None]:
# Wide income-statement table: one row per ticker, one column per breakdown item.
# (strategy.get_income(True) would keep only the total-revenue column.)
df_income = strategy.get_income_element(etf_list=sp500_list)
df_income.head()

**2.6 Cash flow**

In [None]:
# Wide cash-flow table: one row per ticker, one column per breakdown item.
# (strategy.get_flow(True) would keep only dividends paid and stock issuance.)
df_flow = strategy.get_flow_element(etf_list=sp500_list)
df_flow.head()

**2.7 Merge dataframe**

In [None]:
# Place all tables side by side, aligned on their row index.
# Missing values are deliberately kept at this stage: section 4 measures the
# NaN ratio per row and per column before deciding what to drop or fill.
df = pd.concat(
    [df_price, df_stats, df_addstats, df_balsheets, df_income, df_flow],
    axis=1,
)
df.head(5)

**2.8 Check numeric dtype**

In [None]:
from pandas.api.types import is_numeric_dtype

# One boolean per column: True when the column holds a numeric dtype.
num_cols = list(map(is_numeric_dtype, df.dtypes))
print(num_cols)

## 3. Correlation for features

**3.1 Split train and test data for correlation**

In [None]:
from sklearn.model_selection import train_test_split

# A fixed seed keeps the split, and therefore the correlation table and heatmap
# below, identical across "Restart & Run All". The value 7 matches the
# random_state used for the XGBoost model in this notebook.
train_df_corr, test_df_corr = train_test_split(df.copy(), test_size=0.2, random_state=7)
train_df_corr.head(5), test_df_corr.head(5)

**3.2 Calculation**

In [None]:
# Pairwise correlation of the training columns; keep the features whose absolute
# correlation with the recent price exceeds 0.2.
corrmat = train_df_corr.corr()
price_corr_strength = corrmat['Recent_price'].abs()
top_corr_features = corrmat.index[price_corr_strength > 0.2]
top_corr_features

**3.3 Heatmap**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the correlations among the selected features only.
plt.figure(figsize=(13, 10))
top_corr_matrix = train_df_corr[top_corr_features].corr()
plt_corr = sns.heatmap(data=top_corr_matrix, annot=True)

## 4. Consider NaN

How to handle NaN data

We need to decide how to handle the NaN data. First of all, we remove the rows (tickers) whose ratio of NaN values is over 0.5 and the columns whose ratio of NaN values is 0.5 or more.

Before handling the NaN values, we have to select the tickers to analyse.

In my case, I select the energy sector of the S&P 500 index.

In [None]:
# Energy-sector tickers that make up the portfolio under analysis.
energy_list = [
    'APA', 'COG', 'COP', 'CVX', 'DVN', 'EOG', 'FANG',
    'HAL', 'HES', 'KMI', 'MPC', 'MRO', 'NOV', 'OKE',
    'OXY', 'PSX', 'PXD', 'SLB', 'VLO', 'WMB', 'XOM',
]
port_list = energy_list

**4.1 Remove index**

In [None]:
# Count the missing values of every row (ticker) in one vectorised pass
# instead of rebuilding the removal list once per row.
null_count = df.isnull().sum(axis=1)
df_index_null = pd.DataFrame({
    'TotalNull': null_count,
    'PercentOfNull': null_count / df.shape[1],
})

# Rows that are more than half empty are dropped, except the portfolio tickers:
# those are needed later as the test set, so they are always kept.
too_sparse = df_index_null['PercentOfNull'] > 0.5
in_portfolio = df_index_null.index.isin(port_list)
remove_index = df_index_null[too_sparse & ~in_portfolio].index.tolist()

df = df.drop(remove_index, axis=0)
df.info()


**4.2 Remove columns**

In [None]:
# Missing-value count and ratio per column, largest first.
null_counts = df.isnull().sum()
nulltotal = null_counts.sort_values(ascending=False)
nullpercent = (null_counts / len(df)).sort_values(ascending=False)
nullpoint = pd.concat(
    [nulltotal, nullpercent],
    axis=1,
    keys=['Total number of null', 'Percent of null'],
)
print(nullpoint)

# Drop every column in which at least half of the rows are missing.
remove_cols = nullpercent[nullpercent >= 0.5].index
df = df.drop(columns=remove_cols)

# Report what is still missing after the drop.
remaining_nulls = df.isnull().sum()
print(remaining_nulls.max())
newtotal = remaining_nulls.sort_values(ascending=False)
print(newtotal)

In [None]:
# Replace the remaining missing values with 0.
# `newtotal.index` lists every column of `df`, so the whole frame is filled at once.
numeric_missed = newtotal.index
df = df.fillna(0)

print('Re check')
print(df.isnull().sum().max())

In [None]:
## Feature Engineering
from scipy.stats import norm, skew

# Skewness of every non-object column, most positively skewed first.
numeric_feats = df.dtypes.loc[lambda kinds: kinds != 'object'].index
skewed_feats = df[numeric_feats].apply(skew).sort_values(ascending=False)

# Features whose absolute skewness exceeds 0.5.
high_skew = skewed_feats[skewed_feats.abs() > 0.5]
print(high_skew)

# Optional log transform of the highly skewed features (currently disabled):
#for feature in high_skew.index:
#    df[feature] = np.log1p(df[feature]-df[feature].min()+1)

## 5. ML model

**5.1 Split train and test for ML**

The target column is **Market Cap**; it is separated from the feature columns below.

In [None]:
# Target: market cap. The recent price and the target itself are removed from the
# feature table.
y_df = df['marketCap']
df = df.drop(columns=['Recent_price', 'marketCap'])


In [None]:
# Turn the target into a single-column DataFrame so it can be sliced with
# `.loc[rows, :]` and concatenated with the predictions later.
y_df = y_df.to_frame()

In [None]:
# Portfolio tickers that are actually present form the test set; all other rows
# are used for training. Dropping by the same intersection (rather than by the
# raw `port_list`) avoids a KeyError when a portfolio ticker is missing from
# the data.
y_test_tickers = y_df.index.intersection(port_list)
x_test_tickers = df.index.intersection(port_list)

y_test = y_df.loc[y_test_tickers, :]
x_test = df.loc[x_test_tickers, :]
y_train = y_df.drop(y_test_tickers, axis=0)
x_train = df.drop(x_test_tickers, axis=0)


In [None]:
# Row labels of the test set, reused as the `Ticker` column of the prediction table.
test_index = x_test.index

**5.2 Modeling**

In [None]:
import xgboost as XGB

# Gradient-boosted regressor with fixed hyper-parameters and seed.
xgb_params = dict(
    colsample_bytree=0.4603,
    gamma=0.0468,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1.7817,
    n_estimators=2200,
    reg_alpha=0.4640,
    reg_lambda=0.8571,
    subsample=0.5213,
    random_state=7,
    nthread=-1,
)
the_model = XGB.XGBRegressor(**xgb_params)

# To solve error of features_names: keep only the first occurrence of each
# repeated column label in both feature tables.
x_train = x_train.loc[:, ~x_train.columns.duplicated()]
duplicate_columns = x_train.columns[x_train.columns.duplicated()]
x_test = x_test.loc[:, ~x_test.columns.duplicated()]
duplicate_columns_t = x_test.columns[x_test.columns.duplicated()]

the_model.fit(x_train, y_train)

# Predictions for the portfolio tickers, rounded down to whole numbers.
y_predict = np.floor(the_model.predict(x_test))

# Predicted vs. actual market cap per ticker, sorted by their ratio (largest first).
sub = pd.DataFrame(
    {'MarketCapOfPrediction': y_predict},
    index=pd.Index(test_index, name='Ticker'),
)
sub = pd.concat([sub, y_test], axis=1)
sub['Ratio'] = sub['MarketCapOfPrediction'] / sub['marketCap']
sub = sub.sort_values(by='Ratio', ascending=False)
print(sub)

from sklearn.metrics import mean_squared_error

# Mean squared error between the actual and the predicted market cap.
acc = mean_squared_error(y_test, y_predict)
print('mse: ', acc)

## 6. Conclusion

These results estimate the market cap as of the date of the data (2021-07-04).

In the result above, the tickers with a ratio over 1 (predicted market cap higher than the actual one) can be read as currently having a **low estimation** by the market.

It is up to the individual to judge how to evaluate this result.


If you want to use this code, I'm sorry, but you will need to change the code and the data directory layout a little.

I referred to many blogs found through Google, and I appreciate them. Thanks a lot.

If you liked this post, please check out the [Github](https://github.com/hanseopark/Stock/tree/master/Strategy) repository and give it a **Star** :)
