In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
sys.path.append(Path(".").absolute().parent.as_posix())

In [3]:
from src import model_data as md

In [4]:
print(md.SingleTickerPipeline.__init__.__doc__)


        Parameters
        ----------
        target: str
            target can be "price", "return" or "log_return"
        target_type: str
            target type can be "single" for single point-in-time prediction 
            or "sequence" for sequence prediction (predicts a sequence of target shifted one day into the future)
            if single, output y shape is (N, 1)
            if sequence, output y shape is (N, model_seq_len, 1)
        model_seq_len: int
            model sequence length specifies the sequence length of each input sample. 
            E.g. 30 means using the past 30 days's historical data to predict the next day
        max_overlap: int
            maximum number of overlapping days between two sequences. Will be capped at model_seq_len - 1
            if it is larger than model_seq_len
        train_periods: list(tuple(str, str))
            training periods is a list of tuples, each tuple has a start date and an end date. 
            Data from all tr

In [5]:
single_ticker_pipeline = md.SingleTickerPipeline(
    target="price",
    target_type="single",
    model_seq_len=30,
    max_overlap=20,
    train_periods=[
        ("2000-01-01", "2006-12-31"),
        ("2009-01-01", "2018-12-31"),
    ],
    test_periods=[
        ("2007-01-01", "2008-12-31"),
        ("2019-01-01", "2021-04-01"),
    ],
    normalization_method="log",
#     lookback_period=200,
    cross_validation_folds=5,)

In [6]:
single_ticker_pipeline.prepare_data("TEAM")

INFO:src.model_data:Reading data from /home/rluo/raid/classes/gatech/cs7643/GATech-CS7643-Project-Group/data/feature_selected/TEAM.csv...
INFO:src.model_data:Making training arrays...
INFO:numexpr.utils:Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:src.model_data:  Training has 73 sequences of length 30.
INFO:src.model_data:Making 5 validation folds...
INFO:src.model_data:  Generating folds with fold_size=11 and distance between train and validation being 3
INFO:src.model_data:    Fold 0 shapes:
INFO:src.model_data:      x: (11, 30, 53), y: (11, 1)
INFO:src.model_data:      x: (11, 30, 53), y: (11, 1)
INFO:src.model_data:    Fold 1 shapes:
INFO:src.model_data:      x: (22, 30, 53), y: (22, 1)
INFO:src.model_data:      x: (11, 30, 53), y: (11, 1)
INFO:src.model_data:    Fold 2 shapes:
INFO:src.model_data:      x: (33, 30, 53), y: (33, 1)
INFO:src.model_data:      x: (11, 30, 53), 

In [7]:
# If loading existing data:
single_ticker_pipeline.load_data("TEAM")

INFO:src.model_data:Loading generated data from /home/rluo/raid/classes/gatech/cs7643/GATech-CS7643-Project-Group/data/model_data/TEAM...
INFO:src.model_data:  Loading train folds...
INFO:src.model_data:  Loading test arrays...


### Train dictionary structure

In [14]:
single_ticker_pipeline.print_train_shapes()

{
    "0": {
        "train": {
            "x": "(11, 30, 53)",
            "y": "(11, 1)",
            "target_date": "(11, 1)"
        },
        "valid": {
            "x": "(11, 30, 53)",
            "y": "(11, 1)",
            "target_date": "(11, 1)"
        }
    },
    "1": {
        "train": {
            "x": "(22, 30, 53)",
            "y": "(22, 1)",
            "target_date": "(22, 1)"
        },
        "valid": {
            "x": "(11, 30, 53)",
            "y": "(11, 1)",
            "target_date": "(11, 1)"
        }
    },
    "2": {
        "train": {
            "x": "(33, 30, 53)",
            "y": "(33, 1)",
            "target_date": "(33, 1)"
        },
        "valid": {
            "x": "(11, 30, 53)",
            "y": "(11, 1)",
            "target_date": "(11, 1)"
        }
    },
    "3": {
        "train": {
            "x": "(44, 30, 53)",
            "y": "(44, 1)",
            "target_date": "(44, 1)"
        },
        "valid": {
            "x": "(11

### Test dictionary structure

In [15]:
single_ticker_pipeline.print_test_shapes()

{
    "N": 537,
    "target_date": "(537, 1)",
    "x": "(537, 30, 53)",
    "y": "(537, 1)"
}


# Drafts

In [4]:
from pathlib import Path
import pandas as pd
import numpy as np

In [5]:
__file__ = Path("./create_training_data.ipynb")

In [6]:
data_path = Path(__file__).absolute().parent.parent.joinpath("data/feature_selected")

In [7]:
data_path

PosixPath('/home/rluo/raid/classes/gatech/cs7643/GATech-CS7643-Project-Group/data/feature_selected')

In [8]:
data_files = list(data_path.glob("*.csv"))

## Config

In [107]:
# target can be "price", "return" or "log_return"
target = "price"

# target type can be single prediction or sequence prediction
target_type = "sequence"

# model sequence length specifies the sequence length of each input sample. E.g. 30 means using the past 30 days's historical data to predict the next day
model_seq_len = 30

# maximum number of overlapping days of historical data between two records
max_overlap = 20

# training periods is a list of tuples, each tuple has a start date and an end date. Data from all training periods are put together
# Note that training periods will be further divided into training and valiation, or time series cross validation
train_periods = [
    ("2000-01-01", "2006-12-31"),
    ("2010-01-01", "2018-12-31")
]

# testing periods - similar to training periods
test_periods = [
    ("2007-01-01", "2009-12-31"),
    ("2019-01-01", "2021-04-01")
]

cross_validation_folds = 5

## Single Ticker

In [105]:
__name__ = "create_training_data"

In [12]:
from pathlib import Path
import pandas as pd
import numpy as np
import logging
import pickle
import json
from sklearn.preprocessing import StandardScaler

LOG = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

class SingleTickerPipeline:
    def __init__(
        self,
        target="price",
        target_type="sequence",
        model_seq_len=30,
        max_overlap=20,
        train_periods=[
            ("2000-01-01", "2006-12-31"),
            ("2009-01-01", "2018-12-31"),
        ],
        test_periods=[
            ("2007-01-01", "2008-12-31"),
            ("2019-01-01", "2021-04-01"),
        ],
        normalization_method="log",
        lookback_period=200,
        cross_validation_folds=5,
        data_path=Path(__file__).absolute().parent.parent.joinpath("data/feature_selected"),
        output_path=Path(__file__).absolute().parent.parent.joinpath("data/model_data"),
    ):
        """
        Parameters
        ----------
        target: str
            target can be "price", "return" or "log_return"
        target_type: str
            target type can be "single" for single point-in-time prediction 
            or "sequence" for sequence prediction (predicts a sequence of target shifted one day into the future)
            if single, output y shape is (N, 1)
            if sequence, output y shape is (N, model_seq_len, 1)
        model_seq_len: int
            model sequence length specifies the sequence length of each input sample. 
            E.g. 30 means using the past 30 days's historical data to predict the next day
        max_overlap: int
            maximum number of overlapping days between two sequences. Will be capped at model_seq_len - 1
            if it is larger than model_seq_len
        train_periods: list(tuple(str, str))
            training periods is a list of tuples, each tuple has a start date and an end date. 
            Data from all training periods are put together
            Note that training periods will be further divided into time series cross validation
        test_periods: list(tuple(str, str))
            similar to training periods
        normalization_method: str
            how features are normalized within each sequence
            None: no normalization performed
            "log": feature x is transformed into sign(x) * log(1 + |x|)
            "quantile": feature x is transformed into (x - P50) / (P75 - P25), where P25, P50 and P75 are 
                the 25th, 50th and 75th quantile of x in the past lookback_period records (if available)
        lookback_period: int
            number of records from the past used to estimate quantiles, only used if normalization_method is set to "quantile"
        cross_validation_folds: int
            number of folds for rolling cross validation
        data_path: str or pathlib.Path
            path to the input data directory. Default: project_root/data/feature_selected
        output_path: str or pathlib.Path
            root path to store the output data. Default: project_root/data/model_data
        """
        self.target = target
        self.target_col = "adj_close" if target == 'price' else target
        self.target_type = target_type
        self.model_seq_len = model_seq_len
        self.max_overlap = min(model_seq_len - 1, max_overlap)
        self.train_periods = train_periods
        self.test_periods = test_periods
        self.normalization_method = normalization_method
        self.lookback_period = lookback_period
        self.cross_validation_folds = cross_validation_folds
        self.data_path = Path(data_path)
        self.output_path = Path(output_path)
        
        # internal attributes
        self._df = None
        self._ticker = None
        self._feature_cols = None
        self._train_out = None
        self._test_out = None
        self._seq_dist = self.model_seq_len - self.max_overlap
        self._save_path = None
        
    def load_input(self, ticker=None):
        ticker = ticker or self._ticker
        data_file = self.data_path.joinpath(f"{ticker}.csv")
        LOG.info(f"Reading data from {data_file.as_posix()}...")
        df = pd.read_csv(data_file).drop('price', axis=1, errors="ignore").sort_values("date", ascending=True)
        if self.target == "price":
            df.loc[:, "target"] = df['adj_close']
        else: 
            if self.target == "return":
                return_col = get_return_col(df, log=False)
            if self.target == "log_return":
                return_col = get_return_col(df, log=True)
            df.loc[:, "target"] = df[return_col]
            df.drop(['adj_close'])
        self._feature_cols = df.drop(['date', 'target'], axis=1).columns.tolist()
        
        # match the target with data from the previous days
        df = pd.concat(
            [
                df[['target', 'date']].rename({'date': 'target_date'}, axis=1).iloc[1:, :].reset_index(drop=True),
                df[self._feature_cols + ["date"]].iloc[:-1, :].reset_index(drop=True)
            ],
            axis=1
        )
        self._df = df

    def get_xy_arr(self, dfs, seq_dist=None):
        seq_dist = seq_dist or self._seq_dist
        arrays = {"x": [], "y": [], "target_date": [], "N": 0}
        for df in dfs:
            N = df.shape[0]
            if N >= self.model_seq_len:
                for i in range((N - self.model_seq_len) // seq_dist):
                    feature_subdf = df[self._feature_cols].iloc[(N - (i * seq_dist + self.model_seq_len)):(N - i * seq_dist)]
                    target_col_copy = feature_subdf[[self.target_col]]
                    if self.normalization_method == "quantile":
                        feature_quantiles = df[self._feature_cols].iloc[
                            max(0, (N - (i * seq_dist + self.lookback_period))):(N - i * seq_dist)
                        ].quantile([0.25, 0.5, 0.75])
                        p25 = feature_quantiles.loc[0.25, :]
                        p50 = feature_quantiles.loc[0.50, :]
                        p75 = feature_quantiles.loc[0.75, :]
                        feature_subdf = ((feature_subdf - p50) / (p75 - p25))
                    elif self.normalization_method == "log":
                        feature_subdf = np.sign(feature_subdf) * np.log1p(np.abs(feature_subdf))
                    
                    feature_subdf = pd.concat([target_col_copy, feature_subdf], axis=1).replace([-np.inf, np.inf], np.nan).fillna(0)
                    arrays["x"].append(feature_subdf.values)
                    if self.target_type == "sequence":
                        arrays["y"].append(df[["target"]].iloc[(N - (i * seq_dist + self.model_seq_len)):(N - i * seq_dist)].values)
                        arrays["target_date"].append(df[["target_date"]].iloc[(N - (i * seq_dist + self.model_seq_len)):(N - i * seq_dist)].values)
                    elif self.target_type == "single":
                        arrays["y"].append([df["target"].iloc[(N - i * seq_dist) - 1]])
                        arrays["target_date"].append([df["target_date"].iloc[(N - i * seq_dist) - 1]])
                    else:
                        raise KeyError("Unknown target_type: target_type must be one of 'sequence' or 'single'!")
                    arrays["N"] += 1
        arrays["x"] = np.array(arrays['x'][::-1])
        arrays["y"] = np.array(arrays['y'][::-1])
        arrays["target_date"] = np.array(arrays['target_date'][::-1])
        return arrays

    def create_train_array(self):
        LOG.info("Making training arrays...")
        train_dfs = get_period_data(self._df, self.train_periods)
        train_xy_arrs = self.get_xy_arr(train_dfs)
        LOG.info(f"  Training has {train_xy_arrs['N']} sequences of length {self.model_seq_len}.")
        
        LOG.info(f"Making {self.cross_validation_folds} validation folds...")
        train_val_distance = int(np.ceil(self.model_seq_len / self._seq_dist))
        fold_size = (train_xy_arrs["N"] - train_val_distance) // (self.cross_validation_folds + 1)
        
        LOG.info(f"  Generating folds with fold_size={fold_size} and distance between train and validation being {train_val_distance}")
        folds = {}
        for i in range(self.cross_validation_folds):
            train_end_ind = fold_size * (i + 1)
            val_begin_ind = fold_size * (i + 1) + train_val_distance
            val_end_ind = val_begin_ind + fold_size
            fold_arrs = {
                "train":{
                    "x": train_xy_arrs["x"][:train_end_ind],
                    "y": train_xy_arrs["y"][:train_end_ind],
                    "target_date": train_xy_arrs["target_date"][:train_end_ind],
                },
                "valid":{
                    "x": train_xy_arrs["x"][val_begin_ind:val_end_ind],
                    "y": train_xy_arrs["y"][val_begin_ind:val_end_ind],
                    "target_date": train_xy_arrs["target_date"][val_begin_ind:val_end_ind],
                },
            }
            folds[i] = fold_arrs
            LOG.info(f"    Fold {i} shapes:")
            for sample in fold_arrs:
                LOG.info(f"      x: {fold_arrs[sample]['x'].shape}, y: {fold_arrs[sample]['y'].shape}")
        folds["_all_"] = {'x': train_xy_arrs['x'], 'y': train_xy_arrs['y']}
        self._train_out = folds
        
    def create_test_array(self):
        LOG.info("Making testing arrays...")
        test_dfs = get_period_data(self._df, self.test_periods)
        test_xy_arrs = self.get_xy_arr(test_dfs, seq_dist=1)
        LOG.info(f"  Training has {test_xy_arrs['N']} sequences of length {self.model_seq_len}.")
        self._test_out = test_xy_arrs
    
    def create_arrays(self):
        self.create_train_array()
        self.create_test_array()
        self._save_path = self.output_path.joinpath(self._ticker)
        LOG.info(f"Saving generated data at {self._save_path.as_posix()}...")
        if not self._save_path.exists():
            LOG.info(f"  Directory doesn't exist, making directory...")
            self._save_path.mkdir(parents=True)
        LOG.info("  Writing train folds...")
        write_pickle_file(self._train_out, self._save_path.joinpath("train.pkl"))
        LOG.info("  Writing test arrays...")
        write_pickle_file(self._test_out, self._save_path.joinpath("test.pkl"))
        
    def prepare_data(self, ticker):
        self._ticker = ticker
        self.load_input()
        self.create_arrays()
        
    def load_data(self, ticker):
        self._ticker = ticker
        self._save_path = self.output_path.joinpath(self._ticker)
        LOG.info(f"Loading generated data from {self._save_path.as_posix()}...")
        if not self._save_path.exists():
            raise FileNotFoundError("Directory doesn't exist, can't load data!")
        LOG.info("  Loading train folds...")
        self._train_out = load_pickle_file(self._save_path.joinpath("train.pkl"))
        LOG.info("  Loading test arrays...")
        self._test_out = load_pickle_file(self._save_path.joinpath("test.pkl"))
    
    def print_train_shapes(self):
        print(
            json.dumps({
                i: {
                    s: ({k: str(v.shape) for k, v in arr.items()} if isinstance(arr, dict) else str(arr.shape)) 
                    for s, arr in fold.items()
                } for i, fold in self._train_out.items()
            }, sort_keys=False, indent=4)
        )
        
    def print_test_shapes(self):
        print(
            json.dumps(
                {k: str(v.shape) if hasattr(v, "shape") else v for k, v in self._test_out.items()},
                sort_keys=True, indent=4
            )
        )
    
def get_return_col(df, log=False):
    price_rat = df['adj_close'] / df['adj_close'].shift(-1)
    if log:
        return_col_name = "log_return"
        return_col_value = np.log(price_rat)
    else:
        return_col_name = "return"
        return_col_value = price_rat - 1
    df.loc[:, return_col_name] = return_col_value
    return return_col_name


def get_period_data(df, periods, date_col="date"):
    dfs_by_period = [
        df[
            pd.to_datetime(df[date_col]).between(pd.to_datetime(period[0]), pd.to_datetime(period[1]))
        ].sort_values(date_col, ascending=True) for period in periods
    ]
    return dfs_by_period


def write_pickle_file(obj, file):
    with Path(file).open('wb') as pkl_file:
        pickle.dump(obj, pkl_file, protocol=4)


def load_pickle_file(file):
    with Path(file).open('rb') as pkl_file:
        obj = pickle.load(pkl_file)
    return obj

NameError: name '__file__' is not defined

In [96]:
pipeline = SingleTickerPipeline(normalization_method="log")

In [97]:
pipeline.prepare_data("TEAM")

INFO:__main__:Reading data from /home/rluo/raid/classes/gatech/cs7643/GATech-CS7643-Project-Group/data/feature_selected/TEAM.csv...
INFO:__main__:Making training arrays...
INFO:__main__:  Training has 73 sequences of length 30.
INFO:__main__:Making 5 validation folds...
INFO:__main__:  Generating folds with fold_size=11 and distance between train and validation being 3
INFO:__main__:    Fold 0 shapes:
INFO:__main__:      x: (11, 30, 53), y: (11, 30, 1)
INFO:__main__:      x: (11, 30, 53), y: (11, 30, 1)
INFO:__main__:    Fold 1 shapes:
INFO:__main__:      x: (22, 30, 53), y: (22, 30, 1)
INFO:__main__:      x: (11, 30, 53), y: (11, 30, 1)
INFO:__main__:    Fold 2 shapes:
INFO:__main__:      x: (33, 30, 53), y: (33, 30, 1)
INFO:__main__:      x: (11, 30, 53), y: (11, 30, 1)
INFO:__main__:    Fold 3 shapes:
INFO:__main__:      x: (44, 30, 53), y: (44, 30, 1)
INFO:__main__:      x: (11, 30, 53), y: (11, 30, 1)
INFO:__main__:    Fold 4 shapes:
INFO:__main__:      x: (55, 30, 53), y: (55, 30

In [98]:
import json
print(json.dumps({i: {s: ({k: str(v.shape) for k, v in arr.items()} if isinstance(arr, dict) else str(arr.shape)) for s, arr in fold.items()} for i, fold in pipeline._train_out.items()}, sort_keys=False, indent=4))

{
    "0": {
        "train": {
            "x": "(11, 30, 53)",
            "y": "(11, 30, 1)"
        },
        "valid": {
            "x": "(11, 30, 53)",
            "y": "(11, 30, 1)"
        }
    },
    "1": {
        "train": {
            "x": "(22, 30, 53)",
            "y": "(22, 30, 1)"
        },
        "valid": {
            "x": "(11, 30, 53)",
            "y": "(11, 30, 1)"
        }
    },
    "2": {
        "train": {
            "x": "(33, 30, 53)",
            "y": "(33, 30, 1)"
        },
        "valid": {
            "x": "(11, 30, 53)",
            "y": "(11, 30, 1)"
        }
    },
    "3": {
        "train": {
            "x": "(44, 30, 53)",
            "y": "(44, 30, 1)"
        },
        "valid": {
            "x": "(11, 30, 53)",
            "y": "(11, 30, 1)"
        }
    },
    "4": {
        "train": {
            "x": "(55, 30, 53)",
            "y": "(55, 30, 1)"
        },
        "valid": {
            "x": "(11, 30, 53)",
            "y": "(11

In [51]:
pipeline.load_data("TEAM")

INFO:__main__:Loading generated data from /home/rluo/raid/classes/gatech/cs7643/GATech-CS7643-Project-Group/data/model_data/TEAM...
INFO:__main__:  Loading train folds...
INFO:__main__:  Loading test arrays...


In [12]:
train_dfs = get_period_data(pipeline._df, pipeline.train_periods)

In [None]:
means = 

In [15]:
train_concat = pd.concat(train_dfs, axis=0)

In [48]:
np.sign(train_concat[pipeline._feature_cols])

Unnamed: 0,adj_close,ev,marketcap,pb,pe,evebit,retearn,accoci,ps,shareswa,...,revenueusd,revenue,divyield,sgna,cor,receivables,gp,taxliabilities,invcap,currentratio
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
765,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
766,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
767,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
train_describe = train_concat.describe().T

In [28]:
stats = train_concat[pipeline._feature_cols].agg(['mean', 'std'])

In [32]:
(train_concat[pipeline._feature_cols] - stats.loc['mean', :]) / stats.loc['std', :]

Unnamed: 0,adj_close,ev,marketcap,pb,pe,evebit,retearn,accoci,ps,shareswa,...,revenueusd,revenue,divyield,sgna,cor,receivables,gp,taxliabilities,invcap,currentratio
0,-0.771824,-0.782292,-0.799935,2.209687,1.020279,1.992171,1.178283,,0.294485,-2.947541,...,-1.689038,-1.689038,,-1.737656,-1.433170,-0.727768,-1.782124,-1.273314,-1.749773,1.509293
1,-0.785770,-0.794417,-0.811850,2.178333,1.010464,1.971751,1.178283,,0.232390,-2.947541,...,-1.689038,-1.689038,,-1.737656,-1.433170,-0.727768,-1.782124,-1.273314,-1.749773,1.509293
2,-0.847530,-0.848147,-0.864652,1.990209,0.966577,1.881138,1.178283,,0.015059,-2.947541,...,-1.689038,-1.689038,,-1.737656,-1.433170,-0.727768,-1.782124,-1.273314,-1.749773,1.509293
3,-0.835576,-0.837746,-0.854430,2.021563,0.974990,1.898686,1.178283,,0.046107,-2.947541,...,-1.689038,-1.689038,,-1.737656,-1.433170,-0.727768,-1.782124,-1.273314,-1.749773,1.509293
4,-0.833584,-0.836022,-0.852737,2.021563,0.976392,1.901717,1.178283,,0.077154,-2.947541,...,-1.689038,-1.689038,,-1.737656,-1.433170,-0.727768,-1.782124,-1.273314,-1.749773,1.509293
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
764,1.762335,1.770585,1.807062,2.084271,-0.036935,-0.070557,-0.830421,,1.287999,0.518039,...,0.872251,0.872251,,0.911815,0.912132,0.802412,0.844046,0.722499,-0.429552,-0.847556
765,2.094047,2.097117,2.127951,2.429166,-0.043244,-0.079810,-0.830421,,1.815803,0.518039,...,0.872251,0.872251,,0.911815,0.912132,0.802412,0.844046,0.722499,-0.429552,-0.847556
766,2.206112,2.207421,2.236349,2.554582,-0.045488,-0.083000,-0.830421,,1.971040,0.518039,...,0.872251,0.872251,,0.911815,0.912132,0.802412,0.844046,0.722499,-0.429552,-0.847556
767,2.207606,2.208895,2.237798,2.554582,-0.045488,-0.083160,-0.830421,,1.971040,0.518039,...,0.872251,0.872251,,0.911815,0.912132,0.802412,0.844046,0.722499,-0.429552,-0.847556


In [27]:
stats[stats['std'] == 0]

Unnamed: 0,mean,std
accoci,0.0,0.0
ncfdiv,0.0,0.0
investmentsnc,0.0,0.0
dps,0.0,0.0
debtc,0.0,0.0
inventory,0.0,0.0
divyield,0.0,0.0


In [254]:
pipeline._df.columns

Index(['target', 'adj_close', 'ev', 'marketcap', 'pb', 'pe', 'evebit',
       'retearn', 'accoci', 'ps', 'shareswa', 'de', 'taxassets', 'ncfdiv',
       'shareswadil', 'sharesbas', 'debt', 'ps1', 'evebitda', 'bvps',
       'ppnenet', 'investmentsnc', 'equity', 'sps', 'rnd', 'debtusd',
       'equityusd', 'payables', 'assets', 'liabilities', 'assetsnc', 'depamor',
       'tangibles', 'debtnc', 'dps', 'liabilitiesnc', 'debtc', 'tbvps',
       'intangibles', 'opex', 'sbcomp', 'grossmargin', 'inventory',
       'revenueusd', 'revenue', 'divyield', 'sgna', 'cor', 'receivables', 'gp',
       'taxliabilities', 'invcap', 'currentratio', 'date'],
      dtype='object')

In [255]:
df = pipeline._df.copy()

In [257]:
train_dfs = get_period_data(pipeline._df, pipeline.train_periods)
train_xy_arrs = pipeline.get_xy_arr(train_dfs)

In [279]:
train_arrays = {"x": [], "y": [], "N": 0}
for train_df in train_dfs:
    N = train_df.shape[0]
    step = max_overlap
    if N >= model_seq_len:
        for i in range((N - model_seq_len) // (model_seq_len - max_overlap)):
            train_arrays["x"].append(train_df[pipeline._feature_cols].iloc[(N - (i * (model_seq_len - max_overlap) + model_seq_len)):(N - i * (model_seq_len - max_overlap))].values)
            train_arrays["y"].append([train_df["target"].iloc[(N - i * (model_seq_len - max_overlap)) - 1]])
            train_arrays["N"] += 1
train_arrays["x"] = np.array(train_arrays['x'][::-1])
train_arrays["y"] = np.array(train_arrays['y'][::-1])

In [280]:
train_val_distance = int(np.ceil(model_seq_len / (model_seq_len - max_overlap)))
fold_size = (train_arrays["N"] - train_val_distance) // cross_validation_folds

In [281]:
fold_size

14

In [282]:
folds = {}

In [283]:
for i in range(cross_validation_folds):
    train_end_ind = fold_size * (i + 1)
    val_begin_ind = fold_size * (i + 1) + train_val_distance
    val_end_ind = val_begin_ind + fold_size
    fold_arrs = {
        "train":{
            "x": train_arrays["x"][:train_end_ind],
            "y": train_arrays["y"][:train_end_ind],
        },
        "valid":{
            "x": train_arrays["x"][val_begin_ind:val_end_ind],
            "y": train_arrays["y"][val_begin_ind:val_end_ind],
        },
    }
    folds[i] = fold_arrs

In [14]:
date_diff = pd.to_datetime(df['date']) - pd.to_datetime(df['date'].shift(-1))

In [22]:
df.loc[1315:1323]

Unnamed: 0,date,adj_close,ev,marketcap,pb,pe,evebit,retearn,accoci,ps,...,revenueusd,revenue,divyield,sgna,cor,receivables,gp,taxliabilities,invcap,currentratio
1315,2016-01-25,23.67,4731.0,4939.3,22.9,597.0,1038.0,25049000.0,0.0,14.0,...,109706000.0,109706000.0,0.0,40020000.0,18473000.0,18273000.0,91233000.0,5584000.0,130095000.0,3.741
1316,2016-01-22,24.61,4927.2,5135.5,23.8,620.7,1081.0,25049000.0,0.0,14.5,...,109706000.0,109706000.0,0.0,40020000.0,18473000.0,18273000.0,91233000.0,5584000.0,130095000.0,3.741
1317,2016-01-21,24.24,4850.0,5058.3,23.4,611.3,1064.1,25049000.0,0.0,14.3,...,109706000.0,109706000.0,0.0,40020000.0,18473000.0,18273000.0,91233000.0,5584000.0,130095000.0,3.741
1318,2016-01-20,22.63,4514.0,4722.3,21.9,570.7,990.3,25049000.0,0.0,13.4,...,109706000.0,109706000.0,0.0,40020000.0,18473000.0,18273000.0,91233000.0,5584000.0,130095000.0,3.741
1319,2016-01-19,24.02,4804.1,5012.4,23.2,605.8,1054.0,25049000.0,0.0,14.2,...,109706000.0,109706000.0,0.0,40020000.0,18473000.0,18273000.0,91233000.0,5584000.0,130095000.0,3.741
1320,2016-01-15,25.19,5048.2,5256.5,24.3,635.3,1107.5,25049000.0,0.0,14.9,...,109706000.0,109706000.0,0.0,40020000.0,18473000.0,18273000.0,91233000.0,5584000.0,130095000.0,3.741
1321,2016-01-14,26.84,5392.5,5600.8,25.9,676.9,1183.1,25049000.0,0.0,15.8,...,109706000.0,109706000.0,0.0,40020000.0,18473000.0,18273000.0,91233000.0,5584000.0,130095000.0,3.741
1322,2016-01-13,26.73,5369.6,5577.9,25.8,674.1,1178.1,25049000.0,0.0,15.8,...,109706000.0,109706000.0,0.0,40020000.0,18473000.0,18273000.0,91233000.0,5584000.0,130095000.0,3.741
1323,2016-01-12,26.82,5388.4,5596.7,25.9,676.4,1182.2,25049000.0,0.0,15.8,...,109706000.0,109706000.0,0.0,40020000.0,18473000.0,18273000.0,91233000.0,5584000.0,130095000.0,3.741


In [8]:
df['ps1']

0       31.945
1       31.945
2       31.945
3       31.945
4       31.945
         ...  
1340    11.881
1341    11.881
1342    11.881
1343    11.881
1344    11.881
Name: ps1, Length: 1345, dtype: float64