<a href="https://colab.research.google.com/github/stevengregori92/Learn-PyForFinance/blob/main/Target_Function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!gdown https://drive.google.com/uc?id=1LfMank89K6cKESN0S92FXeFvKT-Rx7BU
!unzip /content/sp500_joined_close.zip

Downloading...
From: https://drive.google.com/uc?id=1LfMank89K6cKESN0S92FXeFvKT-Rx7BU
To: /content/sp500_joined_close.zip
100% 13.8M/13.8M [00:00<00:00, 102MB/s] 
Archive:  /content/sp500_joined_close.zip
  inflating: sp500_joined_close.csv  


In [2]:
from collections import Counter
import numpy as np
import pandas as pd
import pickle

In [3]:
def process_data_for_labels(ticker, hm_days=7, csv_path='sp500_joined_close.csv'):
  """Load the joined close-price table and add forward-return columns for one ticker.

  For every horizon ``i`` in ``1..hm_days`` a column named ``'<ticker>_<i>d'``
  is added, holding ``(price[t + i] - price[t]) / price[t]``.

  Parameters
  ----------
  ticker : str
      Name of the price column to compute forward returns for.
  hm_days : int, default 7
      Number of look-ahead horizons (in rows) to generate.
  csv_path : str, default 'sp500_joined_close.csv'
      CSV file to read; its first column becomes the index.

  Returns
  -------
  tickers : list
      Column names of the CSV as loaded, before any return column is added.
  df : pandas.DataFrame
      The loaded frame plus the return columns, with every NaN replaced by 0.

  Raises
  ------
  KeyError
      If ``ticker`` is not a column of the CSV.
  """
  df = pd.read_csv(csv_path, index_col=0)
  tickers = df.columns.values.tolist()
  if ticker not in tickers:
    raise KeyError('{!r} is not a column of {}'.format(ticker, csv_path))

  # Missing prices become 0 *before* the returns are computed, so a 0 price
  # yields NaN (0/0) or +/-inf (x/0) in the return columns; NaN is zeroed
  # below, inf is left for the caller to handle.
  df = df.fillna(0)
  price = df[ticker]

  for i in range(1, hm_days + 1):
    df['{}_{}d'.format(ticker, i)] = (price.shift(-i) - price) / price

  # The last ``i`` rows of each horizon have no future price; they end up as 0.
  return tickers, df.fillna(0)

In [4]:
# Try the labelling step on AAPL; the cell displays the returned (tickers, df) tuple.
process_data_for_labels('AAPL')

(['MMM',
  'AOS',
  'ABT',
  'ABBV',
  'ACN',
  'ATVI',
  'ADM',
  'ADBE',
  'ADP',
  'AAP',
  'AES',
  'AFL',
  'A',
  'APD',
  'AKAM',
  'ALK',
  'ALB',
  'ARE',
  'ALGN',
  'ALLE',
  'LNT',
  'ALL',
  'GOOGL',
  'GOOG',
  'MO',
  'AMZN',
  'AMCR',
  'AMD',
  'AEE',
  'AAL',
  'AEP',
  'AXP',
  'AIG',
  'AMT',
  'AWK',
  'AMP',
  'ABC',
  'AME',
  'AMGN',
  'APH',
  'ADI',
  'ANSS',
  'AON',
  'APA',
  'AAPL',
  'AMAT',
  'APTV',
  'ACGL',
  'ANET',
  'AJG',
  'AIZ',
  'T',
  'ATO',
  'ADSK',
  'AZO',
  'AVB',
  'AVY',
  'AXON',
  'BKR',
  'BALL',
  'BAC',
  'BBWI',
  'BAX',
  'BDX',
  'WRB',
  'BRK.B',
  'BBY',
  'BIO',
  'TECH',
  'BIIB',
  'BLK',
  'BK',
  'BA',
  'BKNG',
  'BWA',
  'BXP',
  'BSX',
  'BMY',
  'AVGO',
  'BR',
  'BRO',
  'BF.B',
  'BG',
  'CHRW',
  'CDNS',
  'CZR',
  'CPT',
  'CPB',
  'COF',
  'CAH',
  'KMX',
  'CCL',
  'CARR',
  'CTLT',
  'CAT',
  'CBOE',
  'CBRE',
  'CDW',
  'CE',
  'CNC',
  'CNP',
  'CDAY',
  'CF',
  'CRL',
  'SCHW',
  'CHTR',
  'CVX',
  'CMG',
 

In [5]:
def buy_sell_hold(*args, requirement=0.02):
  """Turn a sequence of fractional price changes into a buy / sell / hold label.

  The values are scanned in the order given and the first one that crosses
  the threshold decides the label; later values are not examined.

  Parameters
  ----------
  *args : float
      Fractional price changes (e.g. 0.03 for +3%).
  requirement : float, keyword-only, default 0.02
      Threshold a change must strictly exceed (in either direction) to
      trigger a buy or sell label.

  Returns
  -------
  int
      1 if the first change past the threshold is a rise, -1 if it is a
      fall, 0 if no change passes the threshold (including when no values
      are given).
  """
  for change in args:
    if change > requirement:
      return 1
    if change < -requirement:
      return -1
  return 0

In [6]:
def extract_featuresets(ticker):
  """Build the feature matrix and label vector for ``ticker``.

  Prints the distribution of the generated labels as a side effect.

  Returns
  -------
  X : numpy.ndarray
      Row-to-row percentage change of every original ticker column, with
      infinities and NaN replaced by 0.
  y : numpy.ndarray
      Labels produced by ``buy_sell_hold`` from the 1- to 7-day return columns.
  df : pandas.DataFrame
      The labelled frame after rows containing +/-inf have been dropped.
  """
  tickers, labelled = process_data_for_labels(ticker)
  target_col = '{}_target'.format(ticker)

  # One Series per look-ahead horizon; zipping them walks the frame row by row.
  horizons = [labelled['{}_{}d'.format(ticker, day)] for day in range(1, 8)]
  labelled[target_col] = [buy_sell_hold(*returns) for returns in zip(*horizons)]

  # Report the class balance; labels are stringified before being counted.
  spread = Counter(str(label) for label in labelled[target_col].values.tolist())
  print('Data spread: ', spread)

  # NaN -> 0 first, then every row still holding +/-inf is discarded.
  cleaned = labelled.fillna(0).replace([np.inf, -np.inf], np.nan).dropna()

  # Features are the daily percentage moves of all original ticker columns.
  features = cleaned[list(tickers)].pct_change()
  features = features.replace([np.inf, -np.inf], 0).fillna(0)

  return features.values, cleaned[target_col].values, cleaned

In [7]:
# Run the full pipeline for AAPL: prints the label spread and displays the returned (X, y, df) tuple.
extract_featuresets('AAPL')

Data spread:  Counter({'1': 2098, '-1': 1830, '0': 349})


(array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.0397352 , -0.01440929, -0.02857114, ...,  0.        ,
         -0.04842313,  0.        ],
        [ 0.02896575, -0.00292327, -0.00183812, ...,  0.        ,
         -0.00118312,  0.        ],
        ...,
        [-0.00469493, -0.01347458, -0.00984495, ...,  0.00155509,
         -0.01479635, -0.00521211],
        [ 0.00185309,  0.00147112,  0.00235485, ...,  0.0042692 ,
         -0.01132158,  0.00336844],
        [ 0.00089701, -0.00650448,  0.00260997, ..., -0.00289857,
          0.00584265, -0.00167856]]),
 array([-1, -1, -1, ...,  0,  0,  0]),
                    MMM        AOS        ABT       ABBV         ACN  \
 Date                                                                  
 2000-01-03   25.143320   2.360311   9.131376   0.000000    0.000000   
 2000-01-04   24.144245   2.326301   8.870482   0.000000    0.000000   
 2000-01-05   24.843601   2.319500   8.854177 