In [17]:
#!pip install pandas-datareader
import time
import numpy as np
import pandas as pd
import json
import pandas_datareader.data as web
import matplotlib.pyplot as plt
import bs4 as bs
import pickle
import requests

In [3]:
def save_sp500_tickers():
    """Scrape S&P 500 ticker symbols from Wikipedia and cache them on disk.

    Downloads the "List of S&P 500 companies" page, reads the first cell of
    every data row in the first table with class ``wikitable sortable``, and
    pickles the resulting list to ``sp500tickers.pickle`` in the current
    working directory.

    Returns:
        list[str]: ticker symbols in page order, with surrounding whitespace
        removed.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        ValueError: if the expected table is not present in the page.
    """
    resp = requests.get(
        'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
        timeout=30,  # do not let a stalled connection hang the notebook
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()

    soup = bs.BeautifulSoup(resp.text, "lxml")
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        raise ValueError("S&P 500 constituents table not found on the Wikipedia page")

    tickers = []
    # Skip the first <tr>, which is the header row.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if not cells:
            # Rows without <td> cells (e.g. extra header rows) carry no ticker.
            continue
        # strip() removes the trailing newline without chopping a real
        # character when the cell text has no trailing whitespace.
        tickers.append(cells[0].text.strip())

    with open('sp500tickers.pickle', 'wb') as f:
        pickle.dump(tickers, f)
    return tickers

# Symbols deliberately left out of the analysis.
# NOTE(review): presumably these have no usable CSV under stocks_dfs/ — confirm.
EXCLUDED_TICKERS = {'BF.B', 'BRK.B', 'CARR', 'DPZ', 'DXCM', 'OTIS', 'WST'}

# Filtering (rather than list.remove) keeps this cell from raising ValueError
# when one of the excluded symbols is no longer present in the scraped list.
tickers = sorted(
    symbol for symbol in save_sp500_tickers() if symbol not in EXCLUDED_TICKERS
)

# Number of forward row offsets (1..hm_days) for which returns are computed.
hm_days = 7

def buy_sell_hold(*args, requirement=0.0235):
    """Classify a sequence of fractional price changes as buy, sell or hold.

    The changes are scanned in the order given; the first one that crosses
    the threshold in either direction decides the label.

    Args:
        *args: fractional changes (e.g. 0.03 for +3%), checked left to right.
        requirement: keyword-only threshold; a change must be strictly greater
            than ``requirement`` (or strictly less than ``-requirement``) to
            trigger a signal. Defaults to 0.0235.

    Returns:
        int: 1 if the first triggering change is above ``requirement``,
        -1 if it is below ``-requirement``, 0 if no change triggers
        (including when no changes are given or all are NaN, since NaN
        comparisons are always false).
    """
    for change in args:
        if change > requirement:
            return 1
        if change < -requirement:
            return -1
    return 0


In [5]:
from collections import OrderedDict
import pytz

# Load each ticker's CSV, keep price and volume, and add forward-return
# columns '1d'..'{hm_days}d'. Frames are collected per symbol in `data`.
data = OrderedDict()
for ticker in tickers:
    df = pd.read_csv("stocks_dfs/{}.csv".format(ticker), index_col=0, parse_dates=['Date'])
    # fillna returns a fresh frame, so the column assignments below never
    # touch a slice of the frame read from disk.
    # NOTE(review): zero-filling 'Adj Close' makes the return formula below
    # divide by zero (inf) or yield -1 for rows next to a gap — confirm
    # this is intended before labels are derived from these columns.
    df = df[['Adj Close', 'Volume']].fillna(0)
    for i in range(1, hm_days + 1):
        # Fractional change between this row and the row i positions later;
        # the last i rows have no later row and therefore come out as NaN.
        df['{}d'.format(i)] = (df['Adj Close'].shift(-i) - df['Adj Close']) / df['Adj Close']

    # Rolling-mean / rolling-std / Bollinger-band features were disabled in
    # the original cell and are intentionally not computed here.
    data[ticker] = df

# Stack all per-ticker frames under a (Ticker, Date) index, taking the keys
# from `data` itself so labels cannot drift out of step with the frames,
# then reorder to (Date, Ticker) and sort.
stock_data_preprocessed = (
    pd.concat(list(data.values()), keys=list(data.keys()), names=['Ticker', 'Date'])
    .swaplevel()
    .sort_index()
)
#stock_data_preprocessed.to_csv('stock_data_preprocessed.csv')

In [6]:
# Preview the first rows of the combined (Date, Ticker)-indexed frame.
stock_data_preprocessed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Volume,1d,2d,3d,4d,5d,6d,7d
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-01-04,A,20.436504,3815500.0,-0.010862,-0.014377,-0.015655,-0.015975,-0.015336,-0.027157,-0.019489
2010-01-04,AAL,4.496876,9837300.0,0.113208,0.067086,0.098533,0.077568,0.056604,0.06499,0.148847
2010-01-04,AAP,39.293575,1701700.0,-0.005943,0.002724,0.002476,0.006439,-0.003467,-0.020802,-0.007182
2010-01-04,AAPL,26.538483,123432400.0,0.001729,-0.014205,-0.016027,-0.009485,-0.018223,-0.029391,-0.0157
2010-01-04,ABC,22.813559,2455900.0,-0.007134,-0.016522,-0.032294,-0.02178,-0.01089,-0.004131,0.015396


In [7]:
# Label every row with buy (1) / sell (-1) / hold (0) from its forward-return
# columns '1d'..'{hm_days}d', passed to buy_sell_hold in increasing-offset
# order. Column names are derived from hm_days so this cell stays in step
# with the cell that creates them and does not rely on any loop variable
# left over from another cell.
stock_data_preprocessed['target'] = list(map(
    buy_sell_hold,
    *(stock_data_preprocessed['{}d'.format(i)] for i in range(1, hm_days + 1))
))

In [15]:
# Drop the forward-return helper columns (only needed to build 'target'),
# then turn +/-inf into NaN and fill every remaining NaN with 0.
# errors='ignore' plus rebinding (instead of inplace=True) makes the cell
# safe to re-run after the columns have already been removed.
horizon_cols = ['{}d'.format(i) for i in range(1, hm_days + 1)]
stock_data_preprocessed = (
    stock_data_preprocessed
    .drop(columns=horizon_cols, errors='ignore')
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [12]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the price and volume columns with scikit-learn's MinMaxScaler
# (default settings, fitted on all rows of all tickers together), then
# write the finished frame to disk.
feature_cols = ['Adj Close', 'Volume']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(stock_data_preprocessed[feature_cols])
stock_data_preprocessed[feature_cols] = scaled_values
stock_data_preprocessed.to_csv('stock_data_preprocessed.csv')