In [3]:
import pandas as pd
import numpy as np
import datetime
from tiingo import TiingoClient
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
import indicoio
from textblob import TextBlob
from sklearn import linear_model
import _pickle as cPickle

In [97]:
# feature functions
def impute(X):
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(X)
    X = imp.transform(X)
    return X

def good_bag(string_vector):
    print(string_vector)
    bag = pd.read_csv("data/bag_of_words.csv")
    good_bag = bag["Good Words"].apply(str.lower).to_list()
    print(good_bag)
    good_count = 0
    count = 0
    ttl_length = 0
    string_vector = ' '.join(string_vector)
    for word in good_bag:
        for word2 in string_vector.split():
            ttl_length = ttl_length + 1
            if (word.lower() == word2.lower()):
                count = count + 1
    return count/ttl_length


def bad_bag(string_vector):
    bag = pd.read_csv("data/bag_of_words.csv")
    bad_bag = bag["Bad Words"].apply(str.lower).to_list()
    bad_count = 0
    good_count = 0
    count = 0
    ttl_length = 0
    string_vector = ' '.join(string_vector)
    for word in bad_bag:
        for word2 in string_vector.split():
            ttl_length = ttl_length + 1
            if (word.lower() == word2.lower()):
                count = count + 1
    return count/ttl_length


def aggregate_jsons(json_list):
    result = []
    for json in json_list:
        headline = json["title"]
        desc = json["description"]
        x = headline + " " + desc
        result.append(x)
    return result


In [87]:
def pipeline(df_path, ticky):
    # get the dataframe from csv
    df = pd.read_csv(df_path)

    # fix dates
    df["Date"] = df["Date"].apply(lambda x: pd.to_datetime(x, infer_datetime_format=True))
    df["Start"] = df["Date"].apply(str)
    six_days = lambda start_date: start_date + datetime.timedelta(days=6)
    df["End"] = df["Date"].apply(six_days).apply(str)
    df = df.drop(columns=["Date"])
    remove_time = lambda dt: dt[0:10]
    df["Start"] = df["Start"].apply(remove_time)
    df["End"] = df["End"].apply(remove_time)

    # add json 
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    query_ticker = lambda t, s, e: client.get_news(tickers=[t], startDate=s, endDate=e)
    df["JSON"] = df.apply(lambda d: query_ticker(ticky, d["Start"], d["End"]), axis=1)

    # create corpus
    df["corpus"] = df["JSON"].apply(aggregate_jsons)
    df["vectorized"] = df["corpus"]
    combinatric = lambda l: ''.join(l)
    df["corpus"] = df["vectorized"].apply(combinatric)
    
    # new key 04878c9a5bb99aaf8a8ccdd65954442a under prianka@contrarycap.com
    indicoio.config.api_key = "04878c9a5bb99aaf8a8ccdd65954442a"

    # add sentiment
    df["sentiment"] = df["corpus"].apply(lambda orig: TextBlob(orig).sentiment.polarity)
    df["sentiment_test"] = df["vectorized"].apply(lambda orig: [TextBlob(o).sentiment.polarity for o in orig]).apply(
        np.mean)
    df["indico_sentiment"] = df["corpus"].apply(lambda text: indicoio.sentiment_hq(text))

    # add bag of words featurization
    df["bad_bag"] = df["vectorized"].apply(bad_bag)
    df["good_bag"] = df["vectorized"].apply(good_bag)

    # return the nice beautiful dataframe
    
    return df

In [88]:
# takes in a BIG FAT dataframe
# returns a trained Bayesian Ridge regression object that can be pickled
def massive_lm(d):
    d["lastweek"] = d["Close"]
    d.lastweek = d.lastweek.shift(7)
    training_X_cols = ["Open", "lastweek", "indico_sentiment", "sentiment", "sentiment_test", "bad_bag", "good_bag"]
    training_Y_col = "Close"

    # Train on ALL rows

    X_train = impute(d[training_X_cols])
    Y_train = d[training_Y_col]

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    model = linear_model.BayesianRidge()
    model.fit(X_train, Y_train)
    
    print("created bayesian ridge regression model. returning !")
    return model

In [89]:
# takes in a model object
# writes it out to a PKL file
def pickle_up(model, filepath):
    # save the classifier
    with open(filepath, 'wb') as fid:
        cPickle.dump(model, fid) 
    print("saved model to {0}".format(filepath))
        
def pickle_down(filepath):
    # load it again
    with open(filepath, 'rb') as fid:
        model_loaded = cPickle.load(fid)
    return model_loaded

In [90]:
tickers = ["aapl", "amzn", "baba", "crm", "fb", "googl", "msft", "nflx", "tsla", "wmt"]

In [None]:
bois = []
# prepare all rows into a giant train-ready dataset
for ticker in tickers:
    ticked_df = pipeline('data/6m-weekly/{0}-6m-weekly.csv'.format(ticker), ticker)
    bois.append(ticked_df)

In [18]:
df = pd.concat(bois, ignore_index = True)

In [19]:
df.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Start,End,JSON,corpus,vectorized,sentiment,sentiment_test,indico_sentiment,bad_bag,good_bag
0,148.149994,158.520004,146.589996,156.229996,154.966034,191160200,2018-12-24,2018-12-30,"[{'crawlDate': '2019-04-10T14:13:54.677740Z', ...","Apple hires Jaunt XR founder, sparking fresh s...","[Apple hires Jaunt XR founder, sparking fresh ...",0.073553,0.058019,0.887193,0.0,0.0
1,158.529999,159.360001,142.0,148.259995,147.060516,221962500,2018-12-31,2019-01-06,"[{'crawlDate': '2019-04-10T14:13:54.677740Z', ...","Apple hires Jaunt XR founder, sparking fresh s...","[Apple hires Jaunt XR founder, sparking fresh ...",0.073553,0.058019,0.887193,0.0,0.0
2,148.699997,154.529999,145.899994,152.289993,151.057922,203706100,2019-01-07,2019-01-13,"[{'crawlDate': '2019-04-10T14:13:54.677740Z', ...","Apple hires Jaunt XR founder, sparking fresh s...","[Apple hires Jaunt XR founder, sparking fresh ...",0.073553,0.058019,0.887193,0.0,0.0
3,150.850006,157.880005,149.220001,156.820007,155.551285,155292000,2019-01-14,2019-01-20,"[{'crawlDate': '2019-04-10T14:13:54.677740Z', ...","Apple hires Jaunt XR founder, sparking fresh s...","[Apple hires Jaunt XR founder, sparking fresh ...",0.073553,0.058019,0.887193,0.0,0.0
4,156.410004,158.130005,151.699997,157.759995,156.483673,112501600,2019-01-21,2019-01-27,"[{'crawlDate': '2019-04-10T14:13:54.677740Z', ...","Apple hires Jaunt XR founder, sparking fresh s...","[Apple hires Jaunt XR founder, sparking fresh ...",0.073553,0.058019,0.887193,0.0,0.0


In [31]:
model = massive_lm(df)
pickle_up(model, "bayesian_model.pkl")

created bayesian ridge regression model. returning !
saved model to bayesian_model.pkl


In [91]:
def predict_new(ticker, date):
    six_days = lambda start_date: start_date + datetime.timedelta(days=6)
    remove_time = lambda dt: dt[0:10]
    
    start_date = pd.to_datetime(pd.Series([date]), infer_datetime_format = True)
    end_date = start_date.apply(six_days).apply(str)[0]
    start_date = start_date.apply(str).apply(remove_time)[0]
    
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    prices = client.get_ticker_price(ticker, fmt='object', startDate=start_date, endDate=end_date, frequency='daily')
    open_price = prices[0].open
    close_price = prices[-1].close
    
     # add json 
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    query_ticker = lambda t, s, e: client.get_news(tickers=[t], startDate=s, endDate=e)
    json = query_ticker(ticker, start_date, end_date)

    # create corpus
    corpus = aggregate_jsons(json)
    vectorized = corpus
    combinatric = lambda l: ''.join(l)
    corpus = combinatric(vectorized)
    
    indicoio.config.api_key = "04878c9a5bb99aaf8a8ccdd65954442a"

    # add sentiment
    mean = lambda listy: sum(listy)/len(listy)
    sentiment = (lambda orig: TextBlob(orig).sentiment.polarity) (corpus)
    sentiment_test = mean((lambda orig: [TextBlob(o).sentiment.polarity for o in orig])(vectorized))
    indico_sentiment = (lambda text: indicoio.sentiment_hq(text))(corpus)

    # add bag of words featurization
    badbag = bad_bag(vectorized)
    goodbag = good_bag(vectorized)
    
    # add last week's data
    start_last = pd.to_datetime(pd.Series([start_date])).apply(lambda start_date: start_date - datetime.timedelta(days=6))
    start_last = remove_time(str(start_last.values[0]))
    print(start_last)
    end_last = start_date
    print(end_last)
    lastweek = client.get_ticker_price(ticker, fmt='object', startDate=start_last, endDate=end_last, frequency='daily')
    lastweek = lastweek[0].close

    df = pd.DataFrame(
        {
            "Open": pd.Series([open_price]),
            "Close": pd.Series([close_price]),
            "Start": pd.Series([start_date]),
            "End": pd.Series([end_date]),
            "corpus": pd.Series([corpus]),
            "vectorized": pd.Series([vectorized]),
            "sentiment": pd.Series([sentiment]),
            "sentiment_test": pd.Series([sentiment_test]),
            "indico_sentiment": pd.Series([indico_sentiment]),
            "bad_bag": pd.Series([badbag]),
            "good_bag": pd.Series([goodbag]),
            "lastweek": pd.Series([lastweek])
        }
    )
    
    # now perform the MODELING!!
    model = pickle_down("bayesian_model.pkl")
    X_test = df[["Open", "lastweek", "indico_sentiment", "sentiment", "sentiment_test", "bad_bag", "good_bag"]]
    Y_test = df["Close"][0]
    
    X_test = impute(X_test)
    scaler = preprocessing.StandardScaler().fit(X_test)
    X_test = scaler.transform(X_test)         
    result = model.predict(X_test)[0]
    print("predicted {0} and error was {1}".format(result, result - Y_test ))
    
    return df

In [98]:
predict_new("aapl", "2019-01-15")

["FAANNG Stocks, Ranked From Cheapest to Most Expensive Ignoring current mutiples and looking at future multiples only, FAANNG stocks aren't that expensive, and many of them are actually cheap.", 'FAANNG Stocks, Ranked From Cheapest to Most Expensive InvestorPlace - Stock Market News, Stock Advice & Trading TipsOne thing that I have found true about the market is that everyday investors are.', "Saudi Aramco's Debut Debt Sale Sees Slick Demand Bond investors’ fervor for new U.S. dollar-denominated corporate bond sales and emerging market assets ratcheted up a notch with Saudi Arabian Oil Company’s (Sa", 'Despite Big Q1 Rally, Info Tech Earnings Seen Down On Tough Comparisons, Economy Info Tech earnings season approaches as the sector continues to lead the market rally. Investors appear to have high expectations, despite what many analysts ex', 'A toddler locked his dad out of this iPad for 48 years What to do when you’re locked out of your Apple or Android device.', "Apple's $9.99-a-mon

predicted 440.70751994814816 and error was 283.88751994814817


Unnamed: 0,Open,Close,Start,End,corpus,vectorized,sentiment,sentiment_test,indico_sentiment,bad_bag,good_bag,lastweek
0,150.27,156.82,2019-01-15,2019-01-21 00:00:00,"FAANNG Stocks, Ranked From Cheapest to Most Ex...","[FAANNG Stocks, Ranked From Cheapest to Most E...",0.094231,0.053972,0.849065,1.5e-05,7e-06,153.31
