In [10]:
import pandas as pd
import numpy as np
import datetime
from tiingo import TiingoClient
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
import indicoio
from textblob import TextBlob
from sklearn import linear_model
import _pickle as cPickle

In [18]:
# feature functions
def impute(X):
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(X)
    X = imp.transform(X)
    return X

def good_bag(string_vector):
    bag = pd.read_csv("data/sentiment_word_list.csv")
    good_bag = bag["good"].dropna().apply(str.lower).to_list()
    good_count = 0
    count = 0
    ttl_length = 0
    string_vector = ' '.join(string_vector)
    for word in good_bag:
        for word2 in string_vector.split():
            ttl_length = ttl_length + 1
            if (word.lower() == word2.lower()):
                count = count + 1
    return count/ttl_length


def bad_bag(string_vector):
    bag = pd.read_csv("data/sentiment_word_list.csv")
    bad_bag = bag["bad"].apply(str.lower).to_list()
    bad_count = 0
    good_count = 0
    count = 0
    ttl_length = 0
    string_vector = ' '.join(string_vector)
    for word in bad_bag:
        for word2 in string_vector.split():
            ttl_length = ttl_length + 1
            if (word.lower() == word2.lower()):
                count = count + 1
    return count/ttl_length


def aggregate_jsons(json_list):
    result = []
    for json in json_list:
        headline = json["title"]
        desc = json["description"]
        x = headline + " " + desc
        result.append(x)
    return result

In [19]:
# takes in a model object
# writes it out to a PKL file
def pickle_up(model, filepath):
    # save the classifier
    with open(filepath, 'wb') as fid:
        cPickle.dump(model, fid) 
    print("saved model to {0}".format(filepath))
        
def pickle_down(filepath):
    # load it again
    with open(filepath, 'rb') as fid:
        model_loaded = cPickle.load(fid)
    return model_loaded

In [20]:
def pipeline(ticker, date):
    six_days = lambda start_date: start_date + datetime.timedelta(days=6)
    remove_time = lambda dt: dt[0:10]
    
    start_date = pd.to_datetime(pd.Series([date]), infer_datetime_format = True)
    end_date = start_date.apply(six_days).apply(str)[0]
    start_date = start_date.apply(str).apply(remove_time)[0]
    
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    prices = client.get_ticker_price(ticker, fmt='object', startDate=start_date, endDate=end_date, frequency='daily')
    open_price = prices[0].open
    close_price = prices[-1].close
    
     # add json 
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    query_ticker = lambda t, s, e: client.get_news(tickers=[t], startDate=s, endDate=e)
    json = query_ticker(ticker, start_date, end_date)

    # create corpus
    corpus = aggregate_jsons(json)
    vectorized = corpus
    combinatric = lambda l: ''.join(l)
    corpus = combinatric(vectorized)
    
    indicoio.config.api_key = "04878c9a5bb99aaf8a8ccdd65954442a"

    # add sentiment
    mean = lambda listy: sum(listy)/len(listy)
    sentiment = (lambda orig: TextBlob(orig).sentiment.polarity) (corpus)
    sentiment_test = mean((lambda orig: [TextBlob(o).sentiment.polarity for o in orig])(vectorized))
    indico_sentiment = (lambda text: indicoio.sentiment_hq(text))(corpus)

    # add bag of words featurization
    badbag = bad_bag(vectorized)
    goodbag = good_bag(vectorized)
    
    # add last week's data
    start_last = pd.to_datetime(pd.Series([start_date])).apply(lambda start_date: start_date - datetime.timedelta(days=6))
    start_last = remove_time(str(start_last.values[0]))
    end_last = start_date
    lastweek = client.get_ticker_price(ticker, fmt='object', startDate=start_last, endDate=end_last, frequency='daily')
    lastweek = lastweek[0].close
    
    # add the delta - up or down
    tri_delt = close_price - open_price
    if tri_delt > 0:
        delta = 1
    else:
        delta = 0

    df = pd.DataFrame(
        {
            "Start": pd.Series([start_date]),
            "End": pd.Series([end_date]),
            "sentiment": pd.Series([sentiment]),
            "sentiment_test": pd.Series([sentiment_test]),
            "indico_sentiment": pd.Series([indico_sentiment]),
            "bad_bag": pd.Series([badbag]),
            "good_bag": pd.Series([goodbag]),
            "lastweek": pd.Series([lastweek]),
            "delta": pd.Series([delta])
        }
    )
    
    return df

In [21]:
def multi_row_pipeline(dates, ticker):
    rows = []
    for date in dates:
        row = pipeline(ticker, date)
        rows.append(row)
    df = pd.concat(rows)
    return df

In [22]:
def run_model(df, model_path):
    model = pickle_down(model_path)
    X_test = df[["indico_sentiment", "sentiment", "sentiment_test", "bad_bag", "good_bag"]]
    Y_test = df["delta"]
    
    X_test = impute(X_test)
    scaler = preprocessing.StandardScaler().fit(X_test)
    X_test = scaler.transform(X_test)         
    Y_predicted = model.predict(X_test)
    accuracy = (np.sum(Y_predicted) / np.sum(Y_test)) * 100
    print("predicted with {0}% accuracy".format(accuracy))
    return pd.DataFrame({"actual": Y_test, "predicted": Y_predicted})

# takes in a dataframe, runs logistic regression
def generate_model(d):
    training_X_cols = ["indico_sentiment", "sentiment", 
                       "sentiment_test", "bad_bag", "good_bag"]
    training_Y_col = "delta"

    # Train on ALL rows
    X_train = impute(d[training_X_cols])
    Y_train = d[training_Y_col]

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)

    model = linear_model.LogisticRegression(solver="lbfgs")
    model.fit(X_train, Y_train)
    
    return model

## training + saving logistic regression model

In [23]:
tickers = ["aapl", "amzn", "baba", "crm", "fb", "googl", "msft", "nflx", "tsla", "wmt"]
start_dates = pd.read_csv("data/dates.csv")["0"].tolist()

In [24]:
master_df_set = []
for ticker in tickers:
    for date in start_dates:
        row = pipeline(ticker, date)
        master_df_set.append(row)
        print("finished date {0} for ticker {1}".format(date, ticker))
        print("S I C K O M O D E")

finished date 2018-12-24 for ticker aapl
S I C K O M O D E
finished date 2018-12-31 for ticker aapl
S I C K O M O D E
finished date 2019-01-07 for ticker aapl
S I C K O M O D E
finished date 2019-01-14 for ticker aapl
S I C K O M O D E
finished date 2019-01-21 for ticker aapl
S I C K O M O D E
finished date 2019-01-28 for ticker aapl
S I C K O M O D E
finished date 2019-02-04 for ticker aapl
S I C K O M O D E
finished date 2019-02-11 for ticker aapl
S I C K O M O D E
finished date 2019-02-18 for ticker aapl
S I C K O M O D E
finished date 2019-02-25 for ticker aapl
S I C K O M O D E
finished date 2019-03-04 for ticker aapl
S I C K O M O D E
finished date 2019-03-11 for ticker aapl
S I C K O M O D E
finished date 2019-03-18 for ticker aapl
S I C K O M O D E
finished date 2019-03-25 for ticker aapl
S I C K O M O D E
finished date 2019-04-01 for ticker aapl
S I C K O M O D E
finished date 2019-04-08 for ticker aapl
S I C K O M O D E
finished date 2019-04-15 for ticker aapl
S I C K O M O D

finished date 2019-02-04 for ticker googl
S I C K O M O D E
finished date 2019-02-11 for ticker googl
S I C K O M O D E
finished date 2019-02-18 for ticker googl
S I C K O M O D E
finished date 2019-02-25 for ticker googl
S I C K O M O D E
finished date 2019-03-04 for ticker googl
S I C K O M O D E
finished date 2019-03-11 for ticker googl
S I C K O M O D E
finished date 2019-03-18 for ticker googl
S I C K O M O D E
finished date 2019-03-25 for ticker googl
S I C K O M O D E
finished date 2019-04-01 for ticker googl
S I C K O M O D E
finished date 2019-04-08 for ticker googl
S I C K O M O D E
finished date 2019-04-15 for ticker googl
S I C K O M O D E
finished date 2019-04-22 for ticker googl
S I C K O M O D E
finished date 2019-04-29 for ticker googl
S I C K O M O D E
finished date 2019-05-06 for ticker googl
S I C K O M O D E
finished date 2019-05-13 for ticker googl
S I C K O M O D E
finished date 2019-05-20 for ticker googl
S I C K O M O D E
finished date 2019-05-27 for ticker goog

In [28]:
df = pd.concat(master_df_set)
df.to_csv("data/240_rows.csv")
model = generate_model(df)
pickle_up(model, "data/logistic.pkl")

saved model to data/logistic.pkl


In [29]:
model = pickle_down("data/logistic.pkl")

In [30]:
dates_to_predict = ["2019-07-01", "2019-07-02", "2019-07-03", "2019-07-04", "2019-07-05"]
df = multi_row_pipeline(dates_to_predict, "fb")
run_model(df, "data/logistic.pkl")

predicted with 100.0% accuracy


Unnamed: 0,actual,predicted
0,1,1
0,1,1
0,1,1
0,1,1
0,1,1


In [32]:
pd.read_csv("data/240_rows.csv")

Unnamed: 0.1,Unnamed: 0,Start,End,sentiment,sentiment_test,indico_sentiment,bad_bag,good_bag,lastweek,delta
0,0,2018-12-24,2018-12-30 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,166.07,1
1,0,2018-12-31,2019-01-06 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,157.17,0
2,0,2019-01-07,2019-01-13 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,157.92,1
3,0,2019-01-14,2019-01-20 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,150.75,1
4,0,2019-01-21,2019-01-27 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,153.07,1
5,0,2019-01-28,2019-02-03 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,153.30,1
6,0,2019-02-04,2019-02-10 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,154.68,1
7,0,2019-02-11,2019-02-17 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,174.18,0
8,0,2019-02-18,2019-02-24 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,170.89,1
9,0,2019-02-25,2019-03-03 00:00:00,0.115305,0.095800,0.459462,0.000005,0.000023,170.93,1
