In [52]:
import pandas as pd
import numpy as np
import datetime
from tiingo import TiingoClient
from sklearn.impute import SimpleImputer
import indicoio
import seaborn as sns
import time
from textblob import TextBlob
import matplotlib as plt

In [68]:
def bag_of_words(string_vector):
    bad_bag = ...
    good_bag = ...
    bad_count, good_count = 0
    for word in bad_bag:
        bad_count += string_vector.count(word)
    for word in good_bag:
        good_count += string_vector.count(word)
    bad = bad_count/len(string_vector)
    good = good_count/len(string_vector)
    return good, bad
    
def aggregate_jsons(json_list):
    result = []
    for json in json_list:
        headline = json["title"]
        desc = json["description"]
        x = headline + " " + desc
        result.append(x)
    return result

def pipeline(df_path, ticky):
    # get the dataframe from csv
    df = pd.read_csv(df_path)
    
    # fix dates
    df["Date"] = df["Date"].apply(lambda x: pd.to_datetime(x, infer_datetime_format=True))
    df["Start"] = df["Date"].apply(str)
    six_days = lambda start_date: start_date + datetime.timedelta(days=6)
    df["End"] = df["Date"].apply(six_days).apply(str)
    df = df.drop(columns=["Date"])
    remove_time = lambda dt: dt[0:10]
    df["Start"] = df["Start"].apply(remove_time)
    df["End"] = df["End"].apply(remove_time)
    
    # add json
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    query_ticker = lambda t, s, e: client.get_news(tickers=[t], startDate=s, endDate=e)
    df["JSON"] = df.apply(lambda d: query_ticker(ticky, d["Start"], d["End"]), axis=1)
    
    # create corpus
    df["corpus"] = df["JSON"].apply(aggregate_jsons)
    df["vectorized"] = df["corpus"]
    combinatric = lambda l: ''.join(l)
    df["corpus"] = df["vectorized"].apply(combinatric)
    indicoio.config.api_key = "25b83c4c388204edd2c6c11cd907e048"
    
    # add sentiment
    df["sentiment"] = df["corpus"].apply(lambda orig: TextBlob(orig).sentiment.polarity)
    df["sentiment_test"] = df["vectorized"].apply(lambda orig: [TextBlob(o).sentiment.polarity for o in orig]).apply(np.mean)
    df["indico_sentiment"] = df["corpus"].apply(lambda text: indicoio.sentiment_hq(text))
    
    # add bag of words featurization
    df["bad_bag"], df["good_bag"] = df["vectorized"].apply(bag_of_words)
    # add statistical features
    # df["features"] = df["corpus"].apply(lambda text: indicoio.text_features(text, v=2))
    
    # add bag of words ratio
    #df["bad_ratio"] = 
    #df["good_keywords"]
    
    # return the nice beautiful dataframe
    return df

In [54]:
def predictor(d, ticky):
    from sklearn.model_selection import train_test_split
    from sklearn import linear_model

    d["constvec"] = [0.7] * len(d)
    d["lastweek"] = d["Close"]
    df.lastweek = df.lastweek.shift(7) ## shift down
    #df.lastweek.drop(df.gdp.shape[0] - 1,inplace = True) ## removing the first row
    X = d[["Open", "lastweek", "indico_sentiment"]]
    Y = d["Close"]
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(X)
    X = imp.transform(X)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 69)

    def mse(v1, v2):
        return np.sum((v1 - v2) ** 2) 
    
    model = linear_model.BayesianRidge()
    model.fit(X_train, Y_train)
    Y_predicted = model.predict(X_test)
    error = mse(Y_predicted, Y_test)
    print("aggregated error : {0}".format(error))
    print("average error by prediction for {1}: {0}".format(np.mean(Y_predicted - Y_test), ticky))

In [55]:
start = time.time()
df = pipeline('aapl-6m-weekly.csv', 'aapl')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'aapl'))
predictor(df, 'aapl')

pipeline took 31.345920085906982s for aapl
aggregated error : 158.50476680221217
average error by prediction for aapl: 0.0879561589843484


In [56]:
start = time.time()
df = pipeline('crm-6m-weekly.csv', 'crm')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'crm'))
predictor(df, 'crm')

pipeline took 28.644730806350708s for crm
aggregated error : 87.24971769126635
average error by prediction for crm: 1.8844314922405028


In [57]:
start = time.time()
df = pipeline('tsla-6m-weekly.csv', 'tsla')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'tsla'))
predictor(df, 'tsla')

pipeline took 36.25160598754883s for tsla
aggregated error : 2980.1651149210297
average error by prediction for tsla: -11.743885306369817


In [58]:
start = time.time()
df = pipeline('fb-6m-weekly.csv', 'fb')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'fb'))
predictor(df, 'fb')

pipeline took 30.20443606376648s for fb
aggregated error : 112.77294440289208
average error by prediction for fb: 0.1187069821744381


In [59]:
start = time.time()
df = pipeline('baba-6m-weekly.csv', 'baba')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'baba'))
predictor(df, 'baba')

pipeline took 36.11968111991882s for baba
aggregated error : 343.35289975565377
average error by prediction for baba: -3.3095244772874413


In [60]:
start = time.time()
df = pipeline('msft-6m-weekly.csv', 'msft')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'msft'))
predictor(df, 'msft')

pipeline took 38.20546102523804s for msft
aggregated error : 44.333149430340875
average error by prediction for msft: 0.7658176359434081


In [61]:
start = time.time()
df = pipeline('nflx-6m-weekly.csv', 'nflx')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'nflx'))
predictor(df, 'nflx')

pipeline took 32.990135192871094s for nflx
aggregated error : 1579.151981114193
average error by prediction for nflx: -8.735665936461146


In [62]:
start = time.time()
df = pipeline('amzn-6m-weekly.csv', 'amzn')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'amzn'))
predictor(df, 'amzn')

pipeline took 31.7349591255188s for amzn
aggregated error : 3559.7040908256663
average error by prediction for amzn: 9.34027130103315


In [63]:
start = time.time()
df = pipeline('googl-6m-weekly.csv', 'googl')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'googl'))
predictor(df, 'googl')

pipeline took 34.03275227546692s for googl
aggregated error : 14629.308640535517
average error by prediction for googl: 27.57321798135427


In [64]:
start = time.time()
df = pipeline('wmt-6m-weekly.csv', 'wmt')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'wmt'))
predictor(df, 'wmt')

pipeline took 34.973336935043335s for wmt
aggregated error : 55.831140343965274
average error by prediction for wmt: -0.594398543408758


In [72]:
# sandbox for turning bag of words into a feature
bag = pd.read_csv("data/bag_of_words.csv")
good = bag["Good Words"].apply(str.lower).to_list()
bad = bag["Bad Words"].apply(str.lower).to_list()
good

['outperform',
 'voided',
 'discrepancy',
 'arbitrary',
 'imbalance',
 'overcharged',
 'manipulation',
 'distressed',
 'hazard',
 'unfit',
 'defraud',
 'undefined',
 'malpractice',
 'presumes',
 'collapsing',
 'falsely',
 'unsound',
 'damaging',
 'reassignment',
 'distracting',
 'disapproved',
 'stagnant',
 'unsafe',
 'critically',
 'duress',
 'pleadings',
 'investigated',
 'sometime',
 'encroachment',
 'forfeits',
 'rejections',
 'whereabouts',
 'unmatched',
 'confident',
 'rewarded',
 'prosperity',
 'rectification',
 'forfeitable',
 'turmoil',
 'progresses',
 'antecedent',
 'dissolutions',
 'expropriation',
 'understate',
 'misstate',
 'mutandis',
 'delisting',
 'uncovers',
 'grantors',
 'predeceases']