In [23]:
import pandas as pd
import numpy as np
import datetime
from tiingo import TiingoClient
from sklearn.impute import SimpleImputer
import indicoio
import seaborn as sns
import time
from textblob import TextBlob
import matplotlib as plt

In [49]:
UP = []
DOWN = []

def count_words(original_string, bag_of_words):
    split_string = original_string.lower().split()
    count = 0
    for word in bag_of_words:
        matches = split_string.count(word)
        count += matches
    ratio = count/len(split_string)
    return ratio
    
def aggregate_jsons(json_list):
    result = []
    for json in json_list:
        headline = json["title"]
        desc = json["description"]
        x = headline + " " + desc
        result.append(x)
    return result

def pipeline(df_path, ticky):
    # get the dataframe from csv
    df = pd.read_csv(df_path)
    
    # fix dates
    df["Date"] = df["Date"].apply(lambda x: pd.to_datetime(x, infer_datetime_format=True))
    df["Start"] = df["Date"].apply(str)
    six_days = lambda start_date: start_date + datetime.timedelta(days=6)
    df["End"] = df["Date"].apply(six_days).apply(str)
    df = df.drop(columns=["Date"])
    remove_time = lambda dt: dt[0:10]
    df["Start"] = df["Start"].apply(remove_time)
    df["End"] = df["End"].apply(remove_time)
    
    # add json
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    query_ticker = lambda t, s, e: client.get_news(tickers=[t], startDate=s, endDate=e)
    df["JSON"] = df.apply(lambda d: query_ticker(ticky, d["Start"], d["End"]), axis=1)
    
    # create corpus
    df["corpus"] = df["JSON"].apply(aggregate_jsons)
    df["vectorized"] = df["corpus"]
    combinatric = lambda l: ''.join(l)
    df["corpus"] = df["vectorized"].apply(combinatric)
    indicoio.config.api_key = "25b83c4c388204edd2c6c11cd907e048"
    # add sentiment
    df["sentiment"] = df["corpus"].apply(lambda orig: TextBlob(orig).sentiment.polarity)
    df["sentiment_test"] = df["vectorized"].apply(lambda orig: [TextBlob(o).sentiment.polarity for o in orig]).apply(np.mean)
    df["rick_sentiment"] = df["corpus"].apply(lambda text: indicoio.sentiment_hq(text))
    # add statistical features
    # df["features"] = df["corpus"].apply(lambda text: indicoio.text_features(text, v=2))
    
    # add bag of words ratio
    #df["bad_ratio"] = 
    #df["good_keywords"]
    
    # return the nice beautiful dataframe
    return df

In [54]:
def predictor(d, ticky):
    from sklearn.model_selection import train_test_split
    from sklearn import linear_model

    d["constvec"] = [0.7] * len(d)
    d["lastweek"] = d["Close"]
    df.lastweek = df.lastweek.shift(7) ## shift down
    #df.lastweek.drop(df.gdp.shape[0] - 1,inplace = True) ## removing the first row
    X = d[["Open", "lastweek", "rick_sentiment"]]
    Y = d["Close"]
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(X)
    X = imp.transform(X)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 69)

    def mse(v1, v2):
        return np.sum((v1 - v2) ** 2) 
    
    model = linear_model.BayesianRidge()
    model.fit(X_train, Y_train)
    Y_predicted = model.predict(X_test)
    error = mse(Y_predicted, Y_test)
    print("aggregated error : {0}".format(error))
    print("average error by prediction for {1}: {0}".format(np.mean(Y_predicted - Y_test), ticky))

In [55]:
start = time.time()
df = pipeline('aapl-6m-weekly.csv', 'aapl')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'aapl'))
predictor(df, 'aapl')

pipeline took 30.467437028884888s for aapl
aggregated error : 158.50868881699554
average error by prediction for aapl: 0.08792566878034967


In [28]:
start = time.time()
df = pipeline('crm-6m-weekly.csv', 'crm')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'crm'))
predictor(df, 'crm')

pipeline took 9.660725593566895s for crm
aggregated error : 107.38549891871838
average error by prediction for crm: 2.22604476549286


In [9]:
start = time.time()
df = pipeline('tsla-6m-weekly.csv', 'tsla')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'tsla'))
predictor(df, 'tsla')

pipeline took 12.82651162147522s for tsla
aggregated error : 2973.1671609501464
average error by prediction for tsla: -11.812941524549151


In [10]:
start = time.time()
df = pipeline('fb-6m-weekly.csv', 'fb')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'fb'))
predictor(df, 'fb')

pipeline took 10.964613437652588s for fb
aggregated error : 156.56163627135112
average error by prediction for fb: 1.2611638446461138


In [11]:
start = time.time()
df = pipeline('baba-6m-weekly.csv', 'baba')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'baba'))
predictor(df, 'baba')

pipeline took 12.713487386703491s for baba
aggregated error : 346.6137871138725
average error by prediction for baba: -3.175124251872483


In [12]:
start = time.time()
df = pipeline('msft-6m-weekly.csv', 'msft')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'msft'))
predictor(df, 'msft')

pipeline took 11.158455848693848s for msft
aggregated error : 48.333944998318856
average error by prediction for msft: 0.8648414383188638


In [13]:
start = time.time()
df = pipeline('nflx-6m-weekly.csv', 'nflx')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'nflx'))
predictor(df, 'nflx')

pipeline took 12.216991424560547s for nflx
aggregated error : 1154.4152604516844
average error by prediction for nflx: -6.532616521740055


In [20]:
start = time.time()
df = pipeline('amzn-6m-weekly.csv', 'amzn')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'amzn'))
predictor(df, 'amzn')

pipeline took 11.339673519134521s for amzn
aggregated error : 3462.678062502195
average error by prediction for amzn: 4.462062612569172


In [18]:
start = time.time()
df = pipeline('googl-6m-weekly.csv', 'googl')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'googl'))
predictor(df, 'googl')

pipeline took 13.014968395233154s for googl
aggregated error : 16801.007336800532
average error by prediction for googl: 26.4038610113791


In [19]:
start = time.time()
df = pipeline('wmt-6m-weekly.csv', 'wmt')
end = time.time()
pipeline_time = end - start
print("pipeline took {0}s for {1}".format(pipeline_time, 'wmt'))
predictor(df, 'wmt')

pipeline took 13.363798379898071s for wmt
aggregated error : 39.68606497767715
average error by prediction for wmt: 0.22019270404814031
