In [2]:
import datetime
import os
import time

import numpy as np
import pandas as pd
import seaborn as sns

import indicoio
from textblob import TextBlob
from tiingo import TiingoClient

In [3]:
# Two independent, initially empty module-level lists.
# NOTE(review): presumably meant as "up"/"down" keyword bags for count_words();
# nothing in this section populates or reads them -- confirm before relying on them.
UP, DOWN = [], []

def count_words(original_string, bag_of_words):
    """Return the share of tokens in ``original_string`` that match ``bag_of_words``.

    The text is lower-cased and split on whitespace; each entry of
    ``bag_of_words`` is then counted as an exact token match. Bag entries are
    compared as given, so they must already be lower-case to match anything.

    Args:
        original_string: Text to analyse.
        bag_of_words: Iterable of tokens to look for.

    Returns:
        Total number of matches divided by the number of tokens in the text,
        or ``0.0`` when the text contains no tokens (empty / whitespace only),
        which previously raised ``ZeroDivisionError``.
    """
    tokens = original_string.lower().split()
    if not tokens:
        # Nothing to match against; avoid dividing by zero.
        return 0.0
    matches = sum(tokens.count(word) for word in bag_of_words)
    return matches / len(tokens)
    
def aggregate_jsons(json_list):
    """Flatten news records into one ``"<title> <description>"`` string each.

    Args:
        json_list: Iterable of mappings, each with ``"title"`` and
            ``"description"`` keys.

    Returns:
        A list with one combined string per record, in input order.

    Raises:
        KeyError: If a record lacks ``"title"`` or ``"description"``.
    """
    result = []
    for article in json_list:
        # A null field would make the string concatenation raise TypeError,
        # so treat it as empty text instead.
        headline = article["title"] or ""
        desc = article["description"] or ""
        result.append(headline + " " + desc)
    return result

def pipeline(df_path, ticky, api_key=None):
    """Build a news-sentiment dataframe for one ticker from a CSV of dates.

    For every row of the CSV, news for ``ticky`` is fetched from Tiingo for the
    window ``Date`` .. ``Date + 6 days`` and scored with TextBlob.

    Args:
        df_path: Path of a CSV file that contains a ``Date`` column.
        ticky: Ticker symbol passed to ``TiingoClient.get_news``.
        api_key: Tiingo API key. When omitted, the ``TIINGO_API_KEY``
            environment variable is used.

    Returns:
        The CSV's dataframe without ``Date`` and with these columns appended:
        ``Start`` / ``End`` (``YYYY-MM-DD`` strings), ``JSON`` (raw news
        records), ``corpus`` (all article texts joined into one string),
        ``vectorized`` (list of per-article texts), ``sentiment`` (polarity of
        ``corpus``) and ``sentiment_test`` (mean of per-article polarities,
        NaN when the window has no articles).

    Raises:
        RuntimeError: If no API key is supplied and ``TIINGO_API_KEY`` is unset.
    """
    df = pd.read_csv(df_path)

    # Date window: parse once (vectorised) and render both ends as YYYY-MM-DD.
    dates = pd.to_datetime(df["Date"])
    df["Start"] = dates.dt.strftime("%Y-%m-%d")
    df["End"] = (dates + pd.Timedelta(days=6)).dt.strftime("%Y-%m-%d")
    df = df.drop(columns=["Date"])

    # The key is read from the caller or the environment so that no credential
    # is stored in the notebook itself.
    if api_key is None:
        api_key = os.environ.get("TIINGO_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Tiingo API key missing: pass api_key=... or set TIINGO_API_KEY"
        )
    client = TiingoClient({"api_key": api_key})

    # One news query per row; wrapped in a Series so each cell holds the
    # whole list of records returned for that window.
    df["JSON"] = pd.Series(
        [
            client.get_news(tickers=[ticky], startDate=start, endDate=end)
            for start, end in zip(df["Start"], df["End"])
        ],
        index=df.index,
        dtype=object,
    )

    # Corpus: per-article texts, plus a single string per row. Articles are
    # joined with a space so words at article boundaries are not fused.
    articles = df["JSON"].apply(aggregate_jsons)
    df["corpus"] = articles.apply(" ".join)
    df["vectorized"] = articles

    def _polarity(text):
        # TextBlob polarity score of one piece of text.
        return TextBlob(text).sentiment.polarity

    def _mean_polarity(texts):
        # Average polarity over a row's articles; NaN when there are none
        # (np.mean of an empty list would emit a RuntimeWarning).
        if not texts:
            return np.nan
        return float(np.mean([_polarity(text) for text in texts]))

    # Sentiment of the whole window, and the mean of per-article sentiments.
    # The latter must iterate the article list, not the joined string
    # (iterating a string scores individual characters).
    df["sentiment"] = df["corpus"].apply(_polarity)
    df["sentiment_test"] = df["vectorized"].apply(_mean_polarity)

    # TODO: statistical text features (indicoio.text_features) and
    # bag-of-words ratios (bad_ratio / good_keywords) are not implemented yet.

    return df

In [None]:
# Run the pipeline for TSLA and report how long it took. `d` is read by the
# plotting and preview cells that follow. perf_counter() is used because it is
# a monotonic clock meant for measuring elapsed time.
start = time.perf_counter()
d = pipeline("TSLA-daily.csv", "tsla")
elapsed = time.perf_counter() - start
print("Function took {0}s to run".format(elapsed))

In [None]:
# Volume against both sentiment measures, drawn on the same axes.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.lineplot(x=d["sentiment"], y=d["Volume"], label="sentiment")
sns.lineplot(x=d["sentiment_test"], y=d["Volume"], label="sentiment_test");

In [None]:
# Preview the first rows of `d`, the dataframe returned by the pipeline() call above.
d.head()

In [None]:
# Run the pipeline for MSFT, report the elapsed time, then plot Volume against
# both sentiment measures on the same axes.
start = time.perf_counter()  # monotonic clock, suited to elapsed-time measurement
d = pipeline("MSFT-daily.csv", "msft")
elapsed = time.perf_counter() - start
print("Function took {0}s to run".format(elapsed))
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.lineplot(x=d["sentiment"], y=d["Volume"], label="sentiment")
sns.lineplot(x=d["sentiment_test"], y=d["Volume"], label="sentiment_test");

In [None]:
# Run the pipeline for AAPL, report the elapsed time, then plot Volume against
# both sentiment measures on the same axes.
start = time.perf_counter()  # monotonic clock, suited to elapsed-time measurement
d = pipeline("AAPL-daily.csv", "aapl")
elapsed = time.perf_counter() - start
print("Function took {0}s to run".format(elapsed))
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.lineplot(x=d["sentiment"], y=d["Volume"], label="sentiment")
sns.lineplot(x=d["sentiment_test"], y=d["Volume"], label="sentiment_test");