# regression bois

In [6]:
import pandas as pd
import numpy as np
import datetime
from tiingo import TiingoClient
from sklearn.impute import SimpleImputer
import indicoio
import seaborn as sns
import time
from textblob import TextBlob
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model

# feature vector creation

In [7]:
def good_bag(string_vector):
    bag = pd.read_csv("data/bag_of_words.csv")
    good_bag = bag["Good Words"].apply(str.lower).to_list()
    good_count = 0
    for word in good_bag:
        good_count += string_vector.count(word)
    good = good_count/len(string_vector)
    return good

def bad_bag(string_vector):
    bag = pd.read_csv("data/bag_of_words.csv")
    bad_bag = bag["Bad Words"].apply(str.lower).to_list()
    bad_count = 0
    for word in bad_bag:
        bad_count += string_vector.count(word)
    bad = bad_count/len(string_vector)
    return bad
    
def aggregate_jsons(json_list):
    result = []
    for json in json_list:
        headline = json["title"]
        desc = json["description"]
        x = headline + " " + desc
        result.append(x)
    return result

def pipeline(df_path, ticky):
    # get the dataframe from csv
    df = pd.read_csv(df_path)
    
    # fix dates
    df["Date"] = df["Date"].apply(lambda x: pd.to_datetime(x, infer_datetime_format=True))
    df["Start"] = df["Date"].apply(str)
    six_days = lambda start_date: start_date + datetime.timedelta(days=6)
    df["End"] = df["Date"].apply(six_days).apply(str)
    df = df.drop(columns=["Date"])
    remove_time = lambda dt: dt[0:10]
    df["Start"] = df["Start"].apply(remove_time)
    df["End"] = df["End"].apply(remove_time)
    
    # add json
    client = TiingoClient({"api_key": "a265fc4a1013923f970d16e7348195074e97fcb0"})
    query_ticker = lambda t, s, e: client.get_news(tickers=[t], startDate=s, endDate=e)
    df["JSON"] = df.apply(lambda d: query_ticker(ticky, d["Start"], d["End"]), axis=1)
    
    # create corpus
    df["corpus"] = df["JSON"].apply(aggregate_jsons)
    df["vectorized"] = df["corpus"]
    combinatric = lambda l: ''.join(l)
    df["corpus"] = df["vectorized"].apply(combinatric)
    indicoio.config.api_key = "25b83c4c388204edd2c6c11cd907e048"
    
    # add sentiment
    df["sentiment"] = df["corpus"].apply(lambda orig: TextBlob(orig).sentiment.polarity)
    df["sentiment_test"] = df["vectorized"].apply(lambda orig: [TextBlob(o).sentiment.polarity for o in orig]).apply(np.mean)
    df["indico_sentiment"] = df["corpus"].apply(lambda text: indicoio.sentiment_hq(text))
    
    # add bag of words featurization
    df["bad_bag"] = df["vectorized"].apply(bad_bag)
    df["good_bag"] = df["vectorized"].apply(good_bag)
    
    # return the nice beautiful dataframe
    return df

# bayesian ridge regression
predicting close price from open and other features

In [8]:
def mse(v1, v2):
        return np.sum((v1 - v2) ** 2) 

def impute(X):
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(X)
    X = imp.transform(X)
    return X

In [9]:
def imputer(d, ticky):
    d["lastweek"] = d["Close"]
    df.lastweek = df.lastweek.shift(7) ## shift down
    #df.lastweek.drop(df.gdp.shape[0] - 1,inplace = True) ## removing the first row
    X = d[["Open", "lastweek", "indico_sentiment", "sentiment", "sentiment_test", "bad_bag", "good_bag"]]
    Y = d["Close"]
    
    X = impute(X)
    
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 69)
    
    model = linear_model.BayesianRidge()
    model.fit(X_train, Y_train)
    Y_predicted = model.predict(X_test)
    error = mse(Y_predicted, Y_test)
    print("aggregated error : {0}".format(error))
    print("average error by prediction for {1}: {0}".format(np.mean(Y_predicted - Y_test), ticky))

In [10]:
def run_ticker_linear(ticky):
    print("******{0}*********".format(ticky)*5)
    start = time.time()
    df = pipeline('data/6m-weekly/{0}-6m-weekly.csv'.format(ticky), ticky)
    end = time.time()
    pipeline_time = end - start
    print("pipeline took {0}s for {1}".format(pipeline_time, ticky))
    imputer(df, ticky)

In [11]:
tickers = ['aapl', 'crm', 'tsla', 'fb', 'baba', 'msft', 'nflx', 'amzn', 'googl', 'wmt']

In [20]:
for ticker in tickers:
    run_ticker_linear(ticker)

******aapl***************aapl***************aapl***************aapl***************aapl*********
pipeline took 30.648231744766235s for aapl
aggregated error : 1.449004747382989e-16
average error by prediction for aapl: 3.7402969610411674e-10
******crm***************crm***************crm***************crm***************crm*********
pipeline took 30.65610694885254s for crm
aggregated error : 8.08661352611945e-16
average error by prediction for crm: 2.013820221691276e-09
******tsla***************tsla***************tsla***************tsla***************tsla*********
pipeline took 34.168715953826904s for tsla
aggregated error : 2.661535321437689e-16
average error by prediction for tsla: -3.6366335128453406e-09
******fb***************fb***************fb***************fb***************fb*********
pipeline took 30.925320386886597s for fb
aggregated error : 1.6780559364645576e-16
average error by prediction for fb: 1.6132572808500198e-09
******baba***************baba***************baba**********

# logistic regression
predicting whether it will be an up week or down week based on same features as above

In [36]:
# adding the up/down label
# DOWN - 0
# UP - 1

def logistic_accuracy(y_pred, y_real):
    y_pred, y_real = list(y_pred), list(y_real)
    ct = 0
    for i in range(len(y_pred)):
        if y_pred[i] == y_real[i]:
            ct += 1
    return ct/len(y_pred)

def add_updown(df):
    difs = df["Open"] - df["Close"]
    x = pd.Series([1 if dif < 0 else 0 for dif in difs])
    df["triangle"] = x
    return df

def prepare_logistic(path, ticky):
    df = pipeline(path, ticky)
    df = add_updown(df)
    return df

def logistic_reg(df, ticky):
    X = impute(df[["indico_sentiment", "sentiment", "sentiment_test", "bad_bag", "good_bag"]])
    Y = df["triangle"]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state = 69)
    
    model = linear_model.LogisticRegression(solver="lbfgs")
    model.fit(X_train, Y_train)
    Y_predicted = model.predict(X_test)
    
    error_pct = logistic_accuracy(Y_predicted, Y_test)
    return error_pct

def run_logistic(ticker_list):
    print("beginning logistic regression")
    for ticker in tickers:
        print("******{0}*********".format(ticker)*5)
        filepath = 'data/6m-weekly/{0}-6m-weekly.csv'.format(ticker)
        start = time.time()
        df = prepare_logistic(filepath, ticker)
        end = time.time()
        pipeline_time = end - start
        print("pipeline took {0}s for {1}".format(pipeline_time, ticker))
        accuracy_pct = logistic_reg(df, ticker) * 100
        print("ticker: {0}\naccuracy{1}%".format(ticker, accuracy_pct))
    print("finished logistic regression")

In [37]:
run_logistic(tickers)

beginning logistic regression
******aapl***************aapl***************aapl***************aapl***************aapl*********
pipeline took 30.623436212539673s for aapl
ticker: aapl
accuracy71.42857142857143%
******crm***************crm***************crm***************crm***************crm*********




pipeline took 29.559911012649536s for crm
ticker: crm
accuracy42.857142857142854%
******tsla***************tsla***************tsla***************tsla***************tsla*********




pipeline took 33.41772508621216s for tsla
ticker: tsla
accuracy42.857142857142854%
******fb***************fb***************fb***************fb***************fb*********




pipeline took 29.98150086402893s for fb
ticker: fb
accuracy71.42857142857143%
******baba***************baba***************baba***************baba***************baba*********




pipeline took 35.60064387321472s for baba
ticker: baba
accuracy71.42857142857143%
******msft***************msft***************msft***************msft***************msft*********




pipeline took 37.82983589172363s for msft
ticker: msft
accuracy85.71428571428571%
******nflx***************nflx***************nflx***************nflx***************nflx*********




pipeline took 32.75851011276245s for nflx
ticker: nflx
accuracy71.42857142857143%
******amzn***************amzn***************amzn***************amzn***************amzn*********




pipeline took 33.28855013847351s for amzn
ticker: amzn
accuracy85.71428571428571%
******googl***************googl***************googl***************googl***************googl*********




pipeline took 32.60937714576721s for googl
ticker: googl
accuracy28.57142857142857%
******wmt***************wmt***************wmt***************wmt***************wmt*********




pipeline took 34.14972400665283s for wmt
ticker: wmt
accuracy71.42857142857143%
finished logistic regression


