# Analytical Model
We start with the analytical block and some exploration of the market data set.

We will:
1. explore some features that we can create from the market data
1. find a way to train a model day-by-day
1. ???
1. profit


In [None]:
from kaggle.competitions import twosigmanews
# Create the competition environment object that hands out the data.
# You can only call make_env() once, so don't lose it!
env = twosigmanews.make_env()

# returns the training data DataFrames as a tuple of:
#   (market data, news data)
(market_train_df, news_train_df) = env.get_training_data()

# size of total data, printed as the (rows, columns) shape of each frame
print("Market Train Size: ", market_train_df.shape)
print("News Train Size: ", news_train_df.shape)

# we only care about the market data here
# (last expression of the cell, so the first rows are shown as cell output)
market_train_df.head()

## Exploration
For now, let's use a single asset code and explore the relationship between its features and the target.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Collect every distinct asset code present in the market training data.
uAssestCode = pd.unique(market_train_df["assetCode"])
print("Unique asset codes: ", len(uAssestCode))
print(uAssestCode[0])

# Restrict the exploration below to the rows of the first asset code.
is_first_asset = market_train_df["assetCode"] == uAssestCode[0]
df = market_train_df.loc[is_first_asset]

In [None]:
# rough stock chart: open and close price of the selected asset over time
fig, ax = plt.subplots()
ax.plot(df.time,
        df.close,
        color='blue',
        label='close')
ax.plot(df.time,
        df.open,
        color='red',
        label='open')
ax.grid()
ax.set_xlabel('time')
# both price series share this axis, so label it generically
ax.set_ylabel('price')
ax.set_title('plotting ' + uAssestCode[0])
# the legend is built from the label= of each line; without labels it is empty
ax.legend(loc='lower right')

**Target**

returnsOpenNextMktres10

Let's plot the target and separate it into two classes:
* positive gain ( > 0)
* no or negative gain (<= 0)

This will make feature exploration easier

In [None]:
# target value: the column we want to predict, plotted over time
fig, ax = plt.subplots()
ax.plot(df.time,
        df.returnsOpenNextMktres10,
        color='green',
        label='returnsOpenNextMktres10')
ax.grid()
ax.set_xlabel('time')
ax.set_ylabel('target')
ax.set_title('plotting: ' + uAssestCode[0])
# the legend is built from the label= of the line; without a label it is empty
ax.legend(loc='lower right')

In [None]:
# pull the raw target column out of the single-asset frame
yo = df['returnsOpenNextMktres10']

# binary class label: 1.0 where the target is strictly positive, else 0.0
y = np.where(yo > 0, 1.0, 0.0)

**Features**

Raw:
* volume
* open
* close

DSP based:
* These will have to come later (based on obs)?

Market based:
* gain = close - open
* gpv = gain / volume


In [None]:
# volume
v     = df.volume
vol2  = np.power(df.volume, 1/2)

# raw volume against its square root, coloured by the class label y
fig, ax = plt.subplots()
ax.scatter(v,
        vol2,
        c=y)
ax.grid()
ax.set_xlabel('volume')
ax.set_ylabel('sqrt(volume)')
ax.set_title('plotting ' + uAssestCode[0])

# lets try to break this up into bins
# and look at the bins lined up with the target
vbins, ved = np.histogram(v, bins=20)
npv = np.array(v)
npy = np.array(y)
nbins = ved.shape[0] - 1

# Bin index of every sample, following np.histogram's convention: bins are
# half-open [left, right) except the last one, which also includes its right
# edge. searchsorted(side='right') - 1 gives exactly that once the maximum is
# clipped into the last bin, so samples sitting on an edge are no longer lost.
bin_idx = np.clip(np.searchsorted(ved, npv, side='right') - 1, 0, nbins - 1)
volumeb = bin_idx.astype(float)

# mean class label per bin, including the first bin; empty bins stay at 0
ymean = np.zeros(nbins)
for i in range(nbins):
    meets_range = npy[bin_idx == i]
    if meets_range.size > 0:
        ymean[i] = meets_range.mean()
fill = np.linspace(1.0, ymean.shape[0], num=ymean.shape[0])

# bar
fig, ax = plt.subplots()
ax.bar(fill, ymean)
ax.grid()
ax.set_xlabel('bins')
ax.set_ylabel('y mean')
ax.set_title('plotting ' + uAssestCode[0])

# this looks good! as bins

In [None]:
# gain
# features to test
gain  = df.close - df.open          # intraday move
gain2 = np.power(gain, 2)           # squared move (kept for experiments)
gainb = np.zeros(gain.shape[0])     # 1 where the day closed above its open
gainb[gain > 0] = 1

# plot the raw gain against its binary flag, coloured by the class label y
fig, ax = plt.subplots()
ax.scatter(gain,
           gainb,
           c=y)
ax.grid()
# label the axes with what is actually plotted
ax.set_xlabel('gain (close - open)')
ax.set_ylabel('gainb (1 if gain > 0)')
ax.set_title('plotting: ' + uAssestCode[0])

# this looks good!

In [None]:
# gpv: gain per unit of traded volume
gpv  = (df.close - df.open) / df.volume
gpv2 = np.power(gpv, 2)
fill = np.linspace(1.0, gpv.shape[0], num=gpv.shape[0])

# squared gpv against gpv, coloured by the class label y
fig, ax = plt.subplots()
ax.scatter(gpv2,
           gpv,
           c=y)
ax.grid()
# label the axes with what is actually plotted
ax.set_xlabel('gpv^2')
ax.set_ylabel('gpv')
# no legend: the scatter carries no labelled artists, so a legend would be empty
ax.set_title('plotting ' + uAssestCode[0])

# this looks ok, lets not use this as a feature for now

**Model**

SVM

In [None]:
# Gather the chosen feature arrays into one frame; the integer keys become
# the column labels. The raw gain (key 1) is left out for now.
Xdict = dict(zip((2, 3), (gainb, volumeb)))
X     = pd.DataFrame(Xdict)

# quick sanity check of what was built
print(type(X))
print(X.dtypes)
X.head()

In [None]:
# reference:
#    https://stackabuse.com/implementing-svm-and-kernel-svm-with-pythons-scikit-learn/ 
# time to train initial prediction model
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import minmax_scale
from sklearn import svm
 
# Three-way class label from the raw target, with a tiny dead band around 0:
#   +1 above 1e-6, -1 below -1e-6, 0 in between (and for missing values)
y = np.where(yo > 1e-6, 1.0, np.where(yo < -1e-6, -1.0, 0.0))

# hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# fit a linear support-vector classifier on the training split
# (alternatives tried: svm.SVR(C=0.7, kernel='rbf'), svm.SVC(C=0.7))
clf = svm.LinearSVC()
clf.fit(X_train, y_train)

In [None]:
# rescale the raw target into the range [-1, 1] so it can be compared with
# the class labels plotted below
y_scale = minmax_scale(list(yo), feature_range=(-1, 1), axis=0, copy=True)

# common x axis: sample position 1..N
x = np.linspace(1.0, y_scale.shape[0], num=y_scale.shape[0])

# raw target (yo)
fig, ax = plt.subplots()
ax.plot(x, yo, color='green')
ax.grid()
ax.set_xlabel('time')
ax.set_ylabel('target')
ax.set_title('ya: ' + uAssestCode[0])

# rescaled target (y_scale)
fig, ax = plt.subplots()
ax.plot(x, y_scale, color='green')
ax.grid()
ax.set_xlabel('time')
ax.set_ylabel('target')
ax.set_title('scale: ' + uAssestCode[0])

# class labels (y) that the model was trained on
fig, ax = plt.subplots()
ax.plot(x, y, color='green')
ax.grid()
ax.set_xlabel('time')
ax.set_ylabel('target')
ax.set_title('y: ' + uAssestCode[0])

In [None]:
# test and analysis
import numpy as np
# predict a class label for every held-out sample
y_pred = clf.predict(X_test)

# plot pred and truth
# x is the position of each sample within the held-out split (1..N)
x = np.linspace(1.0, y_pred.shape[0], num=y_pred.shape[0])

# predicted
fig, ax = plt.subplots()
ax.plot(x, y_pred, color='green')
ax.grid()
ax.set_xlabel('time')
ax.set_ylabel('target')
ax.set_title('predicted: ' + uAssestCode[0])
print('y_pred mean: ', np.mean(y_pred))

# actual
fig, ax = plt.subplots()
ax.plot(x, y_test, color='blue')
ax.grid()
ax.set_xlabel('time')
ax.set_ylabel('target')
ax.set_title('actual: ' + uAssestCode[0])
print('y_test mean: ', np.mean(y_test))

# squared error
# per-sample half squared difference between true and predicted label
y_test = np.array(y_test)
sq_err = 1/2 * np.power((y_test - y_pred), 2)
#
fig, ax = plt.subplots()
ax.plot(x, sq_err, color='red')
ax.grid()
ax.set_xlabel('time')
ax.set_ylabel('target')
ax.set_title('Squared Error: ' + uAssestCode[0])
print('sq_err mean: ', np.mean(sq_err))

## Feature Function

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def _gain_columns(market_data):
    """Return ``(gain, gainb)``: close minus open, and a 1.0/0.0 flag for gain > 0."""
    gain = market_data.close - market_data.open
    gainb = np.where(gain > 0, 1.0, 0.0)
    return gain, gainb


def _volume_bins(volume, n_bins=20):
    """Return ``(bin index per sample as floats, bin edges)`` for equal-width bins."""
    npv = np.asarray(volume, dtype=float)
    _, edges = np.histogram(npv, bins=n_bins)
    # Same convention as np.histogram: bins are half-open [left, right) except
    # the last, which also contains its right edge. Samples lying exactly on
    # an edge (including the maximum) therefore land in a real bin instead of
    # silently falling back to bin 0.
    idx = np.searchsorted(edges, npv, side='right') - 1
    return np.clip(idx, 0, n_bins - 1).astype(float), edges


def analysis_get_features(market_data, byday=False, trainInfo=None):
    """Build the model's feature frame from a market data frame.

    Features (column labels are the integers 1, 2, 3):
        1 - gain:    close - open
        2 - gainb:   1.0 where gain > 0, else 0.0
        3 - volumeb: index (0..19) of the equal-width volume bin of the row,
                     with the bins spanning the volumes of ``market_data``

    Args:
        market_data: frame with ``assetCode``, ``open``, ``close`` and
            ``volume`` columns.
        byday: False to process a full training set (asset uids are created
            from scratch); True to process a later batch, re-using and
            extending the uids in ``trainInfo``.
        trainInfo: tuple ``(uidList, uidMap, ved)`` returned by a previous
            ``byday=False`` call. Required when ``byday`` is True.

    Returns:
        ``(X, trainInfo)``. For ``byday=False`` the tuple holds the uid array,
        the assetCode -> uid dict and the volume bin edges. For ``byday=True``
        it is the incoming info with unseen asset codes registered; the uid
        dict is also updated in place.

    Raises:
        ValueError: if ``byday`` is True and ``trainInfo`` is None.

    Note:
        The asset uid is tracked but not part of the feature frame yet.
    """
    if not byday:
        # assign uids 1.0, 2.0, ... to the asset codes in order of appearance
        uAssestCode = pd.unique(market_data.assetCode)
        uidList = np.arange(1, len(uAssestCode) + 1, dtype=float)
        uidMap = dict(zip(uAssestCode, uidList))
    else:
        if trainInfo is None:
            raise ValueError("trainInfo from the training pass is required when byday=True")
        uidList, uidMap = trainInfo[0], trainInfo[1]

        # register asset codes that did not exist at training time
        new_codes = [code for code in pd.unique(market_data.assetCode)
                     if code not in uidMap]
        if new_codes:
            known = []
            if len(uidList):
                known.append(float(np.max(uidList)))
            if uidMap:
                # the dict is the state that survives between calls, so it
                # must be consulted to keep the new uids unique
                known.append(float(max(uidMap.values())))
            next_uid = max(known, default=0.0) + 1.0
            fresh = next_uid + np.arange(len(new_codes))
            uidMap.update(zip(new_codes, fresh))
            # np.append returns a new array, so the result has to be kept
            uidList = np.append(uidList, fresh)
            trainInfo = (uidList, uidMap) + tuple(trainInfo[2:])

    # feature 1, 2 - gain, gainb
    gain, gainb = _gain_columns(market_data)

    # feature 3 - volumeb
    # TODO consider using the training bin edges (trainInfo[2]) when byday is
    # set; binning each batch on its own range keeps the feature proportionate
    volumeb, ved = _volume_bins(market_data.volume)

    # features to dataframe
    X = pd.DataFrame({1: gain, 2: gainb, 3: volumeb})

    if not byday:
        # save off training information
        trainInfo = (uidList, uidMap, ved)

    return X, trainInfo

In [None]:
# debug 
# subset = market_train_df.head()
# build the feature frame for the whole training set and keep the training
# info, which is passed along to the prediction step
features, trainInfo = analysis_get_features(market_train_df)

# show the first feature rows as cell output
features.head()

## Training Function

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale
from sklearn import svm

def analysis_train(features, target, threshold=1e-3):
    """Fit a linear support-vector classifier on three-way return labels.

    The continuous ``target`` is turned into class labels with a dead band:
    +1 above ``threshold``, -1 below ``-threshold`` and 0 in between (missing
    values compare False on both sides and therefore also become 0).

    Args:
        features: feature frame, one row per entry of ``target``.
        target: raw target values (Series, array or list).
        threshold: half-width of the dead band around zero; must not be
            negative. Defaults to the previously hard-coded 1e-3.

    Returns:
        The fitted ``svm.LinearSVC`` instance.

    Raises:
        ValueError: if ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    # plain float array so the comparisons below are purely positional
    target = np.asarray(target, dtype=float)

    y = np.zeros(target.shape[0])
    y[target > threshold] = 1
    y[target < -threshold] = -1

    # linear SVM classifier over the labels {-1, 0, 1}
    clf = svm.LinearSVC()
    clf.fit(features, y)

    return clf

In [None]:
# train on the full feature frame against the competition target column
trainedModel = analysis_train(features, market_train_df['returnsOpenNextMktres10'])
# printing the estimator shows its configuration
print(trainedModel)

# Daily Values
* While there are more prediction day(s) and `predict` was called successfully since the last yield, yields a tuple of:
    * `market_observations_df`: DataFrame with market observations for the next prediction day.
    * `news_observations_df`: DataFrame with news observations for the next prediction day.
    * `predictions_template_df`: DataFrame with `assetCode` and `confidenceValue` columns, prefilled with `confidenceValue = 0`, to be filled in and passed back to the `predict` function.
* If `predict` has not been called since the last yield, yields `None`.

In [None]:
# You can only iterate through a result from `get_prediction_days()` once
# so be careful not to lose it once you start iterating.
#days = env.get_prediction_days()
#(market_obs_df, news_obs_df, predictions_template_df) = next(days)

In [None]:
# debug
#p = analysis_predict(market_obs_df, predictions_template_df, trainInfo, trainedModel)

# info
#print("market_obs_df size: ", market_obs_df.shape)
#market_obs_df.head()

In [None]:
#news_obs_df.head()

In [None]:
#p.head()

In [None]:
#predictions_template_df.head()

## Prediction Function

In [None]:
def analysis_predict(market_obs, predictions, trainInfo, trainedModel):
    """Fill ``predictions`` with the model's output for one batch of observations.

    Args:
        market_obs: market observations to build features from.
        predictions: frame whose ``confidenceValue`` column is overwritten in
            place; it must have one row per row of ``market_obs``, in the
            same order.
        trainInfo: training info returned by ``analysis_get_features``.
        trainedModel: fitted estimator exposing ``predict``.

    Returns:
        The same ``predictions`` frame, for convenience; it is also modified
        in place, so callers that ignore the return value keep working.
    """
    features, _ = analysis_get_features(market_obs, True, trainInfo)

    # the model's class labels are used directly as confidence values
    p_class = trainedModel.predict(features)

    # item assignment always writes the column; attribute assignment would
    # silently create a plain attribute if the column were missing
    predictions['confidenceValue'] = p_class

    return predictions

## Main Loop
Let's loop through all the days and make our predictions with the trained model.  The `days` generator (returned from `get_prediction_days`) will simply stop returning values once you've reached the end.

In [None]:
# generator over the prediction days; it is consumed by the main loop below
days = env.get_prediction_days()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Walk through every prediction day: fill the day's template with the model
# output, then hand it back to the environment before moving on.
for day in days:
    market_obs_df, news_obs_df, predictions_template_df = day
    analysis_predict(market_obs_df, predictions_template_df, trainInfo, trainedModel)
    env.predict(predictions_template_df)
print('Done!')

In [None]:
# first rows of the template filled in for the last day of the loop
predictions_template_df.head()

## **`write_submission_file`** function

Writes your predictions to a CSV file (`submission.csv`) in the current working directory.

In [None]:
# write the collected predictions to submission.csv in the working directory
env.write_submission_file()

In [None]:
# We've got a submission file!
import os
# every entry of the working directory whose name contains '.csv'
csv_files = [entry for entry in os.listdir('.') if '.csv' in entry]
print(csv_files)

As indicated by the helper message, calling `write_submission_file` on its own does **not** make a submission to the competition.  It merely tells the module to write the `submission.csv` file as part of the Kernel's output.  To make a submission to the competition, you'll have to **Commit** your Kernel and find the generated `submission.csv` file in that Kernel Version's Output tab (note this is _outside_ of the Kernel Editor), then click "Submit to Competition".  When we re-run your Kernel during Stage Two, we will run the Kernel Version (generated when you hit "Commit") linked to your chosen Submission.

In [None]:
# lets check out that CSV file
import pandas as pd
from datetime import datetime
import csv
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

headers = ['time', 'assetCode', 'confidenceValue']
# header=0 consumes the file's own header row and replaces it with `headers`;
# with names= alone that row would be read as a data row of strings.
# NOTE(review): assumes submission.csv starts with a header row - confirm.
df_in  = pd.read_csv('submission.csv', header=0, names=headers)

print(df_in)

# take the asset code found in row 3 and keep only that asset's rows
code = df_in.assetCode[3]
df   = df_in[df_in.assetCode == code]

# confidence values of that asset against their position 1..N
y = np.array(df['confidenceValue'], dtype=float)
x = np.linspace(1.0, y.shape[0], num=y.shape[0])

# plot
plt.plot(x,y)
plt.show()