<a href="https://colab.research.google.com/github/stevegbrooks/big-portfolio-learner/blob/construct_portfolio/GLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CIS 545 Final Project

## Big Portfolio Learner: Construct Portfolio by GLM

### Team members: Steven Brooks & Chenlia Xu

# Section 1: Setting Up Environment

In [125]:
import random
import numpy as np 
import json
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from datetime import datetime
import glob
import seaborn as sns
import re
import os

In [126]:
%%capture
## If boto3 not already installed uncomment the following:
!pip3 install boto3

In [127]:
import boto3
from botocore import UNSIGNED

from botocore.config import Config

s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED))
s3.Bucket('cis545project').download_file('data/stock_data.zip', 'stock_data.zip')
s3.Bucket('cis545project').download_file('data/technical_data.zip', 'technical_data.zip')

In [128]:
%%capture

stock_dir = "stock_data"
if not os.path.exists(stock_dir):
  os.makedirs(stock_dir)
!unzip /content/stock_data.zip -d /content/$stock_dir
!rm -f $stock_dir/.gitempty

tech_dir = "technical_data"
if not os.path.exists(tech_dir):
  os.makedirs(tech_dir)
!unzip /content/technical_data.zip -d /content/$tech_dir
!rm -f $tech_dir/.gitempty

# Section 2: Setup Spark Session

In [239]:
%%capture

!apt install libkrb5-dev
!wget https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install findspark
!pip install sparkmagic
!pip install pyspark==2.4.5
!pip install pyspark --user
!pip install seaborn --user
!pip install plotly --user
!pip install imageio --user
!pip install folium --user

!apt update
!apt install gcc python-dev libkrb5-dev


In [130]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

import os

spark = SparkSession.builder.getOrCreate()


In [131]:
%load_ext sparkmagic.magics

The sparkmagic.magics extension is already loaded. To reload it, use:
  %reload_ext sparkmagic.magics


In [258]:
#graph section
import networkx as nx
# SQLite RDBMS
import sqlite3
# Parallel processing
# import swifter
import pandas as pd
# NoSQL DB
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError, OperationFailure

import os
os.environ['SPARK_HOME'] = '/content/spark-2.4.5-bin-hadoop2.7'
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
import pyspark
from pyspark.sql import SQLContext

try:
    if(spark == None):
        spark = SparkSession.builder.appName('Initial').getOrCreate()
        sqlContext=SQLContext(spark)
except NameError:
    spark = SparkSession.builder.appName('Initial').getOrCreate()
    sqlContext=SQLContext(spark)

# Section 3: Load stock and technical data

In [135]:
stock_data_sdf = spark.read.load(
    'stock_data/*.csv', 
    format = 'csv', 
    header = 'true', 
    inferSchema = 'true', 
    sep = ','
)

First we'll set up the spark dataframe for stock prices using the work in `step1a`.

In [268]:
from pyspark.sql.functions import to_timestamp
from pyspark.sql.functions import year, month, date_format


stock_data_sdf = stock_data_sdf.withColumn("timestamp_as_dt", to_timestamp(stock_data_sdf.timestamp, 'yyyy-MM-dd'))
stock_data_sdf = stock_data_sdf.withColumn("year", year('timestamp_as_dt'))
stock_data_sdf = stock_data_sdf.filter("year >= 2002 AND year <= 2019")

count_by_symbol_year_sdf = stock_data_sdf.groupBy(["symbol", "year"]).count()
count_years_by_symbol_sdf = count_by_symbol_year_sdf.groupBy(["symbol"]).count()

### Just grab stocks that have data in each of the 18 years from 2002 to 2019
### AND remove the three outliers
stocks_with_all_analysis_yrs_sdf = count_years_by_symbol_sdf.filter("count == 18") #18 years of data from 2002 and 2019
stocks_to_remove = ['DCTH', 'BRK-A', 'AIKI']
stocks_with_all_analysis_yrs_sdf = stocks_with_all_analysis_yrs_sdf.filter(stocks_with_all_analysis_yrs_sdf.symbol.isin(stocks_to_remove) == False)

stock_data_sdf.createOrReplaceTempView("stock_data")
stocks_with_all_analysis_yrs_sdf.createOrReplaceTempView("stocks_with_all_analysis_yrs")

stock_data_2002_2019_sdf = spark.sql(
    """
    SELECT *
    FROM stock_data
    WHERE symbol IN (SELECT symbol FROM stocks_with_all_analysis_yrs)
    """
)

ValueError: ignored

Second, we set up the spark dataframe for technical indicators.  

In [None]:
technical_data_sdf = spark.read.load(
    'technical_data/*.csv', 
    format = 'csv', 
    header = 'true', 
    inferSchema = 'true', 
    sep = ','
)

Then we will reduce the technical data set to just those stocks that match up with the first dataset above.

In [None]:
technical_data_sdf = technical_data_sdf.withColumn("timestamp_as_dt", to_timestamp(technical_data_sdf.timestamp, 'yyyy-MM-dd'))
technical_data_sdf = technical_data_sdf.withColumn("year", year('timestamp_as_dt'))
technical_data_sdf = technical_data_sdf.filter("year >= 2002 AND year <= 2019")

technical_data_sdf.createOrReplaceTempView("technical_data")

technical_data_2002_2019_sdf = spark.sql(
    """
    SELECT *
    FROM technical_data
    WHERE symbol IN (SELECT symbol FROM stocks_with_all_analysis_yrs)
    """
)

# Section 4: Load Other Data.  

### Step 1: Download other data from google drive.  

In [259]:
from google.colab import drive
drive.mount('/content/gdrive')

os.mkdir('/content/other_data')
!cp '/content/gdrive/My Drive/other_data/forex.csv' other_data/forex.csv
!cp '/content/gdrive/My Drive/other_data/gdp.csv' other_data/gdp.csv
!cp '/content/gdrive/My Drive/other_data/stockindex.csv' other_data/stockindex.csv
!cp '/content/gdrive/My Drive/other_data/tbond.csv' other_data/tbond.csv
!cp '/content/gdrive/My Drive/other_data/unemployment.csv' other_data/unemployment.csv


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


FileExistsError: ignored

### Step 2: Acquire overnight stock index performance of other major stock markets, e.g. FTSE, DAX, CAC, Nikkei, HKSE, SHSE.

In [260]:
stockindex_data = pd.read_csv('/content/other_data/stockindex.csv')

stockindex_data = stockindex_data[['date', 'FTSE100', 'DAX', 'CAC', 'Nikkei225', 'Hang_Seng', 'SHSZ300']]
stockindex_data['date'] = pd.to_datetime(stockindex_data.date)
stockindex_data = stockindex_data[stockindex_data['date'] < pd.Timestamp(2020,1,1)]

# filling missing data
stockindex_data = stockindex_data.fillna(method='ffill')
stockindex_data.iloc[0, stockindex_data.columns.get_loc('Nikkei225')] = 10871.49
stockindex_data.iloc[1, stockindex_data.columns.get_loc('Nikkei225')] = 10871.49
stockindex_data.iloc[0, stockindex_data.columns.get_loc('SHSZ300')] = 1316.46
stockindex_data.iloc[1, stockindex_data.columns.get_loc('SHSZ300')] = 1316.46

# add quarter / month column
stockindex_data['quarter'] = stockindex_data['date'].dt.to_period('Q')
stockindex_data['month'] = stockindex_data['date'].dt.to_period('M')

# calculating day-over-day % price change
stockindex_data['FTSE100_DoD'] = stockindex_data['FTSE100'].pct_change(1)
stockindex_data['DAX_DoD'] = stockindex_data['DAX'].pct_change(1)
stockindex_data['CAC_DoD'] = stockindex_data['CAC'].pct_change(1)
stockindex_data['Nikkei225_DoD'] = stockindex_data['Nikkei225'].pct_change(1)
stockindex_data['Hang_Seng_DoD'] = stockindex_data['Hang_Seng'].pct_change(1)
stockindex_data['SHSZ300_DoD'] = stockindex_data['SHSZ300'].pct_change(1)
stockindex_data = stockindex_data[['date', 'quarter', 'month', 'FTSE100_DoD', 'DAX_DoD', 'CAC_DoD', 'Nikkei225_DoD', 'Hang_Seng_DoD', 'SHSZ300_DoD']]
                        
stockindex_data.head()

Unnamed: 0,date,quarter,month,FTSE100_DoD,DAX_DoD,CAC_DoD,Nikkei225_DoD,Hang_Seng_DoD,SHSZ300_DoD
0,2002-01-02,2002Q1,2002-01,,,,,,
1,2002-01-03,2002Q1,2002-01,0.019259,0.019817,0.021998,0.0,0.006402,0.0
2,2002-01-04,2002Q1,2002-01,0.00094,0.009191,0.00034,0.0,0.024391,0.0
3,2002-01-07,2002Q1,2002-01,-0.005673,-0.016265,-0.014361,0.006519,0.016278,-0.010923
4,2002-01-08,2002Q1,2002-01,-0.008161,0.000793,-0.01028,-0.022551,-0.015045,-0.007196


### Step 3: Acquire tresaury bond (1-yr, 3yr, 10-yr) market daily performance

In [261]:
tbond_data = pd.read_csv('/content/other_data/tbond.csv')

tbond_data = tbond_data[['date', '3M', '1Y', '10Y']]
tbond_data['date'] = pd.to_datetime(tbond_data.date)
tbond_data = tbond_data[tbond_data['date'] < pd.Timestamp(2020,1,1)]

tbond_data['3M_DoD'] = tbond_data['3M'].pct_change(1)
tbond_data['1Y_DoD'] = tbond_data['1Y'].pct_change(1)
tbond_data['10Y_DoD'] = tbond_data['10Y'].pct_change(1)
tbond_data = tbond_data[['date', '3M_DoD', '1Y_DoD','10Y_DoD']]
                        
tbond_data.head()

Unnamed: 0,date,3M_DoD,1Y_DoD,10Y_DoD
0,2002-01-02,,,
1,2002-01-03,-0.005747,-0.017544,-0.007692
2,2002-01-04,-0.00578,0.004464,0.003876
3,2002-01-07,-0.023256,-0.026667,-0.017375
4,2002-01-08,0.0,0.0,0.001965


### Step 4: Acquire forex market (USD/EUR, USD/JPY, USD/AUD, etc.) daily performance.  

In [262]:
forex_data = pd.read_csv('/content/other_data/forex.csv')

forex_data = forex_data[['date', 'USDGBP', 'USDEUR', 'USDJPY', 'USDHKD', 'USDAUD', 'USDCAD']]

# filling missing data
forex_data = forex_data.fillna(method='ffill')

# convert data format
forex_data['date'] = pd.to_datetime(forex_data.date)
forex_data = forex_data[forex_data['date'] < pd.Timestamp(2020,1,1)]

# calculating day-over-day % price change
forex_data['USDGBP_DoD'] = forex_data['USDGBP'].pct_change(1)
forex_data['USDEUR_DoD'] = forex_data['USDEUR'].pct_change(1)
forex_data['USDJPY_DoD'] = forex_data['USDJPY'].pct_change(1)
forex_data['USDHKD_DoD'] = forex_data['USDHKD'].pct_change(1)
forex_data['USDAUD_DoD'] = forex_data['USDAUD'].pct_change(1)
forex_data['USDCAD_DoD'] = forex_data['USDCAD'].pct_change(1)
forex_data = forex_data[['date', 'USDGBP_DoD', 'USDEUR_DoD', 'USDJPY_DoD', 'USDHKD_DoD', 'USDAUD_DoD', 'USDCAD_DoD']]
                        
forex_data.head()

Unnamed: 0,date,USDGBP_DoD,USDEUR_DoD,USDJPY_DoD,USDHKD_DoD,USDAUD_DoD,USDCAD_DoD
0,2002-01-02,,,,,,
1,2002-01-03,0.005496,-0.003692,0.000304,-2.6e-05,-0.000412,0.001317
2,2002-01-04,-0.006185,0.009219,-0.004331,3.8e-05,-0.011121,-0.000626
3,2002-01-07,0.004921,0.002418,-0.001908,7.7e-05,0.004634,-0.000689
4,2002-01-08,0.00072,0.003484,0.014221,1.3e-05,-0.010158,0.004704


### Step 5: Acquire economic data, including monthly unemployment rate, quarterly yoy GDP growth.  

In [263]:
gdp_data = pd.read_csv('/content/other_data/gdp.csv')

gdp_data = gdp_data[['date', 'real_gdp']]

# filling missing data
gdp_data = gdp_data.fillna(method='ffill')

# convert data format
gdp_data['date'] = pd.to_datetime(gdp_data.date)
gdp_data = gdp_data[gdp_data['date'] < pd.Timestamp(2020,1,1)]
gdp_data['real_gdp_QoQ'] = gdp_data['real_gdp'].pct_change(1)
gdp_data = gdp_data.drop(columns=['real_gdp'])

# add quarter column
gdp_data['quarter'] = gdp_data['date'].dt.to_period('Q')

gdp_data.head()

Unnamed: 0,date,real_gdp_QoQ,quarter
0,2002-03-31,,2002Q1
1,2002-06-30,0.006159,2002Q2
2,2002-09-30,0.004029,2002Q3
3,2002-12-31,0.001308,2002Q4
4,2003-03-31,0.005166,2003Q1


### Step 5: Acquire monthly unemployment data.

In [264]:
unemployment_data = pd.read_csv('/content/other_data/unemployment.csv')

unemployment_data = unemployment_data[['date', 'unemployment']]

# filling missing data
unemployment_data = unemployment_data.fillna(method='ffill')

# convert data format
unemployment_data['date'] = pd.to_datetime(unemployment_data.date)
unemployment_data = unemployment_data[unemployment_data['date'] < pd.Timestamp(2020,1,1)]
unemployment_data['unemployment_MoM'] = unemployment_data['unemployment'].pct_change(1)
unemployment_data = unemployment_data.drop(columns=['unemployment'])

# add month column
unemployment_data['month'] = unemployment_data['date'].dt.to_period('M')

unemployment_data.head()

Unnamed: 0,date,unemployment_MoM,month
0,2002-01-31,,2002-01
1,2002-02-28,0.0,2002-02
2,2002-03-31,0.0,2002-03
3,2002-04-30,0.035088,2002-04
4,2002-05-31,-0.016949,2002-05


### Step 6: Merge all other data into a single dataframe.  

In [265]:
all_other_data = stockindex_data.merge(tbond_data, left_on='date', right_on='date', how='left')
all_other_data = all_other_data.merge(forex_data, left_on='date', right_on='date', how='left')
all_other_data = all_other_data.merge(gdp_data, left_on='quarter', right_on='quarter', how='left')
all_other_data = all_other_data.merge(unemployment_data, left_on='month', right_on='month', how='left')

all_other_data = all_other_data.drop(columns=['date_x', 'date_y'])
all_other_data = all_other_data.dropna()

all_other_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4438 entries, 63 to 4682
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   quarter           4438 non-null   period[Q-DEC] 
 1   month             4438 non-null   period[M]     
 2   FTSE100_DoD       4438 non-null   float64       
 3   DAX_DoD           4438 non-null   float64       
 4   CAC_DoD           4438 non-null   float64       
 5   Nikkei225_DoD     4438 non-null   float64       
 6   Hang_Seng_DoD     4438 non-null   float64       
 7   SHSZ300_DoD       4438 non-null   float64       
 8   3M_DoD            4438 non-null   float64       
 9   1Y_DoD            4438 non-null   float64       
 10  10Y_DoD           4438 non-null   float64       
 11  USDGBP_DoD        4438 non-null   float64       
 12  USDEUR_DoD        4438 non-null   float64       
 13  USDJPY_DoD        4438 non-null   float64       
 14  USDHKD_DoD        4438 

# Section 5: Combine stock price with all other data to create mega_data.  

### Step 1: Prepare stock data for integration.  

In [266]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

price_window = Window.partitionBy("symbol").orderBy("symbol")
stock_data_2002_2019_sdf = stock_data_2002_2019_sdf.withColumn("prev_price", F.lead(stock_data_2002_2019_sdf.adjusted_close).over(price_window))
stock_data_2002_2019_sdf = stock_data_2002_2019_sdf.withColumn("price_DoD", F.when(F.isnull(stock_data_2002_2019_sdf.adjusted_close - stock_data_2002_2019_sdf.prev_price), 0).otherwise((stock_data_2002_2019_sdf.adjusted_close - stock_data_2002_2019_sdf.prev_price)/stock_data_2002_2019_sdf.prev_price))

stock_data_2002_2019_sdf.show()


Py4JError: ignored

### Step 2: Integration by joining stock data with all other data.  

In [None]:
all_other_data_sdf = spark.createDataFrame(all_other_data) 

stock_data_2002_2019_sdf.createOrReplaceTempView("stock_data_0219_596")
all_other_data_sdf.createOrReplaceTempView("all_other_data")

mega_data_sdf = spark.sql(
    """
    SELECT stock_data_0219_596.symbol, stock_data_0219_596.timestamp_as_dt, stock_data_0219_596.adjusted_close, stock_data_0219_596.price_DoD, all_other_data.*
    FROM stock_data_0219_596
    LEFT JOIN all_other_data
      ON stock_data_0219_596.timestamp_as_dt = all_other_data.date;
    """
)

mega_data_sdf.show()


# Section 6: Run Linear Regression model on mega data.  

### Step 1: Split the data into features and label

In [None]:
from pyspark.ml.feature import StringIndexer, VectorAssembler

all_columns = [colName for colName in mega_data_sdf.columns]
drop_columns = ['symbol', 'timestamp_as_dt', 'adjusted_close', 'price_DoD', 'quarter', 'month', 'date']
columns_to_use = [i for i in all_columns if i not in drop_columns]

assembler = VectorAssembler(inputCols=columns_to_use, outputCol="features", handleInvalid="skip")
featured_sdf = assembler.transform(mega_data_sdf)

from pyspark.ml import Pipeline

mega_data_sdf = mega_data_sdf.drop("features")
pipeline = Pipeline(stages=[assembler])
model = pipeline.fit(mega_data_sdf)
modified_data_sdf = model.transform(mega_data_sdf)
modified_data_sdf.show(5)

train_sdf, test_sdf = modified_data_sdf.randomSplit([0.8, 0.2])


### Step 2: Run linear regression model on spark

In [None]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol='features', labelCol='price_DoD')
lr_model = lr.fit(train_sdf)

trainingSummary = lr_model.summary

print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)


### Step3: Evaluate linear regression model.  

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from sklearn.metrics import mean_squared_error

predictions = lr_model.transform(test_sdf)

# Compute root mean squared error on the test set
y_test = predictions.select(col("label")).collect()
y_pred = predictions.select(col("prediction")).collect()
test_rmse_orig = mean_squared_error(y_test, y_pred, squared=False)
print(test_rmse_orig)


# Section 7: Construct our portfolio with GLM.  

### Step 0: Set up assumptions.  

In [235]:
# Initiating cash balance and portfolio

cash_balance = 1000000
portfolio_value = 0
valuation_mark = cash_balance + portfolio_value

portfolio = {
  "ADP": 0
}

num_stock_to_buy = 20
num_stock_to_sell = 50      # only if it's in our portfolio


### Step 1: Predict stock price using our model.  

In [267]:
# Run model to get projected share price change for each stock during 2018-2019.  

real_world_sdf = spark.sql(
    """
    SELECT stock_data_0219_596.symbol, stock_data_0219_596.timestamp_as_dt, stock_data_0219_596.adjusted_close, stock_data_0219_596.price_DoD, all_other_data.*
    FROM stock_data_0219_596
    LEFT JOIN all_other_data
      ON stock_data_0219_596.timestamp_as_dt = all_other_data.date
    WHERE stock_data_0219_596.timestamp_as_dt > pd.Timestamp(2017,12,31);
    """
)

featured_sdf = assembler.transform(real_world_sdf)
real_world_predictions = lr_model.transform(featured_sdf)
real_world_predictions.createOrReplaceTempView("real_world_predictions")


# Make prediction and select top 20 outperforming stocks and bottom 50 underperforming stocks.  
# this shall return two lists: list of stock to buy and list of stock to sell for each day. 

starting_date = pd.Timestamp(2018, 1, 1)
numdays = 500
date_list = [starting_date + datetime.timedelta(days=x) for x in range(numdays)]
list_of_stock_to_buy = {pd.Timestamp(2017, 12, 31): []}
list_of_stock_to_sell = {pd.Timestamp(2017, 12, 31): []}

for trading_date in date_list: 
  single_day_prediction_sdf = spark.sql(
      """
      SELECT *
      FROM real_world_predictions
      WHERE date == trading_date
      ORDER BY prediction DESC
      """
  )
  list_of_stock_to_buy[trading_date] = single_day_prediction_sdf.loc[0:19]['symbol']
  list_of_stock_to_sell[trading_date] = single_day_prediction_sdf.loc[-50:]['symbol']


Py4JError: ignored

### Step 2: Do daily stock trades and update portfolio

In [256]:
def get_share_price(symbol, date):
    stock_df = pd.read_csv(os.path.join('/content/stock_data/', symbol))
    stock_df['date'] = pd.to_datetime(stock_df.timestamp)
    result = len(stock_df.loc[stock_df.date == date]['adjusted_close'])
    if result == 0: 
      date = date - datetime.timedelta(days=1)
      price = get_share_price(symbol, date)
    if result == 1:
      price = stock_df.loc[stock_df.date == date]['adjusted_close'].iloc[0]
    return price

def get_SnPindex(date):
    stock_df = pd.read_csv(os.path.join('/content/other_data/stockindex.csv'))
    stock_df['date'] = pd.to_datetime(stock_df.date)
    result = len(stock_df.loc[stock_df.date == date]['S&P'])
    if result == 0: 
      date = date - datetime.timedelta(days=1)
      price = get_SnPindex(date)
    if result == 1:
      price = stock_df.loc[stock_df.date == date]['S&P'].iloc[0]
    return price

def trade_stock(stock_to_buy, stock_to_sell, date):
    cash_balance = 0
    for symbol in stock_to_sell: 
        price = get_share_price(symbol+".csv", date)
        volume_to_sell = portfolio[symbol]
        proceeds = volume_to_sell * price
        cash_balance = cash_balance + proceeds
    amount_to_buy_each = cash_balance / len(stock_to_buy)
    for symbol in stock_to_buy: 
        price = get_share_price(symbol+".csv", date)
        volume_to_buy = amount_to_buy_each / price
        portfolio[symbol] = portfolio[symbol] + volume_to_buy
        cash_balance = cash_balance - amount_to_buy_each
    return portfolio

def calc_portfolio_value(portfolio, date): 
    portfolio_value = 0
    for symbol, volume in portfolio.items(): 
        price = get_share_price(symbol+".csv", date)
        value = price * volume
        portfolio_value = portfolio_value + value
    return portfolio_value

# Apply strategy from 2018-1-1 to 2019-12-31.  
import datetime

cash_balance = 1000000
portfolio_value = 0
valuation_mark = cash_balance + portfolio_value

starting_date = pd.Timestamp(2018, 1, 1)
numdays = 500
date_list = [starting_date + datetime.timedelta(days=x) for x in range(numdays)]
valuation_marks = {pd.Timestamp(2017, 12, 31): valuation_mark}

for trading_date in date_list: 
    stock_to_buy = list_of_stock_to_buy[trading_date]      
    stock_to_sell = list_of_stock_to_sell[trading_date]         
    portfolio = trade_stock(stock_to_buy, stock_to_sell, trading_date)
    portfolio_value = calc_portfolio_value(portfolio, trading_date)
    valuation_mark = cash_balance + portfolio_value
    valuation_marks[trading_date] = valuation_mark

print(valuation_marks)

SnP_marks = {pd.Timestamp(2017, 12, 31): get_SnPindex(pd.Timestamp(2017, 12, 31))}
for trading_date in date_list:
    SnP_mark = get_SnPindex(trading_date)
    SnP_marks[trading_date] = SnP_mark
print(SnP_marks)

final_portfolio_return = valuation_marks[list(valuation_marks)[-1]] / valuation_marks[list(valuation_marks)[0]]
benchmark_return = SnP_marks[list(SnP_marks)[-1]] / SnP_marks[list(SnP_marks)[0]]

print("Our model generates ", final_portfolio_return, "over testing period")
print("While S&P500 as benchmark generated ", benchmark_return, "over testing period")


{Timestamp('2018-12-31 00:00:00'): 1000000, Timestamp('2019-01-01 00:00:00'): 1000000.0, Timestamp('2019-01-02 00:00:00'): 1000000.0, Timestamp('2019-01-03 00:00:00'): 1000000.0, Timestamp('2019-01-04 00:00:00'): 1000000.0, Timestamp('2019-01-05 00:00:00'): 1000000.0, Timestamp('2019-01-06 00:00:00'): 1000000.0, Timestamp('2019-01-07 00:00:00'): 1000000.0, Timestamp('2019-01-08 00:00:00'): 1000000.0, Timestamp('2019-01-09 00:00:00'): 1000000.0, Timestamp('2019-01-10 00:00:00'): 1000000.0, Timestamp('2019-01-11 00:00:00'): 1000000.0, Timestamp('2019-01-12 00:00:00'): 1000000.0, Timestamp('2019-01-13 00:00:00'): 1000000.0, Timestamp('2019-01-14 00:00:00'): 1000000.0, Timestamp('2019-01-15 00:00:00'): 1000000.0, Timestamp('2019-01-16 00:00:00'): 1000000.0, Timestamp('2019-01-17 00:00:00'): 1000000.0, Timestamp('2019-01-18 00:00:00'): 1000000.0, Timestamp('2019-01-19 00:00:00'): 1000000.0, Timestamp('2019-01-20 00:00:00'): 1000000.0, Timestamp('2019-01-21 00:00:00'): 1000000.0, Timestamp('