<a href="https://colab.research.google.com/github/stevegbrooks/big-portfolio-learner/blob/time-series-analysis/notebooks/step2_time_series_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CIS 545 Final Project

## Big Portfolio Learner: Time Series Analysis

### Team members: Steven Brooks & Chenlia Xu

In [1]:
import random
import numpy as np 
import json
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from datetime import datetime
import glob
import seaborn as sns
import re
import os

In [2]:
%%capture
## If boto3 not already installed uncomment the following:
!pip3 install boto3

In [3]:
import boto3
from botocore import UNSIGNED

from botocore.config import Config

# Unsigned S3 client: requests are sent without AWS credentials.
s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED))

# Download both zipped datasets from the project bucket into the working directory.
project_bucket = s3.Bucket('cis545project')
for remote_key, local_name in (
    ('data/stock_data.zip', 'stock_data.zip'),
    ('data/technical_data.zip', 'technical_data.zip'),
):
    project_bucket.download_file(remote_key, local_name)

In [14]:
%%capture

if not os.path.exists("stock_data"):
  os.makedirs("stock_data")
!unzip /content/stock_data.zip -d /content/stock_data
!rm -f stock_data/.gitempty

if not os.path.exists("technical_data"):
  os.makedirs("technical_data")
!unzip /content/technical_data.zip -d /content/technical_data
!rm -f technical_data/.gitempty

# Setup for Spark

In [15]:
%%capture

!wget -nc https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz

!apt install libkrb5-dev
!pip install findspark
!pip install sparkmagic
!pip install pyspark
!pip install pyspark --user

!apt update
!apt install gcc python-dev libkrb5-dev

In [16]:
import os

import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

import os

spark = SparkSession.builder.getOrCreate()

%load_ext sparkmagic.magics

os.environ['SPARK_HOME'] = '/content/spark-3.1.2-bin-hadoop3.2'
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

try:
    if(spark == None):
        spark = SparkSession.builder.appName('Initial').getOrCreate()
        sqlContext=SQLContext(spark)
except NameError:
    spark = SparkSession.builder.appName('Initial').getOrCreate()
    sqlContext=SQLContext(spark)

The sparkmagic.magics extension is already loaded. To reload it, use:
  %reload_ext sparkmagic.magics


# Setup for Darts (Time Series Modeling)

In [17]:
%%capture
!pip install 'u8darts[all]'

In [18]:
import torch

from darts import TimeSeries
from darts.utils.timeseries_generation import gaussian_timeseries, linear_timeseries, sine_timeseries
from darts.models import RNNModel, TCNModel, TransformerModel, NBEATSModel, BlockRNNModel
from darts.metrics import mape, smape
from darts.dataprocessing.transformers import Scaler
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.datasets import AirPassengersDataset, MonthlyMilkDataset

# Fix the PyTorch and NumPy random seeds so repeated runs give the same results.
torch.manual_seed(1)
np.random.seed(1)

# Load the stock data

In [19]:
# Read every CSV under stock_data/ into a single Spark DataFrame.
# The first row of each file is treated as the header and column types are
# inferred from the data.
stock_data_sdf = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .option('sep', ',')
    .load('stock_data/*.csv')
)

# Section 1: Train Test Split

We will train our models on data from 2002 through 2017. The validation set is the year 2018, and the test set is the year 2019.

In [20]:
# Load the AirPassengers and MonthlyMilk datasets imported from darts.datasets.
# NOTE(review): these are not the stock data loaded above — presumably placeholders
# taken from the Darts examples; confirm whether they belong in this section.
series_air = AirPassengersDataset().load()
series_milk = MonthlyMilkDataset().load()
# Bare expression: shown as the cell output when it is the last line of the cell.
series_air