<a href="https://colab.research.google.com/github/stevegbrooks/big-portfolio-learner/blob/step1b/step1b_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CIS 545 Final Project

## Big Portfolio Learner: Clean Up Technical Indicators Data

### Team members: Steven Brooks & Chenlia Xu

In [1]:
import random
import numpy as np 
import json
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from datetime import datetime
import glob
import seaborn as sns
import re
import os

In [2]:
%%capture
## Install boto3, which the next cell uses to download the project data from S3.
## %%capture hides pip's output; comment out the install line if boto3 is already available.
!pip3 install boto3

In [3]:
import boto3
from botocore import UNSIGNED

from botocore.config import Config

# Requests are sent unsigned (signature_version=UNSIGNED), i.e. without credentials.
s3 = boto3.resource('s3', config=Config(signature_version=UNSIGNED))

# Fetch both project archives from the bucket into the working directory.
project_bucket = s3.Bucket('cis545project')
for object_key, local_name in (
    ('data/stock_data.zip', 'stock_data.zip'),
    ('data/technical_data.zip', 'technical_data.zip'),
):
    project_bucket.download_file(object_key, local_name)

In [4]:
%%capture

stock_dir = "stock_data"
if not os.path.exists(stock_dir):
  os.makedirs(stock_dir)
!unzip /content/stock_data.zip -d /content/$stock_dir
!rm -f $stock_dir/.gitempty

tech_dir = "technical_data"
if not os.path.exists(tech_dir):
  os.makedirs(tech_dir)
!unzip /content/technical_data.zip -d /content/$tech_dir
!rm -f $tech_dir/.gitempty

# Set Up Spark Session

In [8]:
%%capture

!apt install libkrb5-dev
!pip install findspark
!pip install sparkmagic
!pip install pyspark
!pip install pyspark --user
!pip install seaborn --user
!pip install imageio --user
!pip install folium --user

!apt update
!apt install gcc python-dev libkrb5-dev

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

import os

# Start a Spark session, or reuse the one already running in this kernel (getOrCreate).
spark = SparkSession.builder.getOrCreate()

In [10]:
# Load the sparkmagic IPython extension (the package is pip-installed in an earlier cell).
# NOTE(review): no sparkmagic magic is used in the cells visible here - confirm this is still needed.
%load_ext sparkmagic.magics



In [11]:
#graph section
import networkx as nx
# SQLite RDBMS
import sqlite3
# Parallel processing
# import swifter
import pandas as pd
# NoSQL DB
from pymongo import MongoClient
from pymongo.errors import DuplicateKeyError, OperationFailure

import os
# Point Spark at a specific Spark distribution and JVM (Colab-style absolute paths).
# NOTE(review): no cell visible here downloads spark-2.4.5-bin-hadoop2.7 into /content
# (pyspark is pip-installed), and a session has already been created before these
# variables are set - confirm they are still needed and that the paths exist.
os.environ['SPARK_HOME'] = '/content/spark-2.4.5-bin-hadoop2.7'
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
import pyspark
from pyspark.sql import SQLContext

# Reuse the SparkSession created earlier in the notebook when there is one;
# otherwise (name missing or set to None) start a session named 'Initial'.
try:
    spark
except NameError:
    spark = None

if spark is None:
    spark = SparkSession.builder.appName('Initial').getOrCreate()

# Define sqlContext on every path. Previously it was only created when no
# session existed yet, so it was undefined in the normal top-to-bottom run.
# SQLContext takes the SparkContext first and the session as `sparkSession`.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

We will load the data into the Spark context here.

In [12]:
# Read every stock-price CSV into one Spark DataFrame; the first row of each
# file is the header and column types are inferred from the data.
stock_data_sdf = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true', sep=',')
    .load('stock_data/*.csv')
)

First, we'll set up the Spark DataFrame for stock prices using the work in `step1a`.

In [24]:
from pyspark.sql.functions import log, to_timestamp
from pyspark.sql.functions import year, month, date_format


# Analysis window (inclusive). The required number of calendar years is derived
# from the bounds so the "data in every year" check cannot drift from the filter.
ANALYSIS_START_YEAR = 2002
ANALYSIS_END_YEAR = 2019
N_ANALYSIS_YEARS = ANALYSIS_END_YEAR - ANALYSIS_START_YEAR + 1  # 18 years

# Symbols excluded as outliers.
stocks_to_remove = ['DCTH', 'BRK-A', 'AIKI']

# Add a parsed timestamp, the calendar year and the log of the adjusted close,
# then keep only rows inside the analysis window.
stock_data_sdf = stock_data_sdf.withColumn("timestamp_as_dt", to_timestamp(stock_data_sdf.timestamp, 'yyyy-MM-dd'))
stock_data_sdf = stock_data_sdf.withColumn("year", year('timestamp_as_dt'))
stock_data_sdf = stock_data_sdf.withColumn("adjusted_close_log", log("adjusted_close"))
stock_data_sdf = stock_data_sdf.filter(
    (stock_data_sdf["year"] >= ANALYSIS_START_YEAR)
    & (stock_data_sdf["year"] <= ANALYSIS_END_YEAR)
)

# Count, per symbol, how many distinct years have at least one row.
count_by_symbol_year_sdf = stock_data_sdf.groupBy(["symbol", "year"]).count()
count_years_by_symbol_sdf = count_by_symbol_year_sdf.groupBy(["symbol"]).count()

### Just grab stocks that have data in each year of the analysis window
### AND remove the three outliers
stocks_with_all_analysis_yrs_sdf = count_years_by_symbol_sdf.filter(
    count_years_by_symbol_sdf["count"] == N_ANALYSIS_YEARS
)
stocks_with_all_analysis_yrs_sdf = stocks_with_all_analysis_yrs_sdf.filter(
    ~stocks_with_all_analysis_yrs_sdf["symbol"].isin(stocks_to_remove)
)

# Both views are queried here and by later cells, so they keep their names.
stock_data_sdf.createOrReplaceTempView("stock_data")
stocks_with_all_analysis_yrs_sdf.createOrReplaceTempView("stocks_with_all_analysis_yrs")

# Price rows restricted to the symbols that passed the checks above.
stock_data_2002_2019_sdf = spark.sql(
    """
    SELECT *
    FROM stock_data
    WHERE symbol IN (SELECT symbol FROM stocks_with_all_analysis_yrs)
    """
)

In [17]:
from pyspark.sql.types import StructField, DoubleType, StructType, StringType

# Explicit schema for the technical-indicator CSVs: two string columns followed
# by five double columns, all nullable.
string_columns = ['symbol', 'timestamp']
double_columns = ['EMA', 'MACD', 'SlowD', 'RSI', 'Real_Middle_Band']

structs = (
    [StructField(column, StringType(), True) for column in string_columns]
    + [StructField(column, DoubleType(), True) for column in double_columns]
)

schema = StructType(structs)

# Read every technical-data CSV with that schema; the first row of each file is a header.
technical_data_sdf = (
    spark.read
    .format('csv')
    .schema(schema)
    .options(header='true', sep=',')
    .load('technical_data/*.csv')
)

Then we will reduce the technical data set to just those stocks that match up with the first dataset above.

In [18]:
# Add a parsed timestamp and calendar year, then keep rows from 2002 through 2019.
technical_data_sdf = (
    technical_data_sdf
    .withColumn("timestamp_as_dt", to_timestamp("timestamp", 'yyyy-MM-dd'))
    .withColumn("year", year("timestamp_as_dt"))
    .filter("year >= 2002 AND year <= 2019")
)

technical_data_sdf.createOrReplaceTempView("technical_data")

# Keep only the symbols listed in the stocks_with_all_analysis_yrs view.
technical_symbol_query = """
    SELECT *
    FROM technical_data
    WHERE symbol IN (SELECT symbol FROM stocks_with_all_analysis_yrs)
    """
technical_data_2002_2019_sdf = spark.sql(technical_symbol_query)

technical_data_2002_2019_sdf.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- EMA: double (nullable = true)
 |-- MACD: double (nullable = true)
 |-- SlowD: double (nullable = true)
 |-- RSI: double (nullable = true)
 |-- Real_Middle_Band: double (nullable = true)
 |-- timestamp_as_dt: timestamp (nullable = true)
 |-- year: integer (nullable = true)



Join the two datasets and split them into train, validation, and test sets.

In [28]:
# Expose both filtered DataFrames to Spark SQL under their "cleaned_*" view names.
for _view_name, _view_sdf in (
    ("cleaned_technical_data", technical_data_2002_2019_sdf),
    ("cleaned_stock_data", stock_data_2002_2019_sdf),
):
    _view_sdf.createOrReplaceTempView(_view_name)

In [34]:
# Build the "cleaned_technical_data" view: the technical indicators joined with
# the traded volume from the stock data, matched on symbol and timestamp.
#
# The query reads from views registered here under separate names instead of
# from "cleaned_technical_data"/"cleaned_stock_data" themselves. Replacing a
# view with a query over that same view made this cell fail on re-run (once
# "cleaned_stock_data" has been narrowed it no longer has `volume`), and newer
# Spark releases are expected to reject such a replacement as a recursive view.
technical_data_2002_2019_sdf.createOrReplaceTempView("technical_data_2002_2019")
stock_data_2002_2019_sdf.createOrReplaceTempView("stock_data_2002_2019")

spark.sql(
    """
    SELECT T.symbol, T.timestamp_as_dt, T.EMA, T.MACD, T.SlowD, T.RSI, T.Real_Middle_Band, S.volume
    FROM technical_data_2002_2019 T JOIN stock_data_2002_2019 S
    ON T.symbol = S.symbol AND T.timestamp_as_dt = S.timestamp_as_dt
    """
).createOrReplaceTempView("cleaned_technical_data")

In [37]:
# Narrow "cleaned_stock_data" to the key columns plus the log adjusted close.
# The columns are selected from the source DataFrame rather than from the
# "cleaned_stock_data" view itself, so the view is never redefined in terms of
# itself and the cell gives the same result however many times it is run.
stock_data_2002_2019_sdf.select(
    "symbol", "timestamp_as_dt", "adjusted_close_log"
).createOrReplaceTempView("cleaned_stock_data")