# Loading and splitting the data

In this notebook we first load the normalised data and split it into train, test and label sets that can be passed into our model.

In [2]:
import os

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# load the dataset from the working directory into a dataframe
df = pd.read_csv('normalised_data.csv')

In [5]:
# summary statistics per column; `count` is the number of non-null values,
# so columns with a lower count than the others contain missing values
df.describe()

Unnamed: 0,shangai,btc,crude oil,dax,euro,gold,silver,spy,ftse,hsi,...,MA200,stochRSI,RSI,btc_std_dev,std_dif,conf_int,hashrate,difficulty,transactions,t_cost
count,2778.0,2778.0,2778.0,1665.0,2778.0,2778.0,2778.0,2778.0,2778.0,2778.0,...,2691.0,2691.0,2691.0,2691.0,2691.0,2691.0,2778.0,2778.0,2778.0,2778.0
mean,0.270252,0.15485,0.701774,0.568666,0.394548,0.344136,0.253115,0.427729,0.551747,0.465237,...,0.245285,0.532964,0.583096,0.256707,0.102468,0.097318,0.1412163,0.1546927,0.303778,0.154293
std,0.160729,0.20994,0.156021,0.157132,0.25022,0.208389,0.189366,0.254075,0.232827,0.184326,...,0.344058,0.355758,0.19565,0.347463,0.160511,0.14965,0.2400142,0.2641203,0.315799,0.158696
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.128674,0.002796,0.573618,0.461651,0.178477,0.192765,0.121774,0.197647,0.346391,0.339979,...,0.001123,0.202171,0.449066,0.001551,0.000504,0.000889,2.731303e-07,2.424056e-07,0.024813,0.038594
50%,0.289271,0.028008,0.655549,0.560341,0.31291,0.267663,0.165405,0.407019,0.572176,0.431315,...,0.045246,0.555324,0.572004,0.062386,0.013058,0.017446,0.003126407,0.003292851,0.157937,0.075552
75%,0.370071,0.333326,0.861095,0.674688,0.6243,0.504672,0.344026,0.654494,0.76345,0.596733,...,0.548718,0.875468,0.727869,0.672941,0.193318,0.183024,0.2312814,0.2553552,0.560025,0.240591
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# target series: the `btc` column
labels = df['btc']

# feature frame: every remaining column, with the target dropped so that it
# is not fed into the PCA model alongside the features
# NOTE(review): df.describe() shows unequal counts per column, so some
# features contain NaNs — decide how to handle them before training
train_df = df.drop(columns=['btc'])

In [7]:
import sagemaker
from sagemaker import get_execution_role

session = sagemaker.Session() # store the current SageMaker session

# get IAM role
role = get_execution_role()
# NOTE(review): printing the role ARN stores the AWS account id in the saved
# notebook output — consider clearing this output before sharing the notebook
print(role)

arn:aws:iam::724888201472:role/service-role/AmazonSageMaker-ExecutionRole-20201115T171901


In [8]:
# get default bucket of the current SageMaker session
bucket_name = session.default_bucket()
print(bucket_name)
# emits an extra blank line after the bucket name
print()

sagemaker-eu-central-1-724888201472



In [9]:
# define location to store model artifacts
# `prefix` is the key prefix (folder) inside the bucket
prefix = 'capstone'

# S3 URI built from the bucket name and prefix, with a trailing slash
output_path='s3://{}/{}/'.format(bucket_name, prefix)

print('Training artifacts will be uploaded to: {}'.format(output_path))

Training artifacts will be uploaded to: s3://sagemaker-eu-central-1-724888201472/capstone/


In [None]:
# define a PCA model
from sagemaker import PCA

# this is current features - 1
# you'll select only a portion of these to use, later
# NOTE(review): hard-coded value — confirm it still matches the number of
# columns in train_df minus one
N_COMPONENTS=23

# NOTE(review): train_instance_count / train_instance_type look like the
# SageMaker SDK v1 argument names — confirm against the installed SDK version
pca_model = PCA(role=role,
             train_instance_count=1,
             train_instance_type='ml.c4.xlarge',
             output_path=output_path, # specified, above
             num_components=N_COMPONENTS, 
             sagemaker_session=session)


In [None]:
# convert df to np array of 32-bit floats
# NOTE(review): df.describe() reports unequal counts per column, so NaNs may
# still be present in train_df — confirm they are handled before training
train_data = train_df.values.astype('float32')

# convert to RecordSet format
record_train = pca_model.record_set(train_data)

In [None]:
%%time

# train the PCA model on the formatted data (the RecordSet built from train_data)
pca_model.fit(record_train)

In [None]:
# Get the name of the training job, it's suggested that you copy-paste
# from the notebook or from a specific job in the AWS console
# This cell needs `os` and `boto3` to be imported earlier in the notebook.

# NOTE(review): hard-coded job name — confirm it refers to the training job
# started by pca_model.fit in this notebook
training_job_name='pca-2020-10-06-16-58-17-215' # include one!

# where the model is saved, by default
model_key = os.path.join(prefix, training_job_name, 'output/model.tar.gz')
print(model_key)

# download the model archive into the working directory as model.tar.gz
boto3.resource('s3').Bucket(bucket_name).download_file(model_key, 'model.tar.gz')

# extract the archive, then unzip a file named pca_capstone
# NOTE(review): the os.system return codes are ignored, so a failed extraction
# goes unnoticed; confirm the archive really yields a file called pca_capstone
os.system('tar -zxvf model.tar.gz')
os.system('unzip pca_capstone')

In [None]:
import mxnet as mx

# loading the unzipped artifacts
# reads the file named pca_capstone from the working directory
pca_model_params = mx.ndarray.load('pca_capstone')

# what are the params
print(pca_model_params)

In [None]:
# get selected params
# the 's' and 'v' entries are converted to numpy and wrapped as dataframes
# presumably s holds the singular values and v the component vectors —
# confirm against the parameters printed above
s=pd.DataFrame(pca_model_params['s'].asnumpy())
v=pd.DataFrame(pca_model_params['v'].asnumpy())

In [None]:
# Calculate the explained variance for the top n principal components
# The number of components is taken from s itself, so no global is needed
def explained_variance(s, n_top_components):
    '''Calculates the approx. data variance that n_top_components captures.
       :param s: A dataframe of singular values for top components;
           the top value is in the last row.
       :param n_top_components: An integer, the number of top components to use.
           Values larger than the number of rows in s are treated as "all".
       :return: The expected data variance covered by the n_top_components,
           as a fraction of the total (sum of squared singular values).'''

    n_components = len(s)

    # the top components are the LAST rows; clamp at 0 so that asking for more
    # components than exist selects every row instead of a negative slice
    start_idx = max(n_components - n_top_components, 0)

    # calculate approx variance
    squared = np.square(s)
    exp_variance = squared.iloc[start_idx:, :].sum() / squared.sum()

    # positional access: independent of how the single column is labelled
    return exp_variance.iloc[0]


In [None]:
def plot_variance_ncomp(s, n_top_components, cut=0.8):
    '''Plot the cumulative explained variance for 0..n_top_components components.
       :param s: A dataframe of singular values, as expected by explained_variance.
       :param n_top_components: An integer, the largest number of top components to plot.
       :param cut: Height of the horizontal reference line drawn in red.
       :return: A list with the explained variance for 0, 1, ..., n_top_components
           top components.'''
    # built fresh on every call, so re-running the cell never plots stale values
    variances = [explained_variance(s, k) for k in range(n_top_components + 1)]

    fig, ax = plt.subplots()
    ax.plot(variances)
    # explained variance is a fraction, so the y axis is fixed to [0, 1]
    ax.set_ylim(0.0, 1.0)
    ax.axhline(cut, color='red')
    ax.set_xlabel('Number of top components')
    ax.set_ylabel('Explained variance')
    plt.show()

    # explained_variance already returns a scalar, so it is printed as is
    print('Explained variance: ', explained_variance(s, n_top_components), 'with', n_top_components, 'components')
    print('Total components: ', len(s))

    return variances


# keep the curve under the existing module-level name
alt_variate = plot_variance_ncomp(s, 12, 0.9)

In [None]:
# features
# column names of train_df as an array, in column order
features_list = train_df.columns.values
print('Features: \n', features_list)

In [None]:
import seaborn as sns

def display_component(v, features_list, component_num, n_weights=10):
    '''Bar-plot the largest-magnitude feature weights of one PCA component.
       :param v: A dataframe of component weights; one column is read per component.
       :param features_list: Feature names, paired in order with the weights.
       :param component_num: Which component to show; it is counted back from
           N_COMPONENTS, so component 1 reads column N_COMPONENTS - 1 of v.
       :param n_weights: How many of the largest-magnitude weights to plot.'''

    # column of v that holds the requested component
    col_idx = N_COMPONENTS - component_num
    weights = np.squeeze(v.iloc[:, col_idx].values)

    # pair each weight with its feature name
    comps = pd.DataFrame(list(zip(weights, features_list)),
                         columns=['weights', 'features'])

    # weights can be negative or positive, so rank them by magnitude
    comps['abs_weights'] = comps['weights'].abs()
    top_weights = comps.sort_values('abs_weights', ascending=False).head(n_weights)

    # draw the signed weights on a dedicated axes
    fig, ax = plt.subplots(figsize=(10,6))
    sns.barplot(data=top_weights,
                x="weights",
                y="features",
                palette="Blues_d",
                ax=ax)
    ax.set_title("PCA Component Makeup, Component #" + str(component_num))
    plt.show()


In [None]:
# display makeup of the selected component (num=2 here, i.e. not the first one)
num=2
display_component(v, train_df.columns.values, component_num=num, n_weights=10)

In [None]:
%%time
# this takes a little while, around 7mins
# deploys the trained model behind an endpoint; the endpoint is removed again
# by the delete_endpoint cell at the end of the notebook
pca_predictor = pca_model.deploy(initial_instance_count=1, 
                              instance_type='ml.t2.medium')

In [None]:
# pass np train data to the PCA model
# sends the whole float32 training array to the deployed predictor
train_pca = pca_predictor.predict(train_data)

In [None]:
# check out the first item in the produced training features
# data_idx selects which returned record is printed
data_idx = 0
print(train_pca[data_idx])

In [None]:
# create dimensionality-reduced data
def create_transformed_df(train_pca, counties_scaled, n_top_components):
    ''' Return a dataframe of data points described by their top PCA components.
        :param train_pca: A list of pca training data, returned by a PCA model;
            each item exposes its values at label['projection'].float32_tensor.values.
        :param counties_scaled: A dataframe of the original features; only its
            index is used, to label the rows of the result.
        :param n_top_components: An integer, the number of top components to use.
            Values larger than the number of available components keep them all.
        :return: A dataframe with the index of counties_scaled and the
            n_top_components component values as columns, top component first.
     '''
    # collect every projection first and build the frame once; growing a
    # dataframe row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.0
    rows = [list(data.label['projection'].float32_tensor.values)
            for data in train_pca]
    transformed = pd.DataFrame(rows)

    # reuse the row labels of the original feature frame
    transformed.index = counties_scaled.index

    # the top components are the LAST columns; clamp at 0 so that asking for
    # more components than exist keeps every column instead of a negative slice
    start_idx = max(transformed.shape[1] - n_top_components, 0)
    transformed = transformed.iloc[:, start_idx:]

    # reverse columns so that the top component comes first
    return transformed.iloc[:, ::-1]
    

In [None]:
# specify top n
top_n = 7

# call your function and create a new dataframe
df_transformed = create_transformed_df(train_pca, train_df, n_top_components=top_n)

# add descriptive columns: c_1 ... c_<top_n>
# derived from top_n so the names always match the number of kept components
PCA_list = ['c_{}'.format(i) for i in range(1, top_n + 1)]
df_transformed.columns = PCA_list

# print result
df_transformed.head()

In [None]:
# delete predictor endpoint
# NOTE(review): `.endpoint` looks like the SageMaker SDK v1 attribute name
# (later SDK versions use `endpoint_name`) — confirm against the installed SDK
session.delete_endpoint(pca_predictor.endpoint)