In [39]:
# Importing necessary libraries
import pandas as pd               
import numpy as np                
import matplotlib.pyplot as plt   
import os                         
import joblib                     
import hopsworks                  
import re
from hsfs.client.exceptions import RestAPIError
#Making the notebook able to fetch from the .env file
from dotenv import load_dotenv

load_dotenv()
# Check if the feature group already exists before creating it
def create_stocks_feature_view(fs, version, fallback_version=7):
    """Create the 'amd_stocks_fv' feature view on top of the 'amd_stock' feature group.

    The feature group is looked up at ``version``. If that lookup fails with
    error code 270089, the group is looked up again at ``fallback_version``.
    In both cases a feature view selecting ``date``, ``open`` and ``close``
    (label: ``open``) is created and returned, so the function never falls
    through and returns ``None``.

    NOTE: this notebook defines ``create_stocks_feature_view`` in more than
    one cell; the definition executed last is the one the kernel uses.

    Args:
        fs: Feature store handle providing ``get_feature_group`` and
            ``create_feature_view``.
        version: Preferred version of the 'amd_stock' feature group.
        fallback_version: Version used when ``version`` cannot be fetched
            (defaults to 7, the value previously hard-coded here).

    Returns:
        Tuple ``(feature_view, amd_fg)``.

    Raises:
        RestAPIError: Re-raised unchanged for any error code other than 270089,
            or when the fallback lookup / feature view creation fails.
    """
    try:
        amd_fg = fs.get_feature_group('amd_stock', version=version)
        print(f"Feature group 'amd_stock' with version {version} already exists.")
    except RestAPIError as e:
        # 270089 is treated here as "feature group not found" -- TODO confirm
        # against the Hopsworks error-code list. getattr guards against
        # RestAPIError variants that do not expose `error_code`.
        if getattr(e, 'error_code', None) != 270089:
            raise
        # Nothing is created in this branch; an older, existing version is reused.
        print(
            f"Feature group 'amd_stock' with version {version} not found; "
            f"falling back to version {fallback_version}."
        )
        amd_fg = fs.get_feature_group('amd_stock', version=fallback_version)

    # Defining the query
    ds_query = amd_fg.select(['date', 'open', 'close'])
    # Creating the feature view
    feature_view = fs.create_feature_view(
        name='amd_stocks_fv',
        query=ds_query,
        labels=['open']
    )
    return feature_view, amd_fg



In [40]:
# Log in to Hopsworks using the key stored in the `hopsworks_api` environment
# variable, then fetch the project's feature store handle.
api_key = os.getenv('hopsworks_api')
project = hopsworks.login(api_key_value=api_key)
fs = project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/564374
Connected. Call `.close()` to terminate connection gracefully.


In [41]:
#Defining the function to create feature view

def create_stocks_feature_view(fs, version):
    """Create the 'amd_stocks_fv' feature view from the 'amd_stock' feature group.

    NOTE: this notebook defines ``create_stocks_feature_view`` in more than
    one cell; the definition executed last is the one the kernel uses.

    Args:
        fs: Feature store handle providing ``get_feature_group`` and
            ``create_feature_view``.
        version: Version of the 'amd_stock' feature group to load. This was
            previously ignored in favour of a hard-coded 8.

    Returns:
        Tuple ``(feature_view, amd_fg)``.
    """
    # Loading in the feature group at the version the caller asked for
    amd_fg = fs.get_feature_group('amd_stock', version=version)

    # Defining the query
    # NOTE(review): the feature listing printed later in this notebook shows
    # names such as 'f_1__open' / 'f_4__close' for version 8 -- confirm that
    # 'open' and 'close' exist in the requested version.
    ds_query = amd_fg.select(['date', 'open', 'close'])

    # Creating the feature view
    feature_view = fs.create_feature_view(
        name='amd_stocks_fv',
        query=ds_query,
        labels=['open']
    )

    return feature_view, amd_fg

In [42]:
def create_stocks_feature_view(fs, version, csv_path='AMD_stock_prices.csv'):
    """Get or create the 'amd_stock' feature group and 'amd_stocks_fv' feature view.

    The feature group is fetched at ``version``; if that raises
    ``RestAPIError`` it is created and filled from ``csv_path``. The feature
    view is then fetched at the same version, or created from all features of
    the group with ``f_4__close`` as the label.

    The CSV is only read when the feature group has to be created, so the
    function also works when both objects already exist and the file is absent.

    NOTE: this notebook defines ``create_stocks_feature_view`` in more than
    one cell; the definition executed last is the one the kernel uses.

    Args:
        fs: Feature store handle providing ``get_feature_group``,
            ``create_feature_group``, ``get_feature_view`` and
            ``create_feature_view``.
        version: Version used for both the feature group and the feature view.
        csv_path: Path of the price CSV used to populate a new feature group.

    Returns:
        Tuple ``(feature_view, amd_fg)``.
    """
    # Create or get the feature group
    try:
        amd_fg = fs.get_feature_group('amd_stock', version=version)
        print(f"Feature group 'amd_stock' with version {version} already exists.")
    except RestAPIError:
        # NOTE(review): every RestAPIError is treated as "does not exist";
        # other failures will surface again from the create call below.
        print(f"Creating feature group 'amd_stock' with version {version}.")
        amd_fg = fs.create_feature_group(
            name='amd_stock',
            version=version,
            description='AMD stock dataset',
            primary_key=['ticker'],
            event_time='date',
            online_enabled=False
        )
        # Load the data only now that it is actually needed, then insert it
        amd_df = _load_amd_prices(csv_path)
        amd_fg.insert(amd_df, write_options={"wait_for_job": True})
        print("Data inserted into the feature group.")

    # Create or get the feature view
    try:
        feature_view = fs.get_feature_view('amd_stocks_fv', version=version)
        print(f"Feature view 'amd_stocks_fv' with version {version} already exists.")
    except RestAPIError:
        print(f"Creating feature view 'amd_stocks_fv' with version {version}.")
        feature_view = fs.create_feature_view(
            name='amd_stocks_fv',
            version=version,
            description='Feature view for AMD stock data',
            # 'f_4__close' is what _clean_column_name yields for a '4. close' header
            labels=['f_4__close'],  # Adjust label column as needed
            query=amd_fg.select_all()
        )
        print("Feature view created.")

    return feature_view, amd_fg


def _clean_column_name(col):
    """Lowercase ``col``, replace characters outside [a-z0-9_] with '_', and
    prefix 'f_' when the result does not start with a letter."""
    col = re.sub(r'[^a-z0-9_]', '_', col.lower())
    if not re.match(r'^[a-z]', col):
        col = 'f_' + col
    return col


def _load_amd_prices(csv_path):
    """Read the price CSV and return a frame with cleaned column names, a
    datetime 'date' column, a 'ticker' column and numeric value columns."""
    amd_df = pd.read_csv(csv_path)
    amd_df['date'] = pd.to_datetime(amd_df['date'])

    # Ensure 'ticker' column exists
    if 'ticker' not in amd_df.columns:
        amd_df['ticker'] = 'AMD'

    amd_df.columns = [_clean_column_name(col) for col in amd_df.columns]

    # Everything except the date and ticker is coerced to numeric;
    # unparseable values become NaN (errors='coerce').
    for col in amd_df.columns:
        if col not in ('date', 'ticker'):
            amd_df[col] = pd.to_numeric(amd_df[col], errors='coerce')
    return amd_df

# Version shared by the feature group and the feature view
version = 8

feature_view, amd_fg = create_stocks_feature_view(fs, version=version)

Feature group 'amd_stock' with version 8 already exists.
Feature view 'amd_stocks_fv' with version 8 already exists.


In [43]:
# Verify the feature view: list the name and type of every entry in
# `feature_view.features`.
print("Features in the feature view:")
for feature in feature_view.features:
    print(f"- {feature.name} ({feature.type})")


# NOTE(review): only this header is printed; nothing in this cell fetches or
# prints rows, so no sample data follows it -- TODO add a preview or drop the
# header.
print("Sample data from the feature view:")



Features in the feature view:
- date (timestamp)
- f_1__open (double)
- f_2__high (double)
- f_3__low (double)
- f_4__close (double)
- f_5__volume (double)
- ticker (string)
Sample data from the feature view:


In [45]:
#Defining a function to get fixed data from the feature view
from OPS.preprocessing_stocks import extract_business_day


def fix_data_from_feature_view(df,start_date,end_date):
    """Crop ``df`` to a date range, drop non-business days and return the result.

    Args:
        df: DataFrame with a ``date`` column that ``pd.to_datetime`` can parse.
        start_date: Inclusive lower bound of the date range.
        end_date: Inclusive upper bound of the date range.

    Returns:
        The filtered DataFrame. The original version only printed it and
        returned ``None``.
    """
    # Sort chronologically and renumber the rows 0..n-1 without keeping the
    # old index as a column.
    df = df.sort_values("date").reset_index(drop=True)

    # Parse the date column once and build a boolean mask for rows that fall
    # within the (inclusive) date range.
    dates = pd.to_datetime(df['date'])
    mask = (dates >= pd.to_datetime(start_date)) & (dates <= pd.to_datetime(end_date))
    len_df = df.shape
    df = df[mask] # Use the boolean mask to filter the DataFrame
    print('From shape {} to {} after cropping to given date range: {} to {}'.format(len_df,df.shape,start_date,end_date))

    # Get rid of all non-business days.
    # NOTE(review): assumes the second value returned by extract_business_day
    # holds one flag per remaining row of `df` -- confirm against
    # OPS.preprocessing_stocks; a length mismatch raises in the indexing below.
    _, is_open = extract_business_day(start_date,end_date)
    is_closed = [not flag for flag in is_open] # Invert the mask so it selects the non-business days

    filtered_df = df.drop(df[is_closed].index) # Drop the rows selected by the inverted mask
    print('From shape {} to {} after removing non-business days'.format(df.shape,filtered_df.shape))
    print(filtered_df)
    return filtered_df
    



            1. open  2. high    3. low  4. close   5. volume
date                                                        
2024-10-11  164.185   169.35  163.0101    167.89  42136175.0
2024-10-10  169.760   172.01  162.0000    164.18  75113590.0
2024-10-09  174.050   174.05  169.5500    171.02  33890735.0
2024-10-08  171.750   173.60  170.0000    172.80  34730152.0
2024-10-07  171.080   172.41  168.2100    170.97  38379930.0
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6277 entries, 2024-10-11 to 1999-11-01
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   1. open    6277 non-null   float64
 1   2. high    6277 non-null   float64
 2   3. low     6277 non-null   float64
 3   4. close   6277 non-null   float64
 4   5. volume  6277 non-null   float64
dtypes: float64(5)
memory usage: 294.2 KB
2022-10-17 00:00:00
2024-10-11 00:00:00
