Prepare data for processing


In [None]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the processed dataset; the 'Date' column is parsed and used as the index.
df = pd.read_csv(
    '../data/processed/Processed_data.csv',
    index_col='Date',
    parse_dates=True,
)

# Turn the textual sentiment label into a signed number.
# Labels that are not keys of the mapping come out of .map() as NaN.
sentiment_map = {'Bearish': -1, 'Neutral': 0, 'Bullish': 1}
df['SentimentNum'] = df['Sentiment'].map(sentiment_map)

# Calendar components taken from the date index
df['date'] = df.index
for column, attribute in (
    ('year', 'year'),
    ('month', 'month'),
    ('day', 'day'),
    ('day_of_week', 'dayofweek'),
):
    df[column] = getattr(df['date'].dt, attribute)

# Cyclical (sin/cos) encoding so that December/January and Sunday/Monday
# end up next to each other on the unit circle.
for column, period in (('month', 12), ('day_of_week', 7)):
    angle = 2 * np.pi * df[column] / period
    df[f'{column}_sin'] = np.sin(angle)
    df[f'{column}_cos'] = np.cos(angle)

# Number of whole days elapsed since 2000-01-01
epoch = pd.Timestamp('2000-01-01')
df['days_since_start'] = (df['date'] - epoch).dt.days


# Model inputs: the engineered date features, the macro-economic indicator
# columns from Processed_data.csv, and the numeric sentiment.
feature_columns = [
    "month_sin",
    "month_cos",
    "day_of_week_sin",
    "day_of_week_cos",
    "days_since_start",
    "GDP growth rate (%)",
    "Unemployment rate (%)",
    "Real interest rate (%)",
    "Inflation rate (%)",
    "Population growth (%)",
    "Export growth (%)",
    "Import growth (%)",
    "SentimentNum",
]
X = df[feature_columns]

# Quick visual check of the selected features
print(X.head())

# Model targets: one column per ticker. Zeros in these columns are replaced
# with NaN before training and testing.
target_columns = ['AAPL', 'BTC-USD', 'GOOGL', 'MSFT']
y = df[target_columns].replace(0, np.nan)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): rows are shuffled before splitting even though the data is
# indexed by date - confirm that a random (non-chronological) split is
# intended for this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Min-max scale features and targets. Each scaler learns its range from the
# training portion only and is then applied to both portions.
scaler_X = MinMaxScaler().fit(X_train)
scaler_y = MinMaxScaler().fit(y_train)

X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
y_train_scaled = scaler_y.transform(y_train)
y_test_scaled = scaler_y.transform(y_test)


# Make sure both output folders exist before anything is written.
# exist_ok=True replaces the separate existence check, and '../models' must
# be created as well because joblib.dump does not create missing parent
# directories.
os.makedirs('../data/prepared', exist_ok=True)
os.makedirs('../models', exist_ok=True)


# The scalers return plain numpy arrays; wrap them back into DataFrames so
# the column names and the date index of each split are kept.
X_train_scaled = pd.DataFrame(
    X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(
    X_test_scaled, columns=X.columns, index=X_test.index)
y_train_scaled = pd.DataFrame(
    y_train_scaled, columns=y.columns, index=y_train.index)
y_test_scaled = pd.DataFrame(
    y_test_scaled, columns=y.columns, index=y_test.index)

# Save the scaled splits to CSV files, sorted by index so the rows are in
# date order again after the shuffled split.

X_train_scaled.sort_index().to_csv('../data/prepared/X_train_scaled.csv')
X_test_scaled.sort_index().to_csv('../data/prepared/X_test_scaled.csv')

y_train_scaled.sort_index().to_csv('../data/prepared/y_train_scaled.csv')
y_test_scaled.sort_index().to_csv('../data/prepared/y_test_scaled.csv')

# Save the unscaled test targets for reference
y_test.sort_index().to_csv('../data/prepared/y_test_original.csv')

# Persist the fitted scalers so the same scaling (and its inverse) can be
# applied later.

joblib.dump(scaler_X, '../models/scaler_X.pkl')
joblib.dump(scaler_y, '../models/scaler_y.pkl')

            month_sin  month_cos  day_of_week_sin  day_of_week_cos  \
Date                                                                 
2002-01-01        0.5   0.866025         0.781831         0.623490   
2002-01-02        0.5   0.866025         0.974928        -0.222521   
2002-01-03        0.5   0.866025         0.433884        -0.900969   
2002-01-04        0.5   0.866025        -0.433884        -0.900969   
2002-01-05        0.5   0.866025        -0.974928        -0.222521   

            days_since_start  GDP growth rate (%)  Unemployment rate (%)  \
Date                                                                       
2002-01-01               731             1.700447               5.783000   
2002-01-02               732             1.703448               5.783564   
2002-01-03               733             1.706448               5.784129   
2002-01-04               734             1.709449               5.784693   
2002-01-05               735             1.712449    

['../models/scaler_y.pkl']