Prepare data for processing


In [None]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the processed dataset; the 'Date' column becomes a parsed index.
df = pd.read_csv(
    '../data/processed/Processed_data.csv',
    index_col='Date',
    parse_dates=True,
)

# Encode the sentiment label as a number; labels missing from the map
# become NaN.
sentiment_map = {'Bearish': -1, 'Neutral': 0, 'Bullish': 1}
df['SentimentNum'] = df['Sentiment'].map(sentiment_map)

# Expose the index as a numeric feature: its int64 representation divided
# by 10**9 (epoch seconds when the index is stored in nanoseconds).
index_as_int = df.index.astype('int64')
df['Date'] = index_as_int / 1_000_000_000

# Feature matrix: the numeric date, the macro-economic indicators and both
# sentiment columns.
feature_columns = [
    "Date",
    "GDP growth rate (%)",
    "Unemployment rate (%)",
    "Real interest rate (%)",
    "Inflation rate (%)",
    "Population growth (%)",
    "Export growth (%)",
    "SentimentScore",
    "SentimentNum",
]
X = df[feature_columns]

# Targets: one column per asset. Zeros are replaced with NaN before
# training and testing.
target_columns = ['AAPL', 'BTC-USD', 'GOOGL', 'MSFT']
y = df[target_columns].replace(0, np.nan)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): shuffle=True mixes rows from all dates into both splits --
# confirm this is intended for date-indexed data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)


# Scale features and targets with MinMaxScaler, fitting on the training
# split only and applying the fitted transform to the test split.
#
# Features and targets each get their own scaler instance. Refitting a
# single shared instance on the targets would discard the feature min/max
# learned by the first fit, leaving no way to scale unseen feature rows
# consistently later on.
feature_scaler = MinMaxScaler()
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# `scaler` remains the target scaler: that is the object scaler.pkl has
# always contained, so existing consumers of that file keep working.
scaler = MinMaxScaler()
y_train_scaled = scaler.fit_transform(y_train)
y_test_scaled = scaler.transform(y_test)

# Create the output folder if it does not exist yet. exist_ok=True avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs('../data/prepared', exist_ok=True)

# The scalers return bare numpy arrays; wrap them back into DataFrames so
# the column names and the date index of each split are preserved.
X_train_scaled = pd.DataFrame(
    X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(
    X_test_scaled, columns=X.columns, index=X_test.index)
y_train_scaled = pd.DataFrame(
    y_train_scaled, columns=y.columns, index=y_train.index)
y_test_scaled = pd.DataFrame(
    y_test_scaled, columns=y.columns, index=y_test.index)

# Save the scaled splits to CSV files, sorted by index because the split
# shuffled the rows.
X_train_scaled.sort_index().to_csv('../data/prepared/X_train_scaled.csv')
X_test_scaled.sort_index().to_csv('../data/prepared/X_test_scaled.csv')

y_train_scaled.sort_index().to_csv('../data/prepared/y_train_scaled.csv')
y_test_scaled.sort_index().to_csv('../data/prepared/y_test_scaled.csv')

# Save the unscaled test targets to a CSV file for reference.
y_test.sort_index().to_csv('../data/prepared/y_test_original.csv')

# Persist both scalers for later use. The feature scaler goes to a new
# file; scaler.pkl keeps holding the target scaler, as before. The target
# scaler is dumped last so the cell's displayed result is unchanged.
joblib.dump(feature_scaler, '../data/prepared/feature_scaler.pkl')
joblib.dump(scaler, '../data/prepared/scaler.pkl')

['../data/prepared/scaler.pkl']