Prepare the data for processing


In [None]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the processed dataset; the 'Date' column becomes the (parsed) index.
df = pd.read_csv(
    '../data/processed/Processed_data.csv',
    index_col='Date',
    parse_dates=True,
)

# Encode the sentiment label as a signed integer in a new 'SentimentNum'
# column. Labels that are not keys of the mapping come out as NaN.
sentiment_map = {
    'Bearish': -1,
    'Neutral': 0,
    'Bullish': 1,
}
df = df.assign(SentimentNum=df['Sentiment'].map(sentiment_map))

# Feature matrix: the indicator columns plus the two sentiment columns
# taken from Processed_data.csv.
feature_columns = [
    "GDP growth rate (%)",
    "Unemployment rate (%)",
    "Real interest rate (%)",
    "Inflation rate (%)",
    "Population growth (%)",
    "Export growth (%)",
    "SentimentScore",
    "SentimentNum",
]
X = df[feature_columns]

# Target matrix: one column per asset. Every 0 is replaced with NaN before
# the data is split (presumably 0 marks a missing value -- confirm).
target_columns = ['AAPL', 'BTC-USD', 'GOOGL', 'MSFT']
y = df[target_columns].replace(0, np.nan)

# Hold out 20% of the rows as the test set. The fixed random_state makes the
# shuffled split reproducible between runs.
# NOTE(review): the rows are Date-indexed and shuffled before splitting, so
# test rows are interleaved in time with training rows -- confirm intended.
split_options = {'test_size': 0.2, 'shuffle': True, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Preview the first 20 training rows (still unscaled at this point).
print(X_train.head(n=20))


# Fit the MinMaxScaler on the training features only, then apply that same
# fitted scaler to both splits so the test set is never used for fitting.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Make sure the output folder for the prepared data exists. exist_ok=True
# makes this a no-op when the folder is already there and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs('../data/prepared', exist_ok=True)


# The scaler returned plain arrays; wrap them back into DataFrames that carry
# the original feature names and the Date index of each split.
X_train = pd.DataFrame(
    X_train_scaled, index=X_train.index, columns=X.columns)
X_test = pd.DataFrame(
    X_test_scaled, index=X_test.index, columns=X.columns)

# Write each split to its own CSV file, ordered by index.
for frame, csv_path in (
    (X_train, '../data/prepared/X_train.csv'),
    (X_test, '../data/prepared/X_test.csv'),
    (y_train, '../data/prepared/y_train.csv'),
    (y_test, '../data/prepared/y_test.csv'),
):
    frame.sort_index().to_csv(csv_path)

# Persist the fitted scaler next to the CSVs so it can be reloaded later.
# Kept as the final bare expression so the notebook cell still echoes the
# list of written file names.
joblib.dump(scaler, '../data/prepared/scaler.pkl')

            GDP growth rate (%)  Unemployment rate (%)  \
Date                                                     
2011-05-16             1.832449               8.623521   
2019-01-06             2.518800               3.729082   
2011-08-03             1.989303               8.433055   
2009-05-12            -0.684468               9.390025   
2015-10-03             2.097120               4.970342   
2017-12-04             2.927467               3.931211   
2010-02-21             2.537192               9.537427   
2002-08-20             2.393548               5.913373   
2008-06-12            -1.084457               7.329383   
2008-10-12            -1.981153               8.486049   
2021-07-11             4.201213               4.459934   
2004-12-15             3.500467               5.104669   
2017-02-24             2.532909               4.287093   
2005-01-20             3.447163               5.060003   
2009-06-09            -0.280064               9.419099   
2020-07-09    

['../data/prepared/scaler.pkl']