# Data Exploration

In [None]:
# note: The downloaded data is ready to be fed into any model. 
# Features, which are used as input variables for training the model, are provided in rows 1 to 144 of each file.
# Labels, which represent the target variable that the model aims to predict, are provided from row 145 to the end of each file.


In [None]:
# txt to numpy array to csv file
# The data is in the form of a .txt file, with each row containing a single data point.
# after running this cell, the data will be in the form of a .csv file in your local directory

import numpy as np
import pandas as pd
import os


# Directory containing the folders & files
data_dir = 'BenchmarkDatasets\BenchmarkDatasets'

# Iterate through each folder (auction and noauction)
for folder in ['Auction', 'NoAuction']:
    folder_path = os.path.join(data_dir, folder)

    # Iterate through each normalization setup (z-score)
    for normalization_setup in ['1.' + str(folder)+ '_' + 'Zscore']:
        normalization_setup_path = os.path.join(folder_path, normalization_setup)
       
        # Iterate through each type of data (training and testing)
        for data_type in [ 'Training',  'Testing']:
           
            data_type_path = os.path.join(normalization_setup_path, f"{folder}_Zscore_{data_type}")
            # print what is in the folder
            print(os.listdir(data_type_path))
            
            # train_auction  -> test_auction -> train_noauction -> test_noauction    
            for file in os.listdir(data_type_path):
                print(file)
                # Read the content of the TXT file
                with open(os.path.join(data_type_path, file), 'r') as file:
                    lines = file.readlines()
                    filename = os.path.basename(file.name) 
                    # Convert the content of the TXT file to a numpy array
                    data = np.array([line.strip().split() for line in lines])

                    # save the numpy array as a CSV file
                    df = pd.DataFrame(data)

                    # Save to csv
                    df.to_csv(f"{filename.split('.')[0]}.csv", index=False, header = False)
                    
               

In [5]:
#csv file to particular stock data 
# input: csv file, stock index
# stock index: 0, 1, 2, 3, 4 - each number being a different company stock
# output: numpy array of the stock data

def extract_stock_from_csv(csv_file, stock_idx):
    # Load CSV file into a pandas DataFrame
    df = pd.read_csv(csv_file)

    # Convert DataFrame to NumPy array
    raw_data = df.to_numpy()

    # Calculate boundaries
    n_boundaries = 4
    boundaries = np.sort(
        np.argsort(np.abs(np.diff(raw_data[0], prepend=np.inf)))[-n_boundaries - 1:]
    )
    boundaries = np.append(boundaries, [raw_data.shape[1]])

    # Split data
    split_data = tuple(raw_data[:, boundaries[i]:boundaries[i + 1]] for i in range(n_boundaries + 1))

    return split_data[stock_idx]

In [None]:
extract_stock_from_csv('Train_Dst_Auction_ZScore_CF_1.csv', 0)

In [11]:
# extract features & labels & transpose to connect them 
# input: data (csv file) 
# output: x (features), y (labels)


def extract_feature_label (data):
    # Load CSV file into a pandas DataFrame
    df = pd.read_csv(data)
    # pd to numpy
    data = df.to_numpy()
    
    data_length = data.shape[0] - 5

    # x is the features, y is the labels
    x = data[:data_length, :].T 
    y = data[-5:, :].T
    
    return x, y

In [15]:
# Example of one file. 
# x is the features, y is the labels for all files we work on 

x, y = extract_feature_label('Train_Dst_Auction_ZScore_CF_1.csv')
print(x.shape)
print(y.shape)


(47342, 143)
(47342, 5)


# Baseline Model 