In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import Dataset, DataLoader

In [33]:
# Load the training set and prepare it for the network:
#   1. keep only the selected satellites,
#   2. replace the `epoch` timestamp with integer seconds since a reference time,
#   3. fill missing values with the column mean,
#   4. drop rows flagged by the 1.5 * IQR rule,
#   5. min-max scale the value columns to [-1, 1].

# change this as needed for the number of satellites used
SAT_IDS = [0, 1, 2, 3, 4, 5, 6]
# Columns that label a row rather than carry a value to be cleaned or scaled.
ID_COLS = ['time', 'id', 'sat_id']

raw_data = pd.read_csv('jan_train.csv')

# .copy() gives an independent frame, so the column assignments below never
# write into a slice of `raw_data` (avoids SettingWithCopyWarning).
data = raw_data[raw_data['sat_id'].isin(SAT_IDS)].copy()

# Convert time to seconds since first day (fractional seconds are truncated)
reference_time = pd.Timestamp('2014-01-01 00:00:00.000')
elapsed = pd.to_datetime(data['epoch']) - reference_time
data['time'] = elapsed.dt.total_seconds().astype(int)
data = data.drop(columns=['epoch'])

# Fill in NA cells with average
# numeric_only=True keeps this working even if a non-numeric column is present.
data = data.fillna(data.mean(numeric_only=True))

# Value columns: every numeric column except the identifier/time columns.
numeric_cols = data.select_dtypes(include=[np.number]).columns.difference(ID_COLS)

# Remove outliers
# Only the value columns are tested. Running the IQR rule on id / sat_id / time
# would discard rows because of their label (e.g. a satellite whose id is far
# from the others) rather than because of an anomalous value.
Q1 = data[numeric_cols].quantile(0.25)
Q3 = data[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

values = data[numeric_cols]
outlier_mask = ((values < lower_bound) | (values > upper_bound)).any(axis=1)
data = data[~outlier_mask].copy()

# Normalize between -1 and 1
# The scaler is fitted on the rows that survived outlier removal; keep `scaler`
# around if predictions need to be mapped back to the original scale.
scaler = MinMaxScaler(feature_range=(-1, 1))
data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

data.head(10)

Unnamed: 0,id,sat_id,x,y,z,Vx,Vy,Vz,x_sim,y_sim,z_sim,Vx_sim,Vy_sim,Vz_sim,time
0,0,0,-0.24579,0.486323,-0.976791,-0.208606,-0.675218,-0.913636,-0.143951,0.461467,-0.998713,-0.207422,-0.674203,-0.915322,0
3,3,0,-0.252363,-0.321483,-0.976352,0.090341,-0.591174,0.664548,-0.150823,-0.376853,-1.0,0.091557,-0.591108,0.663871,8409
4,4,0,-0.197762,-0.519495,-0.658086,0.141318,-0.426047,0.926969,-0.096552,-0.582583,-0.682571,0.14267,-0.426248,0.927151,11212
8,8,0,0.088839,-0.761753,0.975226,0.1596,0.157358,0.997815,0.188801,-0.834976,0.949056,0.161335,0.15732,1.0,22424
24,24,0,-0.245577,0.487125,-0.975713,-0.209069,-0.67471,-0.915479,-0.143928,0.461423,-0.998693,-0.207537,-0.67422,-0.915317,67272
27,27,0,-0.252593,-0.32078,-0.977168,0.090047,-0.591675,0.663382,-0.150901,-0.376898,-0.99994,0.09148,-0.591095,0.663845,75681
28,28,0,-0.198073,-0.51898,-0.659248,0.141133,-0.426615,0.926232,-0.096652,-0.582626,-0.682529,0.14261,-0.426251,0.927068,78484
32,32,0,0.088484,-0.761977,0.973798,0.15968,0.156903,0.998033,0.188675,-0.835083,0.94895,0.161355,0.157246,0.999917,89696
48,48,0,-0.245371,0.487919,-0.974625,-0.209538,-0.67421,-0.91729,-0.143911,0.461369,-0.998671,-0.207656,-0.674249,-0.915282,134544
51,51,0,-0.252825,-0.320088,-0.977933,0.089763,-0.592169,0.662253,-0.150981,-0.376955,-0.999839,0.091412,-0.591075,0.663851,142953


In [31]:
# Wrap the processed data in a Dataset and DataLoader, ready for use in the network
class SatelliteDataset(Dataset):
    """Dataset that pairs each feature row with the target row at the same index.

    Args:
        features: Sized, indexable collection of input samples
            (a float tensor in this notebook).
        targets: Sized, indexable collection of targets, aligned
            index-for-index with ``features``.

    Raises:
        ValueError: If ``features`` and ``targets`` have different lengths.
    """

    def __init__(self, features, targets):
        # Fail fast: a length mismatch would otherwise surface only as an
        # IndexError part-way through an epoch, or silently ignore extra targets.
        if len(features) != len(targets):
            raise ValueError(
                f"features and targets must have the same length, "
                f"got {len(features)} and {len(targets)}"
            )
        self.features = features
        self.targets = targets

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, targets)`` pair stored at position ``idx``."""
        return self.features[idx], self.targets[idx]

# Input columns and the `_sim` columns the network is trained to predict.
feature_columns = ['x', 'y', 'z', 'Vx', 'Vy', 'Vz']
target_columns = ['x_sim', 'y_sim', 'z_sim', 'Vx_sim', 'Vy_sim', 'Vz_sim']

# Pull the two column groups out of the processed frame as NumPy arrays.
features = data.loc[:, feature_columns].to_numpy()
targets = data.loc[:, target_columns].to_numpy()

# Convert both arrays to float32 tensors.
features_tensor, targets_tensor = (
    torch.tensor(array, dtype=torch.float32) for array in (features, targets)
)

# Pair inputs with targets and serve them in shuffled mini-batches of 32.
dataset = SatelliteDataset(features_tensor, targets_tensor)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)