In [1]:
import pandas as pd
import numpy as np

In [2]:
def shuffle_data(X, y, seed=None):
    """
    Shuffle the samples of X and y in unison.

    A single random permutation of the row indices is drawn and applied to
    both inputs, so row i of the shuffled X still corresponds to element i of
    the shuffled y.

    Parameters:
    ----------
    X : array-like
        Feature dataset; samples are indexed along the first axis.
    y : array-like
        Target values; must contain the same number of samples as X.
    seed : int or None, default=None
        - If None, the permutation is drawn from NumPy's global random state
          (so it is still controlled by np.random.seed).
        - Otherwise, a dedicated generator seeded with this value is used,
          which makes the shuffle reproducible.

    Returns:
    -------
    X_shuffled, y_shuffled : numpy.ndarray
        Shuffled copies of X and y.

    Raises:
    ------
    ValueError
        If X and y do not contain the same number of samples.
    """
    # Fancy indexing with an index array only works on ndarrays, so coerce
    # lists / tuples first (ndarrays are passed through without copying).
    X = np.asarray(X)
    y = np.asarray(y)

    if X.shape[0] != y.shape[0]:
        raise ValueError(
            f"X and y must contain the same number of samples, "
            f"got {X.shape[0]} and {y.shape[0]}."
        )

    order = np.arange(X.shape[0])
    if seed is None:
        np.random.shuffle(order)
    else:
        np.random.default_rng(seed).shuffle(order)
    return X[order], y[order]

def train_test_split(X, y, test_size=0.5, shuffle=True):
    """
    Splits dataset into training and test sets.

    Parameters:
    ----------
    X : array-like
        Feature dataset.
    y : array-like
        Target values; must have the same number of samples as X.
    test_size : float or int, default=0.5
        - If float (0 < test_size < 1), it represents the proportion of the dataset to include in the test split.
          The test set is taken from the END of the (optionally shuffled) data.
        - If int (1 <= test_size < len(y)), it represents the absolute number of test samples.
          The test set is taken from the START of the (optionally shuffled) data.
    shuffle : bool, default=True
        If True, shuffles data before splitting.

    Returns:
    -------
    X_train, X_test, y_train, y_test : array-like
        Split feature and target sets for training and testing.

    Raises:
    ------
    ValueError
        If X and y have different lengths, or if test_size is outside the
        ranges documented above.
    """
    n_samples = len(y)
    if len(X) != n_samples:
        raise ValueError(
            f"X and y must contain the same number of samples, "
            f"got {len(X)} and {n_samples}."
        )

    # Validate before shuffling so an invalid request neither consumes random
    # state nor silently falls through and returns None.
    size_is_int = isinstance(test_size, (int, np.integer))
    if size_is_int:
        if not 1 <= test_size < n_samples:
            raise ValueError(
                f"Integer test_size must satisfy 1 <= test_size < {n_samples}, "
                f"got {test_size}."
            )
    elif not 0 < test_size < 1:
        raise ValueError(
            f"Float test_size must satisfy 0 < test_size < 1, got {test_size}."
        )

    if shuffle:
        X, y = shuffle_data(X, y)

    if size_is_int:
        # NOTE(review): the integer branch takes the test samples from the
        # front while the float branch takes them from the back. This only
        # matters when shuffle=False; kept as-is for backward compatibility.
        X_train, X_test = X[test_size:], X[:test_size]
        y_train, y_test = y[test_size:], y[:test_size]
    else:
        # Number of training samples (a count, not a ratio).
        n_train = n_samples - int(n_samples * test_size)
        X_train, X_test = X[:n_train], X[n_train:]
        y_train, y_test = y[:n_train], y[n_train:]

    return X_train, X_test, y_train, y_test
    

In [3]:
# Standardization formula: z = (x - mean) / standard_deviation

class Standardiser():
    """
    Column-wise z-score scaler: z = (x - mean) / standard_deviation.

    Call fit() to learn the per-column mean and standard deviation, then
    transform() to apply them to any array with the same number of columns.

    Attributes (set by fit):
    ----------
    size : number of columns seen during fit.
    mean : per-column mean of the fitted data.
    std  : per-column standard deviation of the fitted data.
    """

    def __init__(self) -> None:
        pass

    def fit(self, X):
        """
        Learn the per-column mean and standard deviation of X.

        Parameters:
        ----------
        X : 2-D array
            Data whose columns are the features to standardise.

        Returns:
        -------
        self : Standardiser
            The fitted instance, which allows method chaining.
        """
        self.size = X.shape[1]

        self.mean = X.mean(axis=0)
        self.std = X.std(axis=0)
        return self     #returns self, which allows method chaining

    def transform(self, X):
        """
        Standardise X with the statistics learned by fit().

        Columns whose fitted standard deviation is exactly zero (constant
        features) are only centred, not scaled, so the result contains no
        NaN/inf caused by a division by zero.

        Parameters:
        ----------
        X : 2-D array
            Data with the same number of columns as the array passed to fit().

        Returns:
        -------
        X_scaled : array
            (X - mean) / std, computed column-wise.

        Raises:
        ------
        AttributeError
            If transform() is called before fit().
        ValueError
            If X does not have the number of columns seen during fit().
        """
        if not all(hasattr(self, attr) for attr in ("size", "mean", "std")):
            raise AttributeError("Standardiser has not been fitted; call fit() first.")
        if X.shape[1] != self.size:
            raise ValueError("Wrong array dimensions!")

        # Guard against division by zero without altering the stored std.
        scale = np.where(self.std == 0, 1.0, self.std)
        return (X - self.mean) / scale
        

In [6]:
#Read the data

# NOTE(review): `dile_name` looks like a typo for `file_name`; the name is kept
# unchanged in case other cells reference it.
dile_name = 'data-reg.csv'
raw_data = pd.read_csv(dile_name, header=None).to_numpy()

# The first five columns are used as predictors, the last column as the target.
X = raw_data[:,:5]
Y = raw_data[:,-1]

print(f"Mean of all predictors{X.mean(axis=0)}")

# NOTE(review): the scaler is fitted on the full dataset before the split, and
# the split below uses the unscaled X — confirm this is intended.
scaler = Standardiser().fit(X)

X_scaled = scaler.transform(X)
print(f"Mean of all predictors standardised{X_scaled.mean(axis=0)}")

print(raw_data[1:5,:])


# train_test_split returns (X_train, X_test, y_train, y_test); unpack in that
# exact order so features and targets are not swapped.
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.8, shuffle=False)



   
   


Mean of all predictors[0.20131508 8.97294864 0.13104092 0.11286003 0.22287212]
Mean of all predictors standardised[-2.53208760e-17  0.00000000e+00 -7.79103877e-18  4.28507132e-17
 -1.81790905e-17]
[[ 1.42724753e+00  1.49424745e+02  3.43088589e+00  4.66607724e+00
   9.00486546e+00  2.97000000e+01]
 [ 2.18171598e+00 -7.07277125e+01 -2.59226200e-01 -2.36445576e+00
   2.37749440e+00  1.78000000e+01]
 [ 5.95405824e+00  6.17248717e+02  1.08111101e+01  6.89822116e+00
   1.27327617e+01  2.68000000e+01]
 [ 4.22955892e+00  2.45741445e+02 -3.94933829e+00  5.22799998e+00
   2.79170509e+00  1.66000000e+01]]
