In [5]:
import numpy as np
import matplotlib.pyplot as plt
import random

Train Test Split function - randomly separates the data into training and testing subsets so the model can be evaluated on data it was not trained on.

In [6]:
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly partition X and y into training and testing subsets.

    Parameters
    ----------
    X : array-like, first axis indexes samples
    y : array-like with the same number of samples as X
    test_size : float, default 0.2
        Fraction of the samples placed in the test subset; the count is
        ``int(n_samples * test_size)`` (rounded down).
    random_state : int or None, default None
        Seed for a reproducible shuffle. With None the global NumPy
        generator is used, so the split changes from call to call.

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray

    Raises
    ------
    ValueError
        If X and y contain a different number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, "
            f"got {n_samples} and {y.shape[0]}"
        )

    # A private RandomState produces the same permutation as seeding the
    # global generator would, but leaves the caller's random stream intact.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    indices = np.arange(n_samples)
    rng.shuffle(indices)

    n_test = int(n_samples * test_size)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

Time Series Split - splits time-ordered data into successive train/test folds, where each fold trains on all samples that come before its test block.

In [7]:
def TimeSeriesSplit(X, y, n_splits=5):
    """Build expanding-window train/test folds for time-ordered data.

    The data is cut into ``n_splits + 1`` consecutive blocks. Fold ``k``
    trains on everything before its test block and tests on the block that
    follows, so every fold has a non-empty test set and no test sample ever
    precedes a training sample. Any remainder that does not divide evenly is
    absorbed by the first training window, so the last test block always ends
    at the final sample.

    Parameters
    ----------
    X, y : sliceable sequences of equal length, ordered in time
    n_splits : int, default 5
        Number of folds to produce.

    Returns
    -------
    list of tuple
        ``n_splits`` tuples of ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``n_splits`` is less than 1 or there are fewer than
        ``n_splits + 1`` samples.
    """
    n_samples = len(X)
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")
    if n_samples < n_splits + 1:
        raise ValueError(
            f"Cannot create {n_splits} splits from {n_samples} samples; "
            f"at least {n_splits + 1} samples are required"
        )

    # n_splits + 1 blocks: one initial training block plus one test block per fold.
    fold_size = n_samples // (n_splits + 1)
    first_test_start = n_samples - n_splits * fold_size

    splits = []
    for test_start in range(first_test_start, n_samples, fold_size):
        test_end = test_start + fold_size
        splits.append((
            X[:test_start],
            X[test_start:test_end],
            y[:test_start],
            y[test_start:test_end],
        ))

    return splits

Standard Scaler - standardizes features to have zero mean and unit variance, so that all features contribute equally to the model.

In [8]:
class StandardScaler:
    """Standardize features by removing the mean and scaling to unit variance.

    Attributes
    ----------
    mean_ : per-feature mean learned by ``fit`` (None before fitting)
    scale_ : per-feature divisor learned by ``fit`` (None before fitting);
        the population standard deviation, with zeros replaced by 1
    """

    def __init__(self):
        self.mean_ = None
        self.scale_ = None

    def fit(self, X):
        """Learn the per-feature mean and standard deviation of X (axis 0)."""
        X = np.asarray(X)
        self.mean_ = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        # A constant feature has zero spread; dividing by 1 instead maps it
        # to 0 rather than producing NaN/inf from a division by zero.
        self.scale_ = np.where(std == 0, 1.0, std)

    def transform(self, X):
        """Return X centered by ``mean_`` and divided by ``scale_``."""
        return (np.asarray(X) - self.mean_) / self.scale_

    def fit_transform(self, X):
        """Fit on X, then return the standardized X."""
        self.fit(X)
        return self.transform(X)

MinMax Scaler - linearly rescales each feature to a fixed range ([0, 1]).

In [9]:
class minMaxScaler:
    """Linearly rescale each feature so the fitted data spans [0, 1].

    Attributes
    ----------
    min_ : per-feature minimum learned by ``fit`` (None before fitting)
    max_ : per-feature maximum learned by ``fit`` (None before fitting)
    """

    def __init__(self):
        self.min_ = None
        self.max_ = None

    def fit(self, X):
        """Learn the per-feature minimum and maximum of X (axis 0)."""
        X = np.asarray(X)
        self.min_ = np.min(X, axis=0)
        self.max_ = np.max(X, axis=0)

    def transform(self, X):
        """Return ``(X - min_) / (max_ - min_)``, guarding constant features."""
        span = np.asarray(self.max_ - self.min_)
        # A constant feature has a zero range; dividing by 1 instead maps it
        # to 0 rather than producing NaN from a 0/0 division.
        span = np.where(span == 0, 1, span)
        return (np.asarray(X) - self.min_) / span

    def fit_transform(self, X):
        """Fit on X, then return the rescaled X."""
        self.fit(X)
        return self.transform(X)

One Hot Encoder - used to convert categorical integer labels into binary vectors.

In [10]:
def OneHotEncoder(y):
    """Encode a 1-D label array as a one-hot matrix.

    Columns follow the sorted order of the distinct values in ``y``; row ``i``
    holds a 1 in the column of ``y[i]`` and 0 everywhere else.

    Returns a float array of shape ``(len(y), n_distinct_values)``.
    """
    categories = np.unique(y)
    encoded = np.zeros((y.shape[0], len(categories)))
    for column, category in enumerate(categories):
        # The boolean membership mask is stored as 1.0 / 0.0 in the float column.
        encoded[:, column] = (y == category)
    return encoded

Label Encoder - used to convert text labels into integers.

In [11]:
class LabelEncoder:
    """Map labels to integer codes ``0 .. n_classes - 1``.

    Attributes
    ----------
    classes_ : sorted array of the distinct labels seen by ``fit``
        (None before fitting); a label's code is its position in this array
    """

    def __init__(self):
        self.classes_ = None

    def fit(self, y):
        """Record the sorted distinct labels of y."""
        self.classes_ = np.unique(y)

    def transform(self, y):
        """Return the integer code of every label in y.

        The input is flattened to 1-D. An empty input yields an empty integer
        array.

        Raises
        ------
        IndexError
            If y contains a label that was not seen during ``fit``.
        """
        labels = np.asarray(y).ravel()
        if labels.size == 0:
            return np.empty(0, dtype=np.intp)
        if len(self.classes_) == 0:
            raise IndexError(
                f"y contains previously unseen labels: {np.unique(labels).tolist()}"
            )

        # classes_ is sorted (np.unique), so a binary search finds each code.
        codes = np.searchsorted(self.classes_, labels)
        # Labels larger than every known class land one past the end; clamp
        # them so the membership check below can index safely.
        codes = np.minimum(codes, len(self.classes_) - 1)

        unseen = self.classes_[codes] != labels
        if np.any(unseen):
            raise IndexError(
                f"y contains previously unseen labels: {np.unique(labels[unseen]).tolist()}"
            )
        return codes

    def fit_transform(self, y):
        """Fit on y, then return its integer codes."""
        self.fit(y)
        return self.transform(y)

Simple Imputer - handles missing data in the dataset by filling missing values using a chosen strategy (mean, median, or most frequent value).

In [12]:
class SimpleImputer:
    """Fill NaN entries of a 2-D numeric array with a per-column statistic.

    Parameters
    ----------
    strategy : {'mean', 'median', 'most_frequent'}, default 'mean'
        Statistic computed per column (ignoring NaN) and used as the fill.

    Attributes
    ----------
    fill_value_ : per-column fill values learned by ``fit``
        (None before fitting)
    """

    _STRATEGIES = ('mean', 'median', 'most_frequent')

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.fill_value_ = None

    @staticmethod
    def _most_frequent(column):
        """Return the most common non-NaN value of a 1-D column (NaN if none)."""
        observed = column[~np.isnan(column)]
        if observed.size == 0:
            return np.nan
        # np.unique returns sorted values, so ties resolve to the smallest one.
        values, counts = np.unique(observed, return_counts=True)
        return values[np.argmax(counts)]

    def fit(self, X):
        """Compute the per-column fill values from X.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported names.
        """
        if self.strategy not in self._STRATEGIES:
            raise ValueError(
                f"Unknown strategy {self.strategy!r}; "
                f"expected one of {self._STRATEGIES}"
            )

        X = np.asarray(X, dtype=float)
        if self.strategy == 'mean':
            self.fill_value_ = np.nanmean(X, axis=0)
        elif self.strategy == 'median':
            self.fill_value_ = np.nanmedian(X, axis=0)
        else:
            # Counting exact values keeps floats and negative numbers intact.
            self.fill_value_ = np.array([self._most_frequent(col) for col in X.T])

    def transform(self, X):
        """Return a copy of X with every NaN replaced by its column's fill value."""
        X_imputed = np.array(X)  # copy, so the caller's data is never modified
        missing = np.isnan(X_imputed)
        # Broadcast the per-column fills to X's shape and pick the NaN positions.
        X_imputed[missing] = np.broadcast_to(self.fill_value_, X_imputed.shape)[missing]
        return X_imputed

    def fit_transform(self, X):
        """Fit on X, then return the imputed copy of X."""
        self.fit(X)
        return self.transform(X)