# Library imports

In [12]:
import sys

del sys.modules["os"]
import os
import pandas as pd
import numpy as np
import types
import time
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


# Global variables

In [9]:
# Directory holding the project's data files (Windows path assembled from its parts)
dir_path = os.path.join("C:\\", "app", "python-scripts", "machine_learning", "project")
# CSV file names; the train files are joined with dir_path by loaddata() in the main section
train_data_file = "train_data.csv"
train_labels_file = "train_labels.csv"
# NOTE(review): test_data_file is not referenced anywhere in the visible code
test_data_file = "test_data.csv"
# Column names for the CSV being loaded; reassigned in the main section before each load
header_list = []


# Function definitions

In [13]:
def loaddata(
    dir_path: str, file_name: str, header_list: list
    ) -> pd.core.frame.DataFrame:
    """
    Read a CSV file into a DataFrame using the supplied column names.

    Args:
        dir_path:    directory that contains the file
        file_name:   name of the CSV file to read
        header_list: column names, passed to ``pd.read_csv`` as ``names``

    Returns: DataFrame with the parsed file contents

    """
    csv_path = os.path.join(dir_path, file_name)
    frame = pd.read_csv(csv_path, names=header_list)
    return frame

def dump_file(_dir_path: str, _file_name: str, _buffer: pd.core.frame.DataFrame):
    """
    Serialize the given object into a binary file using pickle.

    Args:
        _dir_path:   application working directory
        _file_name:  the name of the file to be saved
        _buffer:     object to write (annotated as a DataFrame)

    Returns: None

    """
    # Imported locally: the file's import section provides no `dump` name,
    # so the serializer has to be brought into scope here.
    import pickle

    with open(os.path.join(_dir_path, _file_name), "wb") as f:
        pickle.dump(_buffer, f)


def load_file(_dir_path: str, _file_name: str) -> pd.core.frame.DataFrame:
    """
    Load a pickled object from a binary file and return it.

    Args:
        _dir_path:   application working directory
        _file_name:  binary file to load

    Returns: the unpickled object (annotated as a DataFrame)

    """
    # Imported locally: the file's import section provides no `load` name,
    # so the deserializer has to be brought into scope here.
    import pickle

    with open(os.path.join(_dir_path, _file_name), "rb") as f:
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        return pickle.load(f)


def standard_scaler(_df: pd.core.frame.DataFrame) -> np.ndarray:
    """
    Fit a StandardScaler on the dataset and return the transformed data.

    Args:
        _df:    dataset to standardize

    Returns: result of transforming ``_df`` with the fitted scaler

    """
    # Fit and transform on the same data, chained into one expression
    return StandardScaler().fit(_df).transform(_df)


def min_max_scaler(_df: pd.core.frame.DataFrame) -> np.ndarray:
    """
    Rescale the dataset with a MinMaxScaler built with default settings.

    Args:
        _df:    dataset to rescale

    Returns: result of ``MinMaxScaler.fit_transform`` applied to ``_df``

    """
    return MinMaxScaler().fit_transform(_df)




# Main section

In [14]:
if __name__ == "__main__":
    # perf_counter is a monotonic clock intended for measuring elapsed intervals,
    # unlike time.time(), which follows the adjustable wall clock.
    start_time = time.perf_counter()
    # Load labels with a single-column header into a pandas DataFrame
    header_list = ["T0000"]
    df_targets = loaddata(dir_path, train_labels_file, header_list)

    # Build the dataset header: F0000, F0001, ..., F9999
    header_list = [f"F{i:04d}" for i in range(10000)]

    # Load dataset with the list of headers into a pandas DataFrame
    df_features = loaddata(dir_path, train_data_file, header_list)
    print(f"loading csv elapsed time: {time.perf_counter() - start_time}[sec]")

    # Time the feature scaling step separately from the CSV loading
    start_time = time.perf_counter()
    np_features = min_max_scaler(df_features)
    print(f"standarization time: {time.perf_counter() - start_time}[sec]")


loading csv elapsed time: 15.52216911315918
standarization time: 0.48613905906677246
