# Library imports

In [16]:
import sys

del sys.modules["os"]
import os
import pandas as pd
import numpy as np
import types
import time
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA



# Global variables

In [9]:
# Folder that holds the project's CSV inputs.
dir_path = os.path.join(
    "C:\\",
    "app",
    "python-scripts",
    "machine_learning",
    "project",
)
# Names of the CSV files expected inside ``dir_path``.
train_data_file, train_labels_file, test_data_file = (
    "train_data.csv",
    "train_labels.csv",
    "test_data.csv",
)
# Column names handed to the CSV loader; reassigned by the main section.
header_list = list()


# Function definitions

In [21]:
def loaddata(
    dir_path: str, file_name: str, header_list: list
    ) -> pd.core.frame.DataFrame:
    """
    Read a CSV file from disk into a pandas DataFrame.

    Args:
        dir_path:    directory that contains the file
        file_name:   name of the CSV file inside ``dir_path``
        header_list: column names, forwarded to ``pd.read_csv`` as ``names``

    Returns: DataFrame with the parsed file contents

    """
    csv_path = os.path.join(dir_path, file_name)
    frame = pd.read_csv(csv_path, names=header_list)
    return frame

def dump_file(_dir_path: str, _file_name: str, _buffer: pd.core.frame.DataFrame):
    """
    Serialize the given object into a binary file using pickle.

    Args:
        _dir_path:   directory in which the file is written
        _file_name:  name of the file to be saved
        _buffer:     object to write (annotated as a DataFrame)

    Returns: None

    """
    # The previous body called a bare ``dump`` that is never imported in this
    # file, so every call raised NameError. The standard-library pickle module
    # is assumed to be the intended serializer; it is imported locally so the
    # function works regardless of the file's import cell.
    import pickle

    with open(os.path.join(_dir_path, _file_name), "wb") as f:
        pickle.dump(_buffer, f)


def load_file(_dir_path: str, _file_name: str) -> pd.core.frame.DataFrame:
    """
    Load a pickled object (expected to be a pandas DataFrame) from a binary file.

    Args:
        _dir_path:   directory that contains the file
        _file_name:  binary file to load

    Returns: the unpickled object (annotated as a DataFrame)

    """
    # The previous body called a bare ``load`` that is never imported in this
    # file, so every call raised NameError. The standard-library pickle module
    # is assumed to be the intended deserializer.
    import pickle

    with open(os.path.join(_dir_path, _file_name), "rb") as f:
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced by a trusted source.
        return pickle.load(f)


def standard_scaler(_df: pd.core.frame.DataFrame) -> np.ndarray:
    """
    Standardize the dataset with sklearn's StandardScaler.

    The scaler is fitted on ``_df`` and then applied to that same data.

    Args:
        _df:    dataset to standardize

    Returns: standardized features

    """
    fitted = StandardScaler()
    fitted.fit(_df)
    standardized = fitted.transform(_df)
    return standardized


def min_max_scaler(_df: pd.core.frame.DataFrame) -> np.ndarray:
    """
    Rescale the dataset with sklearn's MinMaxScaler (default settings).

    The scaler is fitted on ``_df`` and applied to it in a single step.

    Args:
        _df:    dataset to rescale

    Returns: rescaled features

    """
    return MinMaxScaler().fit_transform(_df)


def log_transformation(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """
    Reduce skew by applying a natural-log transformation to every column.

    Values greater than zero are replaced by their natural logarithm; all
    other values (zero, negatives, NaN) become 0.

    Per the original author's note, this is not used in this project because
    the dataset is normally distributed.

    Args:
        df: dataset to transform; it is not modified
    Returns: new DataFrame with the transformed data.
    """
    # Work on a copy so the caller's frame is left untouched.
    transformed = df.copy()
    for col in transformed.columns:
        # The previous version discarded the result of ``map`` (so nothing was
        # ever transformed) and looked columns up via ``str(col)``, which
        # raised KeyError for non-string labels.
        transformed[col] = transformed[col].map(
            lambda value: np.log(value) if value > 0 else 0
        )
    return transformed


def pca_data_rescaled(_np: np.ndarray, _n_comp: int, _svd_solver: str) -> np.ndarray:
    """
    Reduce the dimensionality of the dataset with sklearn's PCA.

    The PCA model is fitted on ``_np`` and then used to project that same
    data.

    Args:
        _np:         data to reduce
        _n_comp:     forwarded to PCA as ``n_components``
                     (the main section of this file passes 0.99)
        _svd_solver: forwarded to PCA as ``svd_solver``
    Returns: the data projected onto the retained components
    """
    reducer = PCA(svd_solver=_svd_solver, n_components=_n_comp)
    return reducer.fit(_np).transform(_np)


def explain_variance_graph(pca=None, threshold=0.99):
    """
    Plot cumulative explained variance against the number of PCA components,
    with a horizontal line marking the variance threshold.

    Args:
        pca:       fitted object exposing ``explained_variance_ratio_``.
                   When omitted, a module-level variable named ``pca`` is
                   used, which is what the original no-argument version read.
        threshold: cumulative-variance level to mark on the plot (0.99 by
                   default, as in the original).
    Returns: none
    Raises:
        NameError: if ``pca`` is not passed and no module-level ``pca`` exists.
    """
    # The original body contained ``%matplotlib inline``. That is an IPython
    # magic, not Python syntax, so it cannot live inside a function; run it in
    # a notebook cell instead when inline rendering is wanted.
    # ``plt`` was never imported anywhere in this file, so import it locally.
    import matplotlib.pyplot as plt

    if pca is None:
        # Backward compatibility with the original zero-argument call.
        pca = globals().get("pca")
        if pca is None:
            raise NameError("name 'pca' is not defined")

    y = np.cumsum(pca.explained_variance_ratio_)
    # One x position per component; the original hard-coded 3557 positions,
    # which only matched one specific fitted model.
    xi = np.arange(1, len(y) + 1, step=1)

    # Size this figure only, instead of changing the global rcParams.
    fig, ax = plt.subplots(figsize=(12, 6))

    plt.ylim(0.0, 1.1)

    plt.plot(xi, y, marker='o', linestyle='--', color='b')

    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative variance (%)')
    plt.title('The number of components needed to explain variance')

    plt.axhline(y=threshold, color='r', linestyle='-')
    # With the default threshold this renders as '99% cut-off threshold'.
    plt.text(0.5, 0.85, f'{threshold:.0%} cut-off threshold', color='red', fontsize=16)

    ax.grid(axis='x')
    plt.show()
    
    

# Main section

In [19]:
if __name__ == "__main__":
    # --- Stage 1: read labels and features from CSV -----------------------
    start_time = time.time()

    # The labels file gets a single column name.
    header_list = ["T0000"]
    df_targets = loaddata(dir_path, train_labels_file, header_list)

    # The feature file gets 10000 column names: F0000 ... F9999.
    header_list = [f"F{i:04d}" for i in range(10000)]
    df_features = loaddata(dir_path, train_data_file, header_list)

    print(f"loading csv elapsed time: {time.time() - start_time}[sec]")

    # --- Stage 2: rescale the features with the min-max scaler ------------
    start_time = time.time()
    np_features = min_max_scaler(df_features)
    print(f"standarization time: {time.time() - start_time}[sec]")

    # --- Stage 3: PCA reduction, n_components=0.99 and the 'auto' solver --
    start_time = time.time()
    print(f'oryginal shape {np_features.shape}')
    np_pca_reduced = pca_data_rescaled(np_features, 0.99, 'auto')
    print(f'new shape {np_pca_reduced.shape}')
    print(f"reducing dataset size elapsed time: {time.time() - start_time}")

loading csv elapsed time: 18.633920907974243[sec]
standarization time: 0.47403693199157715[sec]
oryginal shape (3750, 10000)
new shape (3750, 3557)
reducing dataset size elapsed time: 85.14154243469238
