In [14]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.utils import all_estimators
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')
from inspect import signature, _empty


def load_data(datapath) -> pd.DataFrame:
    """
    Read a csv file into a pandas DataFrame.

    The given path is resolved to an absolute path (relative paths are taken
    against the current working directory) before it is handed to pandas.
    """
    return pd.read_csv(os.path.abspath(datapath))

def create_strat_cat(raw_data) -> tuple:
    """
    Add a quartile-based categorical copy of the class label to the dataframe.

    The class label is taken to be the last column of `raw_data`. Its values are
    binned at the 25%, 50% and 75% quantiles (the outer edges are open-ended) and
    the bin number is stored in a new column named "<label>_cat", so the data can
    later be split according to the distribution of the class values.

    Note that the column is added to `raw_data` in place; the same object is
    returned.

    Returns a tuple `(data_w_strat_cat, strat_label)`: the dataframe with the
    extra column and the name of the class label column.
    """
    strat_label = raw_data.columns[-1]

    # Only the label's quartiles are needed, so compute them directly instead
    # of running describe() over every numeric column of the frame.
    quartiles = raw_data[strat_label].quantile([0.25, 0.5, 0.75]).tolist()
    edges = [-np.inf, *quartiles, np.inf]

    # Heavily tied labels can make neighbouring quartiles equal, and pd.cut
    # rejects repeated bin edges. Collapse repeats (order is preserved) and
    # number the remaining bins 1..k; with distinct quartiles this is the
    # usual [1, 2, 3, 4].
    strat_bins = list(dict.fromkeys(edges))
    bin_labels = list(range(1, len(strat_bins)))

    raw_data[f"{strat_label}_cat"] = pd.cut(
        raw_data[strat_label], bins=strat_bins, labels=bin_labels
    )
    return raw_data, strat_label

def data_split(datapath) -> pd.DataFrame:
    """
    Load a csv dataset and return the attributes of its stratified training split.

    The data is loaded with load_data, given a categorical version of its class
    label (the last column) by create_strat_cat, and split 80/20 with a single
    StratifiedShuffleSplit (random_state=42) on that categorical column. Only the
    training portion is used: the helper category column and the class label are
    dropped, the remaining attributes are shown with display() and returned.
    The held-out 20% and the training labels are not returned by this function.
    """
    raw_data = load_data(datapath)
    data_w_strat_cat, strat_label = create_strat_cat(raw_data)
    strat_column = f"{strat_label}_cat"

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, _ = next(split.split(data_w_strat_cat, data_w_strat_cat[strat_column]))

    # The splitter yields positional indices, so select with iloc; drop()
    # returns a new frame, which avoids modifying a slice in place.
    train = data_w_strat_cat.iloc[train_index].drop(columns=strat_column)

    # With the helper column gone, the class label is the last column again.
    data_label = train.columns[-1]
    train_attrib = train.drop(columns=data_label)
    display(train_attrib)
    return train_attrib

def transform(datapath):
    """
    Preprocess the training attributes of a csv dataset and display the result.

    The training attributes come from data_split. Columns are divided into
    categorical ones (object, string, bool or category dtype), which are one-hot
    encoded, and all remaining ones, which go through the winsorize transformer.
    The output of that column transformer is wrapped in a DataFrame and shown
    with display(). Nothing is returned.
    """
    train_attrib = data_split(datapath)

    def _is_categorical(dtype) -> bool:
        # Decide from the column dtype rather than from one sampled cell, so a
        # missing value in a particular row cannot change the classification.
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or pd.api.types.is_bool_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    cat, num = [], []
    for column, dtype in train_attrib.dtypes.items():
        (cat if _is_categorical(dtype) else num).append(column)

    # Newer scikit-learn releases name the dense-output switch `sparse_output`;
    # older ones only accept `sparse`. Try the new spelling first.
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # NOTE(review): winsorize is applied with its default limits, which appears
    # to leave the values unchanged - confirm, and pass limits through kw_args
    # if clipping of outliers is actually intended.
    OutlierWinsorize = FunctionTransformer(winsorize, validate=True)

    cat_pipe = Pipeline(steps=[('encoder', encoder)])
    num_pipe = Pipeline(steps=[('wisorization', OutlierWinsorize)])
    ctrans = ColumnTransformer(transformers=[('categorical', cat_pipe, cat), ('numeric', num_pipe, num)])

    # NOTE(review): finalpipe (scaling + KNN imputation) is assembled but never
    # fitted; only the column transformer's output is displayed below. A
    # PCA(n_components=.95) step was also left disabled here.
    finalpipe = Pipeline(steps=[('column_transform', ctrans), ('scaler', StandardScaler()), ('imputer', KNNImputer())])

    x = pd.DataFrame(ctrans.fit_transform(train_attrib))
    display(x)

# Run the preprocessing on the employees dataset; this displays the training
# attributes and then their column-transformed form.
transform('employees.csv')

Unnamed: 0,Gender,Start Date,Bonus %,Senior Management,Team
822,Female,39870.0,7.266,True,Business Development
932,Female,39805.0,2.010,True,Client Services
416,Male,39628.0,5.966,,Distribution
198,Female,33234.0,9.640,True,Product
696,Male,41054.0,11.593,True,Marketing
...,...,...,...,...,...
203,Female,41278.0,2.784,True,Business Development
511,Male,30501.0,8.130,True,Business Development
695,Female,36256.0,5.146,False,Human Resources
643,Male,32049.0,9.770,False,Human Resources


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39870.0,7.266
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39805.0,2.010
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39628.0,5.966
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,33234.0,9.640
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,41054.0,11.593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41278.0,2.784
796,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30501.0,8.130
797,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,36256.0,5.146
798,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,32049.0,9.770
