# References:

- [Utilities](https://github.com/yuxiaohuang/teaching/blob/master/gwu/machine_learning_I/fall_2020/code/utilities/p2_shallow_learning/pmlm_utilities_shallow.ipynb)
- [Regression](https://github.com/yuxiaohuang/teaching/blob/master/gwu/machine_learning_I/fall_2020/code/p2_shallow_learning/p2_c1_data_preprocessing/code_example/regression.ipynb)
- [Classification](https://github.com/yuxiaohuang/teaching/blob/master/gwu/machine_learning_I/fall_2020/code/p2_shallow_learning/p2_c1_data_preprocessing/code_example/classification.ipynb)

In [1]:
# imports
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

### Data Preprocessing:

In [2]:
def data_splitter(df, train_size, stratification_column, seed=42):
    """
    Split a dataframe into a training part and a test part.

    Parameters
    ----------
    df : the dataframe to split
    train_size : passed to train_test_split as train_size
    stratification_column : the name of the column to stratify on,
                            or None for a non-stratified split
    seed : passed to train_test_split as random_state, 42 by default

    Returns
    ----------
    The tuple (df_train, df_test), each with its index reset to 0..n-1
    """

    # stratify=None is what train_test_split uses when no stratification is requested
    strata = None if stratification_column is None else df[stratification_column]

    parts = train_test_split(df, train_size=train_size, random_state=seed, stratify=strata)

    # Discard the shuffled original row labels in both parts
    df_train, df_test = (part.reset_index(drop=True) for part in parts)

    return df_train, df_test

In [3]:
def data_split_view_distribution(df_train, df_val, df_test, target_column):
    """
    Placeholder for plotting the target distribution of each split (not implemented yet).

    Parameters
    ----------
    df_train : the dataframe of training data
    df_val : the dataframe of validation data
    df_test : the dataframe of test data
    target_column : the name of the target column

    Returns
    ----------
    None, the body is currently a no-op
    """
    # TODO: take the train/validation/test dataframes and plot the distribution of the
    # target as subplots, so the effect of stratification can be compared across splits
    # if discrete, then plot barcharts
    # if continuous, then plot histogram
    pass

In [4]:
def common_var_checker(df_train, df_val, df_test, target):
    """
    Find the variables shared by the training, validation and test data.

    Parameters
    ----------
    df_train : the dataframe of training data
    df_val : the dataframe of validation data
    df_test : the dataframe of test data
    target : the name of the target

    Returns
    ----------
    A dataframe with a single column 'common_var' listing the common variables.
    The target is treated as present in the test data even when df_test
    does not contain it.
    """

    # Variables present in both the training and the validation data
    train_val_vars = np.intersect1d(df_train.columns, df_val.columns)

    # The target is added on the test side so it is not filtered out
    # when the test data comes without a target column
    test_vars = np.union1d(df_test.columns, [target])

    common_vars = np.intersect1d(train_val_vars, test_vars)

    return pd.DataFrame(common_vars, columns=['common_var'])

In [5]:
def remove_uncommon_features(df, df_common_var):
    """
    Drop the columns of df that are not listed as common variables.

    Parameters
    ----------
    df : the dataframe to filter
    df_common_var : the dataframe with a 'common_var' column
                    (as returned by common_var_checker)

    Returns
    ----------
    A new dataframe holding only the columns of df named in df_common_var['common_var']
    """

    # Columns of df that do not appear among the common variables
    to_drop = np.setdiff1d(df.columns, df_common_var['common_var'])

    return df.drop(columns=to_drop)

In [6]:
def find_unique_identifiers(df, dtype='float'):
    """
    Find the identifier-like columns of a dataframe.

    A column is reported when its data type differs from dtype and every
    non-null value in it is distinct.

    Parameters
    ----------
    df : dataframe which is combination of train-validation-test
    dtype : the data type identifiers cannot have, 'float' by default
            i.e., if a feature has this data type, it cannot be an identifier

    Returns
    ----------
    The sub-dataframe of df made of the identifier columns
    """

    id_vars = []

    for var in df.columns:
        column = df[var]

        # Columns of the excluded data type are never identifiers
        if column.dtype != dtype:
            # As many distinct values as non-null entries: unique for each sample
            if column.nunique(dropna=True) == column.notnull().sum():
                id_vars.append(var)

    return df[id_vars]

In [7]:
def drop_unique_identifiers(df, df_id):
    """
    Drop the identifier columns from a dataframe.

    Parameters
    ----------
    df : the dataframe to filter
    df_id : the dataframe of identifiers (as returned by find_unique_identifiers)

    Returns
    ----------
    A new dataframe without the columns of df_id; identifier columns
    that df does not contain are ignored
    """
    # Only columns that actually exist in df can be dropped
    present_ids = np.intersect1d(df_id.columns, df.columns)

    return df.drop(columns=present_ids)

In [8]:
def datetime_transformer(df, datetime_vars):
    """
    Expand datetime variables into their calendar and clock components.

    Parameters
    ----------
    df : the dataframe
    datetime_vars : the datetime variables as list

    Returns
    ----------
    A copy of df where each variable in datetime_vars is replaced by 6 new variables
    named <var>_year, <var>_month, <var>_day, <var>_hour, <var>_minute and <var>_second
    """

    # The datetime components to extract, in the order the new variables are added
    components = ('year', 'month', 'day', 'hour', 'minute', 'second')

    # Work on a deep copy so the caller's dataframe is left untouched
    df_datetime = df.copy(deep=True)

    for var in datetime_vars:
        # Cast the variable to datetime
        df_datetime[var] = pd.to_datetime(df_datetime[var])
        casted = df_datetime[var]

        # Add one new variable per component, read through the .dt accessor
        for component in components:
            df_datetime[var + '_' + component] = getattr(casted.dt, component)

    # The original datetime variables are no longer needed
    return df_datetime.drop(columns=datetime_vars)

In [9]:
def nan_checker(df):
    """
    Report the variables of a dataframe that contain NaN.

    Parameters
    ----------
    df : the dataframe (train + test + validation) to inspect. Other representations
         of missing values (e.g. 'nan', 'n/a') should be replaced with np.nan beforehand

    Returns
    ----------
    A dataframe with columns 'var', 'proportion' and 'dtype': one row per variable
    with at least one NaN, sorted from the largest to the smallest proportion of NaN
    """

    n_rows = df.shape[0]

    # One [var, proportion, dtype] record per variable with NaN
    records = []
    for var in df.columns:
        n_missing = df[var].isna().sum()
        if n_missing > 0:
            records.append([var, n_missing / n_rows, df[var].dtype])

    df_nan = pd.DataFrame(records, columns=['var', 'proportion', 'dtype'])

    # Variables with the highest proportion of NaN come first
    df_nan = df_nan.sort_values(by='proportion', ascending=False)

    return df_nan.reset_index(drop=True)

In [10]:
def combine_dfs(dfs:list):
    """
    Combine a list of dataframes into one dataframe, stacking them row-wise.

    Parameters
    ----------
    dfs : the list of dataframes to combine, in the order they should be stacked

    Returns
    ----------
    The combined dataframe. Columns missing from some of the inputs are kept
    (and filled with NaN for those inputs); the columns are not sorted.
    """
    # dfs is already the sequence pd.concat expects. Wrapping it in another list
    # would hand concat a list object instead of dataframes and raise a TypeError.
    return pd.concat(dfs, sort=False)

In [11]:
def separate_dfs(combined_df, train_len, val_len):
    """
    Split a combined dataframe back into its training, validation and test parts.

    Parameters
    ----------
    combined_df : the combined dataframe; assumes the dataframes were combined
                  in the following order: train, validation, test
    train_len : the number of rows of the training data
    val_len : the number of rows of the validation data

    Returns
    ----------
    The tuple (df_train, df_val, df_test); the test data is every row
    after the first train_len + val_len rows
    """
    # Position where the validation rows end and the test rows begin
    val_end = train_len + val_len

    # Slice by position (iloc) so the result does not depend on the index labels
    df_train = combined_df.iloc[:train_len, :]
    df_val = combined_df.iloc[train_len:val_end, :]
    df_test = combined_df.iloc[val_end:, :]

    return df_train, df_val, df_test

In [12]:
def remove_missing_values(df_nan, df_train, df_val, df_test):
    """
    Remove the rows that have NaN in a float64 variable.

    Parameters
    ----------
    df_nan : the output of nan_checker(..), a dataframe with columns: var, proportion, dtype
    df_train : the dataframe of training data
    df_val : the dataframe of validation data
    df_test : the dataframe of test data

    Returns
    ----------
    The tuple (df_train, df_val, df_test) without the rows where any float64 variable
    listed in df_nan is NaN. When df_nan lists no float64 variable the three
    dataframes are returned unchanged, so the result can always be unpacked.
    """

    # Only float64 variables are considered; NaN in other variables (e.g. strings) is kept
    df_miss = df_nan[df_nan['dtype'] == 'float64'].reset_index(drop=True)

    # Nothing to remove: hand the data back as is instead of returning None,
    # matching what impute_missing_values does in the same situation
    if len(df_miss['var']) == 0:
        return df_train, df_val, df_test

    def _drop_rows(frame):
        # Restrict to the variables the frame actually has, otherwise dropna raises a KeyError
        subset = np.intersect1d(df_miss['var'], frame.columns)
        return frame.dropna(subset=subset, inplace=False)

    return _drop_rows(df_train), _drop_rows(df_val), _drop_rows(df_test)

In [13]:
def impute_missing_values(df_nan, df_train, df_val, df_test, strategy="mean"):
    """
    Impute the NaN of the float64 variables with a SimpleImputer.

    The imputer is fitted on the training data only and then applied to the
    validation and test data. The three dataframes are modified in place.

    Parameters
    ----------
    df_nan : the output of nan_checker(..), a dataframe with columns: var, proportion, dtype
    df_train : the dataframe of training data
    df_val : the dataframe of validation data
    df_test : the dataframe of test data
    strategy : the SimpleImputer strategy, "mean" by default

    Returns
    ----------
    The tuple (df_train, df_val, df_test) after imputation
    """

    # Only float64 variables are imputed; other variables (e.g. strings) are left alone
    df_miss = df_nan[df_nan['dtype'] == 'float64'].reset_index(drop=True)
    miss_vars = df_miss['var']

    # No float64 variable has missing values: nothing to impute
    if len(miss_vars) == 0:
        return df_train, df_val, df_test

    # The SimpleImputer
    si = SimpleImputer(missing_values=np.nan, strategy=strategy)

    # Learn the fill values from the training data
    df_train[miss_vars] = si.fit_transform(df_train[miss_vars])

    # Reuse the training fill values for the validation and test data
    for frame in (df_val, df_test):
        frame[miss_vars] = si.transform(frame[miss_vars])

    return df_train, df_val, df_test

In [14]:
def cat_var_checker(df, dtype='object'):
    """
    Report the categorical variables of a dataframe.

    Parameters
    ----------
    df : the dataframe (train + test + validation)
    dtype : the data type categorical variables should have, 'object' by default
            i.e., if a variable has this data type, it should be a categorical variable

    Returns
    ----------
    A dataframe with columns 'var' and 'nunique': one row per categorical variable,
    sorted from the largest to the smallest number of unique values (NaN counts as a value)
    """

    # One [var, number of unique values] record per variable of the requested data type
    records = []
    for var in df.columns:
        column = df[var]
        if column.dtype == dtype:
            records.append([var, column.nunique(dropna=False)])

    df_cat = pd.DataFrame(records, columns=['var', 'nunique'])

    # Variables with the most unique values come first
    df_cat = df_cat.sort_values(by='nunique', ascending=False)

    return df_cat.reset_index(drop=True)

In [15]:
def perform_one_hot_encoding(df, df_cat, target):
    """
    One-hot encode the categorical features of a dataframe.

    Parameters
    ----------
    df : the dataframe (train + test + validation)
    df_cat : the dataframe of categorical variables, obtained from cat_var_checker
    target : the name of the target, which is never encoded

    Returns
    ----------
    The dataframe where every categorical variable except the target
    is replaced by its dummy columns
    """
    # The target stays as a single column even when it is categorical
    cat_features = np.setdiff1d(df_cat['var'], [target])

    return pd.get_dummies(df, columns=cat_features)

### Scaling the data:

In [16]:
def split_features_and_targets(df_train, df_val, df_test, target):
    """
    Split each dataframe into a feature matrix and a target vector.

    Parameters
    ----------
    df_train : the dataframe of training data
    df_val : the dataframe of validation data
    df_test : the dataframe of test data
    target : the name of the target

    Returns
    ----------
    The numpy arrays X_train, y_train, X_val, y_val, X_test, y_test.
    The feature columns are ordered by sorted column name (np.setdiff1d sorts its result).
    """

    def _features_and_target(frame):
        # Every column but the target is a feature
        feature_names = np.setdiff1d(frame.columns, [target])
        return frame[feature_names].values, frame[target].values

    X_train, y_train = _features_and_target(df_train)
    X_val, y_val = _features_and_target(df_val)
    X_test, y_test = _features_and_target(df_test)

    return X_train, y_train, X_val, y_val, X_test, y_test

#### Min-Max Scaler

In [17]:
def perform_min_max_scaler(X_train, y_train, X_val, y_val, X_test, y_test,  classification_problem=True):
    """
    Normalize the data with a MinMaxScaler fitted on the training data.

    Parameters
    ----------
    X_train, y_train, X_val, y_val, X_test, y_test :
        the np arrays formed using split_features_and_targets(...)
    classification_problem : True by default
        - True means the target will not be normalized (it is returned as given)
        - False means the target will be normalized as well

    Returns
    ----------
    The arrays X_mms_train, y_mms_train, X_mms_val, y_mms_val, X_mms_test, y_mms_test
    """
    # Normalize the features; the statistics come from the training data only
    x_scaler = MinMaxScaler()
    X_mms_train = x_scaler.fit_transform(X_train)
    X_mms_val = x_scaler.transform(X_val)
    X_mms_test = x_scaler.transform(X_test)

    # Class labels must stay untouched. Returning them here also guarantees the
    # target outputs are always defined (they used to be unbound in this case).
    if classification_problem:
        return X_mms_train, y_train, X_mms_val, y_val, X_mms_test, y_test

    # Regression: normalize the target with its own scaler, again fitted on training data.
    # The scaler works on 2D input, hence the reshape to a column and back to a vector.
    y_scaler = MinMaxScaler()
    y_mms_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).reshape(-1)
    y_mms_val = y_scaler.transform(y_val.reshape(-1, 1)).reshape(-1)
    y_mms_test = y_scaler.transform(y_test.reshape(-1, 1)).reshape(-1)

    return X_mms_train, y_mms_train, X_mms_val, y_mms_val, X_mms_test, y_mms_test

#### Standardization Scaler

In [18]:
def perform_standard_scaler(X_train, y_train, X_val, y_val, X_test, y_test,  classification_problem=True):
    """
    Standardize the data with a StandardScaler fitted on the training data.

    Parameters
    ----------
    X_train, y_train, X_val, y_val, X_test, y_test :
        the np arrays formed using split_features_and_targets(...)
    classification_problem : True by default
        - True means the target will not be standardized (it is returned as given)
        - False means the target will be standardized as well

    Returns
    ----------
    The arrays X_ss_train, y_ss_train, X_ss_val, y_ss_val, X_ss_test, y_ss_test
    """
    # Standardize the features; the statistics come from the training data only
    x_scaler = StandardScaler()
    X_ss_train = x_scaler.fit_transform(X_train)
    X_ss_val = x_scaler.transform(X_val)
    X_ss_test = x_scaler.transform(X_test)

    # Class labels must stay untouched. Returning them here also guarantees the
    # target outputs are always defined (they used to be unbound in this case).
    if classification_problem:
        return X_ss_train, y_train, X_ss_val, y_val, X_ss_test, y_test

    # Regression: standardize the target with its own scaler, again fitted on training data.
    # The scaler works on 2D input, hence the reshape to a column and back to a vector.
    y_scaler = StandardScaler()
    y_ss_train = y_scaler.fit_transform(y_train.reshape(-1, 1)).reshape(-1)
    y_ss_val = y_scaler.transform(y_val.reshape(-1, 1)).reshape(-1)
    y_ss_test = y_scaler.transform(y_test.reshape(-1, 1)).reshape(-1)

    return X_ss_train, y_ss_train, X_ss_val, y_ss_val, X_ss_test, y_ss_test

### Handle Class imbalance

#### Oversampling

In [19]:
def oversampler(X_train, y_train, random_seed=42):
    """
    Resample the training data with a RandomOverSampler.

    Parameters
    ----------
    X_train : the training feature matrix
    y_train : the training target vector
    random_seed : passed to RandomOverSampler as random_state, 42 by default

    Returns
    ----------
    The resampled pair (X_ros_train, y_ros_train)
    """
    sampler = RandomOverSampler(random_state=random_seed)

    # Only the training data is resampled
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    return X_resampled, y_resampled

#### SMOTE

In [20]:
def smote(X_train, y_train, random_seed=42):
    """
    Resample the training data with SMOTE.

    Parameters
    ----------
    X_train : the training feature matrix
    y_train : the training target vector
    random_seed : passed to SMOTE as random_state, 42 by default

    Returns
    ----------
    The resampled pair (X_smote_train, y_smote_train)
    """
    # Named 'sampler' so the local does not shadow this function's own name
    sampler = SMOTE(random_state=random_seed)

    # Augment the training data
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

    return X_resampled, y_resampled

### Visualizations

#### Bar Graphs