# Common Utilities Code:

In [1]:
# imports
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

**Data Preprocessing:**

In [2]:
from sklearn.model_selection import train_test_split

def data_splitter(df, train_size, stratification_column, seed=42):
    """
    Split a dataframe into a train part and a test part.

    Parameters
    ----------
    df : the dataframe to split
    train_size : forwarded to train_test_split as train_size
    stratification_column : name of the column whose values are used to
        stratify the split, or None for a non-stratified split
    seed : forwarded to train_test_split as random_state, 42 by default

    Returns
    ----------
    The train dataframe and the test dataframe, both with a fresh 0..n-1 index
    """

    # Stratification labels are the values of the named column; None means
    # no stratification (which is also train_test_split's own default).
    strata = None if stratification_column is None else df[stratification_column]

    parts = train_test_split(df, train_size=train_size, random_state=seed, stratify=strata)

    # Discard the original row labels so both parts are indexed from 0
    df_train, df_test = (part.reset_index(drop=True) for part in parts)

    return df_train, df_test

In [3]:
def find_unique_identifiers(df, dtype='float'):
    """
    Find identifier columns in entire dataset

    A column is reported as an identifier when its data type is not `dtype`,
    it has at least one non-null value, and every non-null value is distinct.

    Parameters
    ----------
    df : dataframe which is combination of train-validation-test
    dtype : the data type identifiers cannot have, 'float' by default
            i.e., if a feature has this data type, it cannot be an identifier

    Returns
    ----------
    The dataframe of identifiers (a column subset of df, in df's column order)
    """

    id_columns = []

    for var in df.columns:
        column = df[var]

        # Columns of the excluded data type can never be identifiers.
        # NOTE(review): this is an exact dtype comparison, so with the default
        # 'float' other float widths (e.g. float32) are presumably not
        # excluded - confirm whether that is intended.
        if not (column.dtype != dtype):
            continue

        n_present = column.notnull().sum()

        # A column with no values at all trivially has "all values unique"
        # (0 == 0); it identifies nothing, so it is skipped.
        if n_present == 0:
            continue

        # Every non-null value must be distinct
        if column.nunique(dropna=True) == n_present:
            id_columns.append(var)

    return df[id_columns]

In [4]:
def drop_unique_identifiers(df, df_id):
    """
    Drop from df every column that also appears in df_id.

    Parameters
    ----------
    df : the dataframe to remove identifier columns from (not modified)
    df_id : dataframe whose column labels name the identifiers to drop;
            labels that are absent from df are ignored

    Returns
    ----------
    A new dataframe without the shared columns
    """

    # Membership test instead of a sorted set intersection: sorting fails
    # when column labels mix types (e.g. integers and strings).
    shared_columns = df.columns[df.columns.isin(df_id.columns)]

    return df.drop(columns=shared_columns)

In [5]:
def nan_checker(df):
    """
    Summarise the columns of a dataframe that contain NaN.

    Parameters
    ----------
    df : the dataframe (train+test+validation) to inspect. Other
    representations of missing values (e.g. 'nan', 'n/a') should be replaced
    with np.nan beforehand, since only values recognised by isna() are counted

    Returns
    ----------
    A dataframe with one row per column of df that has at least one NaN,
    holding the column name ('var'), its share of NaN rows ('proportion')
    and its data type ('dtype'), ordered from highest to lowest proportion
    """

    n_rows = df.shape[0]

    # Collect one [name, proportion, dtype] record per column with NaN
    records = []
    for var in df.columns:
        n_missing = df[var].isna().sum()
        if n_missing > 0:
            records.append([var, n_missing / n_rows, df[var].dtype])

    summary = pd.DataFrame(records, columns=['var', 'proportion', 'dtype'])

    # Sort in descending order of the proportion of NaN and renumber the rows
    summary = summary.sort_values(by='proportion', ascending=False)

    return summary.reset_index(drop=True)

In [6]:
from sklearn.impute import SimpleImputer

def impute_data(columns, df_train, df_val, df_test, strategy):
    """
    Impute missing values in the given columns using train statistics.

    Parameters
    ----------
    columns : the columns to impute
    df_train : the training dataframe; the imputer is fitted on it
    df_val : the validation dataframe
    df_test : the test dataframe
    strategy : forwarded to SimpleImputer as strategy

    Returns
    ----------
    df_train, df_val, df_test. The imputed values are assigned into the
    dataframes that were passed in, and those same objects are returned
    """

    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)

    # Fit on the training data only, and impute it in the same step
    df_train[columns] = imputer.fit_transform(df_train[columns])

    # Validation and test data reuse the statistics found on the train data
    for frame in (df_val, df_test):
        frame[columns] = imputer.transform(frame[columns])

    return df_train, df_val, df_test

In [7]:
# https://imbalanced-learn.org/dev/references/generated/imblearn.over_sampling.SMOTEN.html
from imblearn.over_sampling import SMOTEN

def perform_smote(X_train, y_train, strategy, seed=42):
    """
    Over-sample the training data with imblearn's SMOTEN.

    Parameters
    ----------
    X_train : the feature matrix in the training data
    y_train : the target vector in the training data
    strategy : forwarded to SMOTEN as sampling_strategy
    seed : forwarded to SMOTEN as random_state, 42 by default

    Returns
    ----------
    The resampled feature matrix and the resampled target vector
    """

    # The over-sampler
    sampler = SMOTEN(random_state=seed, sampling_strategy=strategy)

    # Augment the training data
    X_smote_train, y_smote_train = sampler.fit_resample(X_train, y_train)

    return X_smote_train, y_smote_train

---------------------

### Evaluation Tools:

- F2 Score
- Confusion Matrix

**F2 Score**

In [8]:
from sklearn.metrics import fbeta_score

def f2_measure(y_true, y_pred):
    """
    Compute the F2 score, i.e. the F-beta score with beta fixed to 2.

    Parameters
    ----------
    y_true : the true target values
    y_pred : the predicted target values

    Returns
    ----------
    The value returned by fbeta_score for beta=2
    """
    f2_score = fbeta_score(y_true, y_pred, beta=2)
    return f2_score

**Confusion Matrix**

In [9]:
from sklearn.metrics import confusion_matrix

def confusion_matrix_plotter(y_true, y_pred, title):
    """
    Plot the confusion matrix of the predictions as an annotated heat map.

    Parameters
    ----------
    y_true : the true target values
    y_pred : the predicted target values
    title : the title shown on the figure

    Returns
    ----------
    None. The figure is displayed with plt.show()
    """
    # Compute the confusion matrix
    counts = confusion_matrix(y_true, y_pred)

    # Draw the matrix as a lightly shaded grid
    fig, ax = plt.subplots(figsize=(7.5, 7.5))
    ax.matshow(counts, cmap=plt.cm.Blues, alpha=0.3)

    # Write each count in the centre of its cell (x is the column, y the row)
    for (row, col), count in np.ndenumerate(counts):
        ax.text(x=col, y=row, s=count, va='center', ha='center', size='xx-large')

    ax.set_xlabel('Predictions', fontsize=18)
    ax.set_ylabel('Actuals', fontsize=18)
    ax.set_title(title, fontsize=18)
    plt.show()

------------------

**Modeling Utils:**

In [10]:
from sklearn.model_selection import PredefinedSplit

def get_train_val_ps(X_train, y_train, X_val, y_val):
    """
    Combine the training and validation data and build the matching
    PredefinedSplit.

    Parameters
    ----------
    X_train : the feature matrix in the training data
    y_train : the target vector in the training data
    X_val : the feature matrix in the validation data
    y_val : the target vector in the validation data

    The targets may be numpy arrays, pandas Series or plain sequences; they
    are converted to numpy arrays and flattened before being combined.

    Return
    ----------
    The feature matrix in the combined training and validation data
    The target vector (1-D) in the combined training and validation data
    PredefinedSplit built from a fold vector that holds -1 for every
    training row and 0 for every validation row
    """

    # Combine the feature matrix in the training and validation data
    X_train_val = np.vstack((X_train, X_val))

    # Combine the target vector in the training and validation data.
    # np.asarray lets Series and lists through, which have no reshape().
    y_train_val = np.concatenate((np.asarray(y_train).reshape(-1),
                                  np.asarray(y_val).reshape(-1)))

    # Get the indices of training and validation data:
    # -1 for each training row followed by 0 for each validation row
    n_train = np.shape(X_train)[0]
    n_val = np.shape(X_val)[0]
    train_val_idxs = np.concatenate((np.full(n_train, -1), np.full(n_val, 0)))

    # The PredefinedSplit
    ps = PredefinedSplit(train_val_idxs)

    return X_train_val, y_train_val, ps

------------

**Pickling Utils**

In [11]:
import pickle

def save_object(pkl_file_path, pkl_file_name, file_to_save):
    """
    Pickle an object to <pkl_file_path>/<pkl_file_name>.pkl

    Parameters
    ----------
    pkl_file_path : the directory to write into (str or path-like); it must
                    already exist. An empty string means the current
                    working directory
    pkl_file_name : the file name without the '.pkl' extension
    file_to_save : the object to pickle
    """
    # os.path.join accepts path-like directories and avoids producing a
    # doubled or leading separator the way manual "/" concatenation does
    target = os.path.join(pkl_file_path, pkl_file_name + ".pkl")

    with open(target, 'wb') as file:
        pickle.dump(file_to_save, file)

In [12]:
import pickle

def load_object(pkl_file_path, pkl_file_name):
    """
    Load the object pickled at <pkl_file_path>/<pkl_file_name>.pkl

    Parameters
    ----------
    pkl_file_path : the directory to read from (str or path-like). An empty
                    string means the current working directory
    pkl_file_name : the file name without the '.pkl' extension

    Returns
    ----------
    The unpickled object
    """
    # os.path.join accepts path-like directories and avoids producing a
    # doubled or leading separator the way manual "/" concatenation does
    source = os.path.join(pkl_file_path, pkl_file_name + ".pkl")

    # WARNING: unpickling can execute arbitrary code - only load files that
    # come from a trusted source.
    with open(source, 'rb') as file:
        pickled_object = pickle.load(file)

    return pickled_object

# References:

- [Utilities](https://github.com/yuxiaohuang/teaching/blob/master/gwu/machine_learning_I/fall_2020/code/utilities/p2_shallow_learning/pmlm_utilities_shallow.ipynb)
- [Regression](https://github.com/yuxiaohuang/teaching/blob/master/gwu/machine_learning_I/fall_2020/code/p2_shallow_learning/p2_c1_data_preprocessing/code_example/regression.ipynb)
- [Classification](https://github.com/yuxiaohuang/teaching/blob/master/gwu/machine_learning_I/fall_2020/code/p2_shallow_learning/p2_c1_data_preprocessing/code_example/classification.ipynb)
- [Tree Models](https://github.com/yuxiaohuang/teaching/blob/master/gwu/machine_learning_I/fall_2020/code/p2_shallow_learning/p2_c2_supervised_learning/p2_c2_s5_tree_based_models/code_example/code_example.ipynb)
- [Logistic Regression](https://github.com/yuxiaohuang/teaching/blob/master/gwu/machine_learning_I/fall_2020/code/p2_shallow_learning/p2_c2_supervised_learning/p2_c2_s3_logistic_regression/case_study/case_study_bcw.ipynb)
- [Imbalanced Classification German Bank](https://machinelearningmastery.com/imbalanced-classification-of-good-and-bad-credit/)