<h3><center> ToolBox: Functions for Future Use </center></h3>

In [4]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
def corr_pval(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Calculate pairwise Spearman correlations and their p-values.

    df: DataFrame holding the data
    columns: names of the columns of `df` to correlate with each other

    Returns a square DataFrame indexed and labelled by `columns`. Every cell
    is a two-element list ``[correlation, p_value]`` with both numbers
    rounded to 3 decimals. The table is symmetric, and each cell owns its
    own list, so editing one cell never changes its mirror cell.
    """
    length = len(columns)
    corr_matrix = np.empty(shape=(length, length), dtype=object)

    for row_index, first_value in enumerate(columns):
        col_vals = df[first_value]
        # Spearman's rho is symmetric, so only the upper triangle (diagonal
        # included) is computed; the lower triangle is filled by mirroring.
        for col_index, second_value in enumerate(columns[row_index:],
                                                 start=row_index):
            corr, pval = spearmanr(col_vals, df[second_value])
            cell = [np.round(corr, 3), np.round(pval, 3)]
            corr_matrix[row_index][col_index] = cell
            if col_index != row_index:
                # Store a copy so the two mirrored cells are independent.
                corr_matrix[col_index][row_index] = list(cell)

    return pd.DataFrame(corr_matrix, columns=columns, index=columns)

def ci_table(weights: dict, columns: list = None) -> pd.DataFrame:
    """Build a table of percentile confidence intervals, one row per variable.

    weights: a dictionary that has independent variables as keys and
    their corresponding bootstrapped samples as values
    columns: optional list of keys of `weights` to report, in the desired
    row order; when omitted, every key of `weights` is reported in the
    dictionary's own order

    Returns a DataFrame that has 4 columns:
    1) Columns: Independent Variables
    2) Lower Percentile (2.5th percentile of the samples)
    3) Upper Percentile (97.5th percentile of the samples)
    4) Median (50th percentile of the samples)

    Raises KeyError if an entry of `columns` is not a key of `weights`.
    """
    # The variable names come from the arguments themselves, so the function
    # does not depend on any variable defined elsewhere in the session.
    all_columns = list(weights) if columns is None else list(columns)

    lower = []
    upper = []
    median = []
    for name in all_columns:
        # np.percentile does not need pre-sorted input; one call yields all
        # three statistics for this variable.
        low, high, mid = np.percentile(weights[name], [2.5, 97.5, 50])
        lower.append(low)
        upper.append(high)
        median.append(mid)

    return pd.DataFrame({'Columns': all_columns,
                         'Lower Percentile': lower,
                         'Upper Percentile': upper,
                         'Median': median})

def ci_visualize(df: pd.DataFrame, figsize=(9, 3)):
    """Plot one horizontal confidence-interval bar per regression weight.

    df: DataFrame that has 4 columns:
    1) Columns: Independent Variables
    2) Lower Percentile
    3) Upper Percentile
    4) Median
    figsize: size handed to plt.figure

    Each row of `df` becomes a horizontal segment from its lower to its
    upper percentile with a dot at the median. A row is drawn in black when
    one of its bounds equals zero or its bounds lie on opposite sides of
    zero, and in red otherwise. A dashed gray vertical line marks zero.
    The figure is displayed with plt.show(); nothing is returned.
    """
    sns.set_style('ticks')
    plt.figure(figsize=figsize)

    row_positions = range(len(df))
    intervals = zip(df['Lower Percentile'], df['Upper Percentile'], df['Median'])
    for row_pos, (low, high, mid) in enumerate(intervals):
        includes_zero = (low == 0 or high == 0) or (low < 0 and high > 0)
        # Only the colour differs between the two cases.
        shade = 'black' if includes_zero else 'red'
        plt.plot((low, high), (row_pos, row_pos), '-|', color=shade)
        plt.scatter(mid, row_pos, color=shade, s=15)

    plt.yticks(row_positions, list(df['Columns']))
    plt.axvline(0, marker='|', linestyle='dashed', markersize=5, linewidth=1, color='gray')
    plt.xlabel('Regression Weights', fontsize=13)
    plt.title('Confidence Intervals', fontsize=15)
    sns.despine()
    plt.show()

In [None]:
def split_normalize(data: pd.DataFrame, predictor_columns: list,
                    dependent_col: str, date_col='month_year',
                    normalize='standardize', split_ratio=0.20,
                    random_state=None):
    """
    Splits the data and then normalizes it according to normalization type
    Available types: 'standardize', 'time-window-standardize'

    data: DataFrame containing the predictors, the dependent variable and
    the date column
    predictor_columns: names of the independent-variable columns
    dependent_col: name of the dependent-variable column
    date_col: name of the column whose unique values define the time windows
    normalize: 'standardize' fits one scaler on the whole training split;
    'time-window-standardize' fits a separate scaler per time window
    split_ratio: passed to train_test_split as test_size
    random_state: passed to train_test_split; set it for a reproducible split

    Returns (Xtrain, Xtest, ytrain, ytest). Scalers are always fitted on
    training rows only and then applied to the matching test rows.
    With 'standardize', ytrain/ytest are single-column DataFrames named
    `dependent_col`. With 'time-window-standardize', ytrain/ytest are Series
    and the rows of all four outputs are grouped window by window, in the
    order the windows first appear in `data`; rows whose `date_col` value
    matches no window (e.g. NaN) are left out.

    Raises ValueError for an unknown `normalize` value, or when a time
    window has test rows but no training rows to fit a scaler on.
    """
    # Explicit raise instead of assert: asserts are removed under `python -O`.
    if normalize not in ('standardize', 'time-window-standardize'):
        raise ValueError("Available normalization types: 'standardize', 'time-window-standardize'")

    predictor_columns = list(predictor_columns)
    X = data[predictor_columns + [date_col]]
    y = data[dependent_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=split_ratio, random_state=random_state)

    if normalize == 'standardize':
        X_scaler = StandardScaler()
        y_scaler = StandardScaler()
        # Scale the predictors only; the date column is not a feature.
        Xtrain = pd.DataFrame(X_scaler.fit_transform(X_train[predictor_columns]),
                              columns=predictor_columns)
        Xtest = pd.DataFrame(X_scaler.transform(X_test[predictor_columns]),
                             columns=predictor_columns)
        # StandardScaler needs 2-D input, so y is reshaped to one column.
        ytrain = pd.DataFrame(y_scaler.fit_transform(y_train.to_numpy().reshape(-1, 1)),
                              columns=[dependent_col])
        ytest = pd.DataFrame(y_scaler.transform(y_test.to_numpy().reshape(-1, 1)),
                             columns=[dependent_col])
        return Xtrain, Xtest, ytrain, ytest

    # 'time-window-standardize': collect the per-window pieces and
    # concatenate once at the end.
    X_train_parts, X_test_parts = [], []
    y_train_parts, y_test_parts = [], []

    for window in data[date_col].unique():
        train_mask = (X_train[date_col] == window).to_numpy()
        test_mask = (X_test[date_col] == window).to_numpy()

        if not train_mask.any():
            if test_mask.any():
                raise ValueError(
                    f"Time window {window!r} has test rows but no training "
                    "rows, so no scaler can be fitted for it")
            # Nothing falls into this window at all.
            continue

        X_scaler = StandardScaler()
        y_scaler = StandardScaler()

        X_train_parts.append(pd.DataFrame(
            X_scaler.fit_transform(X_train.loc[train_mask, predictor_columns]),
            columns=predictor_columns))
        y_train_parts.append(pd.Series(
            y_scaler.fit_transform(
                y_train.loc[train_mask].to_numpy().reshape(-1, 1)).ravel()))

        # A random split can leave a window without test rows; there is
        # then nothing to transform for it.
        if test_mask.any():
            X_test_parts.append(pd.DataFrame(
                X_scaler.transform(X_test.loc[test_mask, predictor_columns]),
                columns=predictor_columns))
            y_test_parts.append(pd.Series(
                y_scaler.transform(
                    y_test.loc[test_mask].to_numpy().reshape(-1, 1)).ravel()))

    def _stack(parts, empty):
        # pd.concat rejects an empty list, so fall back to an empty object.
        return pd.concat(parts, ignore_index=True) if parts else empty

    Xtrain = _stack(X_train_parts, pd.DataFrame(columns=predictor_columns))
    Xtest = _stack(X_test_parts, pd.DataFrame(columns=predictor_columns))
    ytrain = _stack(y_train_parts, pd.Series(dtype='float64'))
    ytest = _stack(y_test_parts, pd.Series(dtype='float64'))

    return Xtrain, Xtest, ytrain, ytest