In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import yeojohnson
from scipy.stats import skew
import matplotlib.gridspec as gridspec
import matplotlib.image as mpimg
from scipy.stats import normaltest

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_validate, learning_curve, RepeatedKFold
from sklearn.linear_model import BayesianRidge
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, RobustScaler, StandardScaler
from sklearn.linear_model import BayesianRidge, Lasso
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import shapiro
from mapie.regression import MapieRegressor
from mapie.metrics import regression_coverage_score


import scipy.stats as ss
from scipy.stats import boxcox

import math 
import numpy as np
from scipy import stats

import os
import re
import timeit
import re

import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline

import pickle

from mapie.metrics import regression_coverage_score

# Snapshot of matplotlib's configured default figure size.
fig_size = plt.rcParams["figure.figsize"]

# Show floats with nine decimal places when pandas renders a frame.
pd.set_option('display.float_format', '{:.9f}'.format)

# Raise pandas' display limits so wide/long frames are not truncated.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [3]:
# Install a blanket 'ignore' filter so no warnings are shown in cell output.
# NOTE(review): the imports cell already imports `warnings` and installs the
# same filter, so this cell looks redundant — confirm and consider removing it.
import warnings
warnings.filterwarnings('ignore')

# Functions

In [6]:
def fix_col_names(df):
    """
    Rewrite the column labels of ``df`` as lowercase, underscore-separated names.

    Each label is lowercased and every space in it is replaced by an
    underscore. The labels are reassigned on ``df`` itself (the frame is
    modified in place) and that same frame is returned.
    """
    lowered = df.columns.str.lower()
    df.columns = lowered.str.replace(' ', '_')
    return df

In [7]:
def summary(df, data_type='numeric'):
    '''
    Calculate descriptive statistics for the numeric columns of a DataFrame.

        Args:
            df: DataFrame. Only columns of dtype int64 or float64 are summarised;
                every other column is ignored.
            data_type: kept for backward compatibility but currently unused --
                numeric columns are summarised whatever value is passed.
        Returns:
            DataFrame with one row per summarised column and the columns
            Variable, Min, Median, Mean, Variance, Max, Std, Kurtosis,
            Skewness, N_Zeros, N_Nulls and Count.

        Side effect:
            prints the shape of the numeric subset that was summarised.
    '''
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    numeric = df[numeric_cols]

    def per_column(func):
        # Evaluate ``func`` on every numeric column; one result per column.
        return list(numeric.apply(func, axis=0).values)

    table = {
        "Variable": [name.title() for name in numeric_cols],
        "Min": per_column(np.min),
        # Series.median skips nulls. The previous np.median call returned NaN
        # for any column containing a null, unlike the neighbouring Min/Mean/Max.
        "Median": per_column(lambda col: col.median()),
        "Mean": per_column(np.mean),
        "Variance": per_column(np.var),
        "Max": per_column(np.max),
        "Std": per_column(np.std),
        "Kurtosis": per_column(lambda col: col.kurt()),
        "Skewness": per_column(lambda col: col.skew()),
        # NaN counts as non-zero here, so nulls are not reported as zeros.
        "N_Zeros": per_column(lambda col: len(col) - np.count_nonzero(col)),
        "N_Nulls": per_column(lambda col: np.count_nonzero(np.isnan(col))),
        # Count is the column length, i.e. it includes null entries.
        "Count": per_column(len),
    }
    print ("Numeric Variables Dataset Shape: "+ str(numeric.shape))
    return pd.DataFrame(table)

In [8]:
def display_descriptive_stats(df_in, col_ls, nfeats_per_group=25):
    """
    Display descriptive statistics for the given columns as side-by-side tables.

    The columns are processed in consecutive groups of ``nfeats_per_group``.
    For each group the statistics come from ``summary`` and are shown as three
    styled tables in one row.

    Parameters:
    df_in (pd.DataFrame): The input DataFrame containing the data to describe.
    col_ls (list): Column names for which the statistics are calculated and displayed.
    nfeats_per_group (int, optional): Number of columns per displayed group. Default is 25.

    Returns:
    None: The function displays the results and does not return any value.

    The function performs the following steps:
    1. Prints the number of columns to analyse, the number of groups and the
       group size.
    2. For each group, computes the statistics and splits them into three tables:
       - Variable, Min, Median, Mean, Max, Kurtosis and Skewness.
       - Std.
       - N_Zeros, N_Nulls and Count.
    3. Formats each table with the pandas Styler (two decimals, thousands
       separator, a row-wise colour gradient) and renders them next to each other.
    """
    # Imported explicitly so the function does not depend on the ``display``
    # name that IPython injects into notebook namespaces.
    from IPython.display import HTML, display

    def horizontal(styled_tables):
        # Wrap each table's HTML in a flex row so the tables sit side by side.
        parts = ['<div style="display:flex">']
        for table in styled_tables:
            parts.append('<div style="margin-right: 32px">')
            parts.append(table.to_html())
            parts.append('</div>')
        parts.append('</div>')
        display(HTML(''.join(parts)))

    print('cols cnt for analysis:  ', len(col_ls))

    n_groups = int(np.ceil(len(col_ls) / nfeats_per_group))
    # Report the number of groups (the range object itself used to be printed).
    print('loop cnt: ', n_groups)
    print('feature count per table: ', nfeats_per_group)

    for group_idx in range(n_groups):
        start = group_idx * nfeats_per_group
        tbl_cols = col_ls[start:start + nfeats_per_group]

        df_num_stats = summary(df_in[tbl_cols])

        df_stat_points = df_num_stats[['Variable', 'Min', 'Median', 'Mean', 'Max', 'Kurtosis', 'Skewness']]
        df_stat_std = df_num_stats[['Std']]
        df_stat_cnts = df_num_stats[['N_Zeros', 'N_Nulls', 'Count']]

        # Gradients are applied per row (axis=1). Putting all values on one
        # shared scale has no direct Styler option; see
        # https://stackoverflow.com/questions/38931566/pandas-style-background-gradient-both-rows-and-columns
        # A log-scaled colouring would be another way to damp outliers.
        horizontal([
            df_stat_points.style.format(precision=2, thousands=',')
                .background_gradient(cmap='PuBu', axis=1),
            df_stat_std.style.format(precision=2, thousands=',')
                .hide(axis="index")
                .background_gradient(cmap='inferno', axis=1),
            df_stat_cnts.style.format(precision=2, thousands=',')
                .hide(axis="index")
                .background_gradient(cmap='BuGn', axis=1),
        ])



In [9]:
def check_normality_and_style(df, num_ls):
    """
    Checks the normality of columns in a DataFrame and returns a styled DataFrame
    highlighting columns that are not normally distributed.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    num_ls (list): List of column names to check for normality.

    Returns:
    pd.io.formats.style.Styler: A styled DataFrame with normality test results.
    """
    # Create a list to store the results
    results = []

    # Loop through the columns and perform the normality test
    for i in num_ls:
        stat, p_value = normaltest(df[i].values)
        label = "Not Normal" if p_value >= 0.05 else "Normal"
        results.append([i, label, stat, p_value])

    # Convert the results list to a DataFrame
    results_df = pd.DataFrame(results, columns=['Column', 'Normality', 'Statistic', 'p-value'])

    # Function to highlight Gaussian rows
    def highlight_gaussian(row):
        color = 'lightblue' if row['Normality'] == 'Not Normal' else ''
        return ['background-color: {}'.format(color) for _ in row]

    # Apply the highlighting function to the DataFrame
    styled_results_df = results_df.style.apply(highlight_gaussian, axis=1)

    return styled_results_df

In [10]:
def plot_jointplots(df, num_ls, target):
    """
    For each listed feature, show two regression joint plots against the
    target side by side: the original feature and its Yeo-Johnson transform.

    Each joint plot is drawn as its own seaborn figure, saved as a PNG, and
    the two PNGs are then shown as images in one two-panel matplotlib figure.
    The PNGs are written to a temporary directory that is removed
    automatically, even if plotting fails part-way, so nothing is left in the
    working directory.

    If the transform or its plot raises, the exception is printed and the
    right-hand panel shows the text 'Transformation Error' instead.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    num_ls (list): List of numerical column names to plot.
    target (str): The target variable name.

    Returns:
    None: Figures are displayed as a side effect.
    """
    import tempfile  # local import: only needed by this function

    for variable in num_ls:
        print("_" * 30)
        print(variable)

        with tempfile.TemporaryDirectory() as tmp_dir:
            raw_png = os.path.join(tmp_dir, 'g0.png')
            transformed_png = os.path.join(tmp_dir, 'g1.png')

            # Joint plot of the untransformed feature.
            g0 = sns.jointplot(x=df[variable], y=df[target], kind='reg')
            try:
                g0.savefig(raw_png)
            finally:
                # Close the figure even if saving fails.
                plt.close(g0.fig)

            # Joint plot of the Yeo-Johnson transformed feature (best effort).
            g1 = None
            transformed_ok = False
            try:
                transformed_variable, _ = yeojohnson(df[variable])
                g1 = sns.jointplot(x=transformed_variable, y=df[target], kind='reg')
                g1.savefig(transformed_png)
                transformed_ok = True
            except Exception as e:
                print(e)
            finally:
                if g1 is not None:
                    plt.close(g1.fig)

            # Compose the two saved images into one figure.
            f, axarr = plt.subplots(ncols=2, sharey=False, figsize=(25, 30))
            axarr[0].imshow(mpimg.imread(raw_png))
            if transformed_ok:
                axarr[1].imshow(mpimg.imread(transformed_png))
            else:
                axarr[1].text(0.5, 0.5, 'Transformation Error',
                              horizontalalignment='center',
                              verticalalignment='center')

            # The panels are images, so hide the axes.
            for ax in axarr.ravel():
                ax.set_axis_off()

            plt.tight_layout()
            plt.show()
            # Release the composite figure so repeated iterations do not
            # accumulate open figures.
            plt.close(f)