In [None]:
import graphviz
import itertools
import matplotlib.pyplot as plt # graphing with insane defaults
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import seaborn as sns # graphing with sane defaults
import scipy.stats as stats
from sklearn import linear_model
from sklearn import preprocessing # Preprocess data (e.g. scale numerical data to 0-1)
from sklearn import tree
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor, HistGradientBoostingRegressor, IsolationForest, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.gaussian_process import GaussianProcessClassifier
import sklearn.metrics as metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, PrecisionRecallDisplay
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.utils._testing import ignore_warnings
from termcolor import colored, cprint
import typing # Apply common types to objects
import warnings
from yellowbrick.classifier.rocauc import roc_auc
from yellowbrick.classifier import precision_recall_curve
from yellowbrick.features import PCA as yellowPCA, Manifold
from yellowbrick.regressor import ResidualsPlot, PredictionError

# Suppress every FutureWarning for the rest of the process.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Module-level name used in annotations written as `feature_engineering`
# (e.g. `fe : feature_engineering` in rabbitml.eda). A nested class body cannot
# see names bound in the enclosing class body, so such annotations are looked
# up in the module globals and resolve to this TypeVar.
feature_engineering = typing.TypeVar('rabbitml.feature_engineering')

class rabbitml:
    """
    An automl library designed for tabular data

    @Taran Sean Marley 
    https://www.kaggle.com/taranmarley
    """
    class feature_engineering:
        """
        A class intended to move through and improve the features of a dataset.
        """
        
        def auto_casefold(self, df : pd.DataFrame) -> pd.DataFrame:
            """
            Take a dataframe, find the string columns and convert them all to lower case through casefold

            Parameters
            ----------
            dataframe : pd.DataFrame
                The dataframe to casefold over to convert to lower case

            Returns
            -------
            pd.DataFrame
                The same dataframe given with the new lorrrwer case values if applied

            """
            for col in df.columns:
                if self.is_string_type(df[col]):
                    df[col] = df[col].astype(str).str.casefold()
            return df

        def break_up_by_string(self, df_temp : pd.DataFrame, splitting_string : str, cols : typing.List = None) -> pd.DataFrame:
            """
            Break up columns by string to create new columns from each split.

            The splitting string is always matched literally, so separators such as
            ``"."``, ``"|"``, ``"("`` or ``". "`` behave as plain text rather than as
            regular expressions.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to start splitting up object columns
            splitting_string : str
                String to split up columns by
            cols : typing.List
                Optional parameter that if provided will be the only columns that will be considered for splitting

            Returns
            -------
            pd.DataFrame
                Dataframe with extra columns containing the split up values. The new
                columns are named ``<col><splitting_string><n>`` (or ``<col><n>`` when
                splitting on a single space) and missing pieces are filled with 0.
            """
            import re  # local import: only needed to escape the separator

            if cols is None:
                candidate_cols = list(df_temp.select_dtypes(include=[object]).columns)
            else:
                candidate_cols = cols
            # An escaped pattern keeps the split literal on every pandas version
            # (multi-character patterns are otherwise interpreted as regexes).
            literal_pattern = re.escape(splitting_string)
            for col in candidate_cols:
                # Skip columns that never contain the separator
                if df_temp[col].str.contains(splitting_string, regex=False).sum() > 0:
                    pieces = df_temp[col].str.split(literal_pattern, expand=True)
                    # Rename the positional columns produced by the split
                    if splitting_string != " ":
                        prefix = col + splitting_string
                    else:
                        prefix = col
                    pieces = pieces.rename(columns={part: prefix + str(part) for part in pieces.columns})
                    pieces = pieces.fillna(0)
                    df_temp = pd.concat([df_temp, pieces], axis=1)
            return df_temp
        
        
        def compare_object_columns(self, df_temp : pd.DataFrame, df_temp_2 : pd.DataFrame, silent = False, replace = False) -> None:
            """
            Compare the unique values of the object columns shared by two dataframes.

            This helps determine the differences between a training dataframe and a test dataframe.

            Parameters
            ----------
            df_temp : pd.DataFrame
                First dataframe to compare columns with
            df_temp_2 : pd.DataFrame
                Second dataframe to compare columns with
            silent : bool
                When False, print the column name and both sorted value lists for every column that differs
            replace : bool
                When True, values of df_temp that do not occur in the same column of df_temp_2
                are replaced with NaN. df_temp is modified in place.
            """
            for col in df_temp.select_dtypes(include="object").columns:
                if col not in df_temp_2.columns:
                    continue
                first_values = df_temp[col].unique().tolist()
                second_values = df_temp_2[col].unique().tolist()
                if set(first_values) == set(second_values):
                    continue
                # Print lists if requested
                if not silent:
                    # Missing values are shown as "nan"; sorting by text keeps mixed types sortable
                    first_labels = sorted(("nan" if pd.isna(v) else v for v in first_values), key=str)
                    second_labels = sorted(("nan" if pd.isna(v) else v for v in second_values), key=str)
                    print("***",col)
                    print(first_labels)
                    print(second_labels)
                # Replace with NaN if requested by parameter
                if replace:
                    unseen = [v for v in first_values if not pd.isna(v) and v not in second_values]
                    if unseen:
                        df_temp.loc[df_temp[col].isin(unseen), col] = np.nan
                                
        def create_anomaly_scores_preds(self, df_temp : pd.DataFrame, estimator : BaseEstimator, df_test_temp : pd.DataFrame = None, target_col : str = None) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
            """
            Create anomaly predictions and scores and add them as new columns.

            The estimator is fitted on the training rows followed by the test rows (when a
            test dataframe is given). Two columns are added, named after the estimator class:
            ``anomaly_<Class>`` holds ``estimator.predict`` with -1 mapped to 0, and
            ``anomaly_score_<Class>`` holds ``estimator.score_samples``.

            Parameters
            ----------
            df_temp : pd.DataFrame
                DataFrame that will be examined for outliers and added to
            estimator : BaseEstimator
                Estimator providing ``fit``, ``predict`` and ``score_samples``
            df_test_temp : pd.DataFrame
                Optional test dataframe that is scored together with df_temp
            target_col : str
                Accepted for interface compatibility; not used by this method

            Returns
            -------
            typing.Tuple[pd.DataFrame, pd.DataFrame]
                New copies of the main dataframe and of the test dataframe (None when no test
                dataframe was given). Each keeps the index of the dataframe it came from.
            """
            train_rows = df_temp.copy()
            train_count = len(train_rows)
            if df_test_temp is None:
                combined = train_rows
            else:
                # Stack on a fresh positional index so duplicate labels across the two frames are harmless
                combined = pd.concat([train_rows, df_test_temp.copy()], ignore_index=True)
            features = combined.copy()
            estimator = estimator.fit(features)
            label_col = "anomaly_" + estimator.__class__.__name__
            score_col = "anomaly_score_" + estimator.__class__.__name__
            combined[label_col] = estimator.predict(features)
            combined[label_col] = combined[label_col].replace({-1:0})
            combined[score_col] = estimator.score_samples(features)
            if df_test_temp is None:
                return combined, df_test_temp
            # Split back by position and restore each frame's own index so that callers
            # can still align other data (e.g. a target series) by label.
            train_part = combined.iloc[:train_count].copy()
            train_part.index = df_temp.index
            test_part = combined.iloc[train_count:].copy()
            test_part.index = df_test_temp.index
            return train_part, test_part
        
        def create_interactions(self, df_temp : pd.DataFrame, column_list : typing.List) -> pd.DataFrame:
            """
            Create interactions by totalling and multiplying columns within a dataframe

            For every pair of columns in column_list a product column ``<a>_X_<b>`` is added.
            In addition the running total over column_list is stored after the second and
            each later column as ``A<i>_iter_score``. The input dataframe is left untouched.
            An existing column with the same name as a new one is overwritten in place.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to create interactions in
            column_list : typing.List
                List of columns (string names) to create interactions from

            Returns
            ----------
            pd.DataFrame
                New dataframe with interactions added
            """
            result = df_temp.copy()
            derived = {}
            # Cross wise multiplication interactions
            for left, right in itertools.combinations(column_list, 2):
                derived[left + "_X_" + right] = result[left] * result[right]
            # Iterative Totals
            running_total = 0
            for position, col in enumerate(column_list):
                running_total = running_total + result[col]
                if position > 0:
                    derived["A" + str(position) + "_iter_score"] = running_total
            # Overwrite columns that already exist; collect the rest so they can be
            # appended in a single concat instead of copying the frame per column.
            fresh = []
            for name, values in derived.items():
                if name in result.columns:
                    result[name] = values
                else:
                    fresh.append(values.rename(name))
            if fresh:
                result = pd.concat([result] + fresh, axis=1)
            return result
            
        def detect_continous_columns(self, df_temp : pd.DataFrame, ratio : float = 0.05, continous_columns : typing.List = []) -> typing.List:
            """
            Detect the continous columns in a dataframe. Columns that have more than the given ratio by total length of dataframe will be considered continous.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to detect continous columns in. This is assumed to already be encoded to a numerical format
            ratio : float / int
                Ratio of the total length of dataframe that will be used to cull continous from discrete data, if given as an int then this is consider to be a discrete number instead of a ratio
            continous_columns : typing.List
                Continous columns that can be given to the function without checking

            Returns
            ----------
            typing.List
                List of columns found
            """
            continous_cutoff : int = round(ratio * len(df_temp))
            if ratio > 1:
                continous_cutoff = ratio
            for col in df_temp.columns:
                if not self.is_string_type(df_temp[col]):
                    if col not in continous_columns:
                        if df_temp[col].nunique() > continous_cutoff:
                            continous_columns.append(col)
            return continous_columns

        def detect_duplicates(self, df_temp : pd.DataFrame, silent : bool = False, id_cols : typing.List = []) -> None: 
            """
            Detect duplicates in data and return the columns in which duplicates where detected.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to detect duplicates in
            silent : bool
                Whether to run print statements 
            id_cols : typing.List
                Given id cols that aren't auto detected - Useful if there is an obvious ID column that also wants to be detected for duplication
            """
            # Filter out identity columns
            cols_to_use = []
            for col in df_temp.columns:
                if len(df_temp[col].unique()) != len(df_temp[col]):
                    cols_to_use.append(col)
            id_cols = self.detect_id_columns(df_temp)
            id_temp = df_temp.copy()[id_cols]
            df_temp = df_temp.copy()[cols_to_use]    
            count_dupes = df_temp.duplicated().sum()
            count_dupes_in_ID = id_temp.duplicated().sum()
            if not silent:
                print('Duplicates in data: ', str(count_dupes))
                print('Duplicates in id columns: ', str(count_dupes_in_ID))
                print('When filtering out id columns: ', str(id_cols))

        def detect_nans(self, df_temp : pd.DataFrame, name = '', silent : bool = False, plot : bool = True) -> typing.List:
            """
            Detect NaNs in a provided dataframe and return the columns that NaNs were detected in

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to detect NaN values in
            name : str
                Name of the dataframe which helps give a more descriptive read out
            silent : bool
                When True nothing is printed
            plot : bool
                Whether to show a bar plot of the NaN counts per column (only drawn when NaNs exist)

            Returns
            -------
            typing.List
                List of columns in the provided dataframe that contain NaN values
            """
            # Sets the default figure size globally, as this method always has
            plt.rcParams["figure.figsize"] = (9,9)

            # Count NaNs by column once and reuse the result for the list, the print out and the plot
            per_column = df_temp.isnull().sum()
            count_nulls = per_column.sum()
            nan_counts = per_column[per_column > 0]
            columns_with_NaNs = nan_counts.index.tolist()
            # Print out the NaN values
            if not silent:
                if name != '':
                    print('******')
                    cprint('Detecting NaNs in ' + str(name), attrs=['bold'])
                    print('******')
                print('NaNs in data:', count_nulls)
                if count_nulls > 0:
                    print('******')
                    for col, n_missing in nan_counts.items():
                        print('NaNs in', col + ": ", n_missing)
                    print('******')
                # The trailing blank line belongs to the read out, so it is skipped when silent
                print('')
            # Plot the NaN values in columns in bar plot
            if plot and count_nulls > 0:
                sns.barplot(y=nan_counts.index, x=nan_counts.values).set_title(str(name) + " NaNs")
                plt.show()
            return columns_with_NaNs
        
        def detect_id_columns(self, df_temp : pd.DataFrame) -> typing.List:
            """
            Find the ID columns of a dataframe: columns whose number of distinct values equals the number of rows.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to search for ID columns

            Returns
            -------
            typing.List
                Names of the columns detected as identity columns, in dataframe order
            """
            return [
                name
                for name in df_temp.columns
                if len(df_temp[name]) == df_temp[name].nunique()
            ]
        
        def detect_uncorrelated_columns(self, df_temp : pd.DataFrame) -> typing.List:
            """
            Find columns that are barely correlated with any other column.

            The absolute pairwise correlations are totalled per column and 1 is subtracted
            for the column's correlation with itself. A column is reported when that total
            is below 0.04 times the number of columns.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to search for uncorrelated columns

            Returns
            -------
            typing.List
                Names of the columns detected as uncorrelated, in dataframe order
            """
            # Create correlation dataframe, one cell at a time
            corr_df = pd.DataFrame(columns=list(df_temp.columns))
            for row_name, col_name in itertools.product(df_temp.columns, repeat=2):
                corr_df.loc[row_name, col_name] = df_temp[row_name].corr(df_temp[col_name])
            corr_df = corr_df.abs()
            corr_df["sum"] = corr_df.sum(axis=0) - 1
            threshold = 0.04 * len(df_temp.columns)
            # A total of exactly -1 is skipped - presumably a column whose correlations are all NaN
            return [
                name
                for position, name in enumerate(df_temp.columns)
                if corr_df.iloc[position, -1] < threshold and corr_df.iloc[position, -1] != -1
            ]

        def drop_unshared_columns(self, df_temp : pd.DataFrame, df_temp_2 : pd.DataFrame, exclude_columns : typing.List) -> None:
            """
            Drop, in place, the columns that are not shared between the two dataframes, except for the excluded columns.

            Columns only present in df_temp_2 are dropped first, then columns only present in df_temp.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to check for shared columns
            df_temp_2 : pd.DataFrame
                Second dataframe to check for shared columns
            exclude_columns : typing.List
                Columns not to remove in this process. A single column name given as a
                string is treated as a one element list and None as an empty list.
            """
            # Normalise the exclusions so a bare string is matched as a whole name, not by substring
            if exclude_columns is None:
                protected = set()
            elif isinstance(exclude_columns, str):
                protected = {exclude_columns}
            else:
                protected = set(exclude_columns)
            only_in_second = [
                col for col in df_temp_2.columns
                if col not in df_temp.columns and col not in protected
            ]
            df_temp_2.drop(columns=only_in_second, inplace=True)
            only_in_first = [
                col for col in df_temp.columns
                if col not in df_temp_2.columns and col not in protected
            ]
            df_temp.drop(columns=only_in_first, inplace=True)
                        
        def encode_binary_object(self, series : pd.Series) -> pd.Series:
            """
            Replace every value of a series with the position of that value among the sorted unique values.

            Parameters
            ----------
            series : pd.Series
                The series to be encoded. Its unique values must be sortable.

            Returns
            -------
            pd.Series
                The encoded series
            """
            ordered_values = sorted(series.unique().tolist())
            codes = {value: position for position, value in enumerate(ordered_values)}
            return series.map(codes)
        
        def encode_columns(self, df : pd.DataFrame, columns : pd.Series, test_df : pd.DataFrame = None, cutoff : int = 20) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
            """
            Encode columns based on the number of unique values in each column

            Columns with fewer than ``cutoff`` classes are one-hot encoded with get_dummies;
            the remaining columns are label encoded, or dropped when they are almost entirely
            unique or share almost no values with the test dataframe. The class count includes
            an extra 'None' class that is used for values not shared between the dataframes.

            Parameters
            ----------
            df : pd.DataFrame
                Dataframe to encode columns in. Label encoded columns may be written into this
                dataframe in place, so pass a copy if the original must stay unchanged.
            columns : pd.Series
                Columns to encode
            test_df : pd.DataFrame
                Test dataframe to encode based on classes in the Dataframe
            cutoff : int
                The cut off number of classes to choose between label encoding and get dummies. This keeps the dimensionality under control

            Returns
            -------
            (pd.DataFrame, pd.DataFrame)
                Encoded dataframe and the encoded test dataframe (None when no test dataframe was given)
            """    
            for col in columns:
                # Fit the encoder on the sorted string form of the unique values plus an extra 'None' class
                le = preprocessing.LabelEncoder()
                classes_to_encode = df[col].astype(str).unique().tolist()
                classes_to_encode.sort()
                classes_to_encode.append('None')
                le.fit(classes_to_encode)
                # Get dummies except for binary variables which are handled by len(le.classes) != 3
                if len(le.classes_) < cutoff and len(le.classes_) != 3:
                    # The dataframe and the test dataframe are one-hot encoded independently of each other
                    df = pd.get_dummies(df, columns = [col])
                    if test_df is not None:
                        test_df = pd.get_dummies(test_df, columns = [col])
                else:
                    # First test for binary variables that should be encoded and change things if that is the case
                    binary_detected = False
                    if df[col].nunique() == 2 and df[col].isnull().sum().sum() == 0:
                        # Detect test_df exists and is binary and the unique values of test match the unique values in regular df
                        if test_df is not None and test_df[col].nunique() == 2 and test_df[col].isnull().sum().sum() == 0 and set(test_df[col].unique()) == set(df[col].unique()): 
                            binary_detected = True
                        elif test_df is None:
                            binary_detected = True
                    if binary_detected:
                        # A clean binary column does not need the extra 'None' class, so refit without it
                        classes_to_encode.remove('None')
                        le.fit(classes_to_encode)
                    # Test that the column isn't too unique to be useful
                    if df[col].nunique() > (len(df[col]) * 0.95):
                        print("Dropping column:", col, "due to high uniqueness that would lead to overfitting")
                        df = df.drop(columns=col)
                        if test_df is not None:
                            test_df = test_df.drop(columns=col)
                        continue
                    # If no test dataframe encode as normal else we should clear out classes not found in test
                    if test_df is None:
                        df[col] = le.transform(df[col].astype(str))
                    else:
                        # Keep the untouched values so the test labels can be checked against them below
                        check_col = df.copy()[col]
                        # Clean out labels in train that aren't in test
                        input_dict = {}
                        for unique in df[col].unique():
                            if unique not in pd.unique(test_df[col]) and not binary_detected:
                                input_dict[unique] = 'None'
                        df[col] = df[col].replace(input_dict)     
                        # Check whether there is little crossover between test and df 
                        if len(df[col]) * 0.9 < df[col].tolist().count('None'):
                            print("Dropping column:", col, "due to little to no crossover with test dataframe")
                            df = df.drop(columns=col)
                            test_df = test_df.drop(columns=col)
                            continue
                        df[col] = le.transform(df[col].astype(str))
                        #Clean out unseen labels in test
                        input_dict = {}
                        for unique in test_df[col].unique():
                            if unique not in pd.unique(check_col) and not binary_detected:
                                input_dict[unique] = 'None'
                        test_df[col] = test_df[col].replace(input_dict)
                        test_df[col] = le.transform(test_df[col].astype(str))
            return df, test_df

        def fill_nans_create_columns(self, df_temp : pd.DataFrame, columns : typing.List, value : float = 0) -> pd.DataFrame:
            """
            Fill the NaNs of the given columns and add an indicator column for each of them.

            For every column ``c`` a new column ``c_was_null`` is added that holds 1 where the
            value was missing and 0 elsewhere. The dataframe is modified in place.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to modify
            columns : typing.List
                Columns of the provided dataframe to modify
            value : float
                Value to replace the NaN values with

            Returns
            -------
            pd.DataFrame
                The same dataframe object with NaNs filled and the indicator columns added
            """
            for name in columns:
                # Record where values were missing before they are filled in
                was_missing = df_temp[name].isnull()
                df_temp[name + "_was_null"] = was_missing.astype(int)
                filled = df_temp[name].fillna(value)
                df_temp[name] = filled
            return df_temp
        
        def is_string_type(self, series: pd.Series) -> bool:
            """
            Tell whether a series holds string data.

            A series counts as a string type when it has the pandas string extension dtype,
            or when it has object dtype and exposes the ``.str`` accessor.

            Parameters
            ----------
            series : pd.Series
                The series to inspect

            Returns
            -------
            bool
                Whether a string type was detected or not

            @Inspired by work by https://stackoverflow.com/users/3876599/yourstruly
            """
            dtype = series.dtype
            # String extension type
            if pd.StringDtype.is_dtype(dtype):
                return True
            # Only an object column can still be a string column; hasattr is False
            # when accessing .str raises AttributeError
            return dtype == "object" and hasattr(series, "str")

        def quantile_transform_column_wise(self, df_temp : pd.DataFrame, target_col : str = "", output_distribution="uniform") -> pd.DataFrame:
            """
            Quantile transform every column of a dataframe, one column at a time.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to quantile transform. A copy is transformed, the input is left unchanged
            target_col : str
                This is the target col and is not transformed
            output_distribution : str
                Passed through to the QuantileTransformer as its output distribution

            Returns
            -------
            pd.DataFrame
                Transformed copy of the dataframe
            """
            result = df_temp.copy()
            # Use at most 1000 quantiles, fewer when the dataframe has fewer rows
            n_quantiles = min(len(result), 1000)
            for name in result.columns:
                if name == target_col:
                    continue
                transformer = preprocessing.QuantileTransformer(random_state=1, n_quantiles=n_quantiles, output_distribution=output_distribution)
                column_values = result[name].values.reshape(-1, 1)
                transformed = transformer.fit_transform(column_values)
                result[name] = pd.Series(transformed[:,0], index=result[name].index, name=result[name].name)
            return result
        
        def min_max_column_wise(self, df_temp : pd.DataFrame, target_col : str = ""):
            """
            Scale every column of a dataframe with its own MinMaxScaler.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to scale. A copy is scaled, the input is left unchanged
            target_col : str
                This is the target col and is not scaled

            Returns
            -------
            pd.DataFrame
                Scaled copy of the dataframe
            """
            scaled = df_temp.copy()
            for name in scaled.columns:
                if name == target_col:
                    continue
                column_values = scaled[name].values.reshape(-1, 1)
                scaled[name] = preprocessing.MinMaxScaler().fit_transform(column_values)
            return scaled
        
        def pipeline(self, df_temp : pd.DataFrame, test_df_temp : pd.DataFrame = None, target_col : str = None, create_interactions : bool = True, id_cols : typing.List = None, break_up_cols : typing.List[typing.Tuple[str, str]] = None) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
            """
            A pipeline through which the data is processed and feature engineered

            Steps, in order: drop the given id columns, set the target aside, optionally split
            columns by string, report and fill NaNs, report duplicates, encode the object
            columns, drop columns not shared with the test dataframe, add interactions of the
            continous columns, drop uncorrelated columns, add IsolationForest anomaly columns
            and finally re-attach the target.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to process features of. It is copied, the input is left unchanged
            test_df_temp : pd.DataFrame
                The optional test dataframe to process features of. It is copied as well
            target_col : str
                The optional target column that won't be processed due to this being problematic for the end result.
                None and "" both mean that there is no target column
            create_interactions : bool
                Whether to multiply and add columns together. Defaults to true
            id_cols : typing.List
                Id columns to drop from both dataframes before any processing
            break_up_cols : typing.List(typing.Tuple[str, str])
                A list of tuples the first element of which is the column name and the second element is the string with which to split up the elements.

            Returns
            -------
            (pd.DataFrame, pd.DataFrame)
                The engineered dataframe (with the target column re-attached when one was given)
                and the engineered test dataframe (None when no test dataframe was given)
            """
            df_temp = df_temp.copy()
            has_test = test_df_temp is not None
            if has_test:
                test_df_temp = test_df_temp.copy()
            # Remove ID Column if provided
            if id_cols is not None:
                print("Provided id columns dropped:", id_cols)
                df_temp = df_temp.drop(columns=id_cols)
                if has_test:
                    test_df_temp = test_df_temp.drop(columns=id_cols)
            # Set the target aside; the default None means "no target" just like ""
            has_target = target_col is not None and target_col != ""
            target = None
            if has_target:
                target = df_temp[target_col]
                df_temp = df_temp.drop(columns=target_col)
            # Columns that must survive drop_unshared_columns, passed as a list of whole names
            protected_cols = [target_col] if has_target else []
            # Warn when the test columns don't match the training columns minus the target
            if has_test and set(df_temp.columns) != set(test_df_temp.columns):
                cprint("Unfortunately the columns of the testing dataframe and training dataframe do not match, this may yield errors or bad interactions. This is not a recommended format.", 'red', attrs=['bold'])
            # Break up columns if requested
            if break_up_cols is not None:
                for break_up in break_up_cols:
                    df_temp = self.break_up_by_string(df_temp, break_up[1], cols=[break_up[0]])
                    if has_test:
                        test_df_temp = self.break_up_by_string(test_df_temp, break_up[1], cols=[break_up[0]])
            self.detect_nans(df_temp, "Training Data")
            # fill_nans_create_columns works in place, so the return value is not needed
            self.fill_nans_create_columns(df_temp, df_temp.columns, -1)
            self.detect_duplicates(df_temp)
            continous_columns = self.detect_continous_columns(df_temp, 20, continous_columns=[])
            # Encode a throwaway copy so all columns are numerical for the correlation test.
            # encode_columns can write label encoded values into the frame it is given, and
            # the real encoding below still needs the raw values to match them with the test data.
            df_correlation_test, _ = self.encode_columns(df_temp.copy(), df_temp.select_dtypes(include="object").columns)
            # Completely uncorrelated columns are not used for the interactions
            uncorrelated_cols = self.detect_uncorrelated_columns(df_correlation_test)
            continous_columns = [c for c in continous_columns if c not in uncorrelated_cols]
            # Process test dataframe if it exists. Its interactions are created further down,
            # once the columns it shares with the training data are known.
            if has_test:
                self.detect_nans(test_df_temp, "Testing Data" )
                self.fill_nans_create_columns(test_df_temp, test_df_temp.columns, -1)
                self.detect_duplicates(test_df_temp)
            # Encode columns so all are of a numerical type
            df_temp, test_df_temp = self.encode_columns(df_temp, df_temp.select_dtypes(include="object").columns, test_df_temp)
            # Drop unshared columns
            if has_test:
                self.drop_unshared_columns(df_temp, test_df_temp, protected_cols)
            # Detect completely uncorrelated columns, they are removed after the interactions are made
            uncorrelated_cols = self.detect_uncorrelated_columns(df_temp)
            print("Removed Uncorrelated: ", uncorrelated_cols)
            # Create Interactions by adding and multiplying columns together
            if create_interactions:
                df_temp = self.create_interactions(df_temp, continous_columns).copy()
                if has_test:
                    test_df_temp = self.create_interactions(test_df_temp, continous_columns).copy()
            if uncorrelated_cols is not None and len(uncorrelated_cols) > 0:
                df_temp = df_temp.drop(columns=uncorrelated_cols)
                # Drop unshared columns due to the drop of uncorrelated above
                if has_test:
                    self.drop_unshared_columns(df_temp, test_df_temp, protected_cols)
            # Create Anomaly Scores
            df_temp, test_df_temp = self.create_anomaly_scores_preds(df_temp, IsolationForest(random_state=0), test_df_temp, target_col)
            if has_target:
                # No step above adds, removes or reorders rows, so the target is re-attached
                # by position; this stays correct even if the index was reset along the way.
                df_temp = df_temp.copy()
                df_temp[target_col] = target.to_numpy()
            return df_temp, test_df_temp
        
    class eda:
        """
        A set of tools for Exploratory Data Analysis
        """
        
        def box_plots(self, df_temp : pd.DataFrame, columns : typing.List) -> None:
            """
            Draw one box plot per column on a 5 by 5 grid and show the figure.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to make box plots of
            columns : typing.List
                A list of continous columns to use. Only the first 25 are plotted
            """
            if len(columns) > 25:
                columns = columns[:25]
            plt.figure(figsize = (15, 9))
            # Colours repeat once all ten have been used
            palette = itertools.cycle(['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'])
            subset = df_temp[columns]
            for slot, (name, shade) in enumerate(zip(subset, palette), start=1):
                plt.subplot(5, 5, slot)
                sns.boxplot(y = name, data = subset, color=shade)
                plt.tight_layout()
            plt.show()
        
        def box_plot_correlated_columns(self, df_temp : pd.DataFrame, target_col : str, fe : feature_engineering) -> None:
            """
            Show box plots, grouped by the target, of the columns most correlated with the target

            The columns are ranked with calculate_correlations on the output of
            fe.quantile_transform_column_wise, while the plots themselves use the
            untransformed values in df_temp. The plots are placed on a 3 x 2 grid.

            Parameters
            ----------
            df_temp : pd.DataFrame
                The feature dataset to find the correlations from and make box plots from
            target_col : str
                The column that is the aim for prediction
            fe : feature_engineering
                The feature engineering library
            """
            transformed = fe.quantile_transform_column_wise(df_temp, "")
            # Ask for seven columns: the target is expected to be one of them and is removed below
            top_columns = self.calculate_correlations(transformed, target_col, 7, silent=True, visualise=False)
            top_columns.remove(target_col)
            plt.figure(figsize = (16, 16))
            for slot, feature in enumerate(top_columns, start=1):
                plt.subplot(3, 2, slot)
                sns.boxplot(x=target_col, y=feature, data=df_temp, palette="Set3")
            plt.show()

        def class_balance(self, df_temp : pd.DataFrame, target_col : str) -> None:
            """
            Plot the count of each target value and print the percentage of rows each one covers

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to find class balance in
            target_col : str
                Name of the column holding the target categorical value
            """
            sns.countplot(x=df_temp[target_col])
            plt.show()
            total_rows = len(df_temp)
            # Report the values in sorted order
            labels = np.sort(pd.unique(df_temp[target_col].values.ravel()))
            for label in labels:
                matching_rows = df_temp.loc[df_temp[target_col] == label]
                print(label,":",(len(matching_rows) / total_rows) * 100, "%")

        def pca_dimension_reduction_info(self, df_temp : pd.DataFrame, target_col : str) -> None:
            """
            Report how much variance PCA retains and plot the two and three component projections

            The features are passed to PCA as they are; no scaling is applied here.

            Parameters
            ----------
            df_temp : pd.DataFrame
                DataFrame to conduct PCA on
            target_col : str
                target column to remove before conducting PCA, used to colour the plots
            """
            frame = df_temp.copy()
            target = frame[target_col]
            features = frame.drop(columns=target_col, axis=1).values
            print(f"{len(features[0])} initial feature components")

            # A fractional n_components keeps as many components as are needed to reach that variance
            variance_pca = PCA(n_components=0.95)
            reduced = variance_pca.fit(features).transform(features)
            print(f"95% variance explained by {len(reduced[0])} components by principle component analysis")

            # 2D plot
            two_pca = PCA(n_components=2)
            two_components = two_pca.fit(features).transform(features)
            print(f"{round(two_pca.explained_variance_ratio_.sum() * 100)}% variance explained by 2 components by principle component analysis")
            px.scatter(two_components, x=0, y=1, color=target, width=600, height=600, title="Two Component PCA").show()

            # 3D plot
            three_pca = PCA(n_components=3)
            three_components = three_pca.fit(features).transform(features)
            print(f"{round(three_pca.explained_variance_ratio_.sum() * 100)}% variance explained by 3 components by principle component analysis")
            px.scatter_3d(three_components, x=0, y=1, z=2, color=target, width=600, height=600, title="Three Component PCA").show()
            
        def pca_four_component(self, df_temp : pd.DataFrame, target_col : str, fe : feature_engineering) -> pd.DataFrame:
            """
            Do a four component PCA and show the resulting pair wise plot

            Parameters
            ----------
            df_temp : pd.DataFrame
                features to run PCA on
            target_col : str
                target column to PCA against
            fe : feature_engineering
                Feature engineering library. Kept for interface compatibility; this
                method does not use it and applies no scaling.

            Returns
            -------
            pd.DataFrame
                A dataframe with target column and PCA features only, indexed 0..n-1
                in the row order of df_temp.
            """
            pca = PCA(n_components=4)
            y = df_temp[target_col]
            X = df_temp.drop(columns=target_col)
            fourX_p = pca.fit(X).transform(X)
            print(str(round(pca.explained_variance_ratio_.sum() * 100)) + "% variance explained by 4 components by principle component analysis")

            # Axis titles showing the share of variance carried by each component
            labels = {
                str(i): f"PC {i+1} ({var:.1f}%)"
                for i, var in enumerate(pca.explained_variance_ratio_ * 100)
            }

            fig = px.scatter_matrix(
                fourX_p,
                title="4 Component PCA",
                labels=labels,
                dimensions=range(4),
                color=y,
                width=800,
                height=800
            )
            fig.update_traces(diagonal_visible=False)
            fig.show()

            return_df = pd.DataFrame(fourX_p, columns=["PCA1","PCA2","PCA3","PCA4"])
            # Attach the target by position. Assigning the original Series would align on
            # index labels and misplace or blank the target whenever df_temp does not
            # carry a default 0..n-1 index (for example after sampling or filtering).
            return_df[target_col] = y.reset_index(drop=True)
            return return_df
        
        def pca_visualisation_2d(self, df_temp : pd.DataFrame, target_col : str, plot_title = "Principle Component Plot") -> None:
            """
            Show a two dimensional PCA projection of the features, coloured by the target

            Frames with more than 3000 rows are randomly sampled down to 2999 rows first.

            Parameters
            ----------
            df_temp : pd.DataFrame
                The dataframe to use features from for embedding
            target_col : str
                The target variable to be dropped from dataframe
            plot_title : str
                Title shown on the plot
            """
            row_limit = 3000
            if len(df_temp) > row_limit:
                # Fixed seed so the same rows are drawn on every call
                df_temp = df_temp.sample(n=2999, random_state=1)
            target = df_temp[target_col]
            features = df_temp.drop(columns=[target_col])
            pca_plot = yellowPCA(scale=True, projection=2, alpha=0.4, title=plot_title)
            pca_plot.fit_transform(features, target)
            pca_plot.show()
            plt.show()
            
        def pca_visualisation_3d(self, df_temp : pd.DataFrame, target_col : str) -> None:
            """
            Show a three dimensional PCA projection of the features, coloured by the target

            Frames with more than 3000 rows are randomly sampled down to 2999 rows first.

            Parameters
            ----------
            df_temp : pd.DataFrame
                The dataframe to use features from for embedding
            target_col : str
                The target variable to be dropped from dataframe
            """
            row_limit = 3000
            if len(df_temp) > row_limit:
                # Fixed seed so the same rows are drawn on every call
                df_temp = df_temp.sample(n=2999, random_state=1)
            target = df_temp[target_col]
            features = df_temp.drop(columns=[target_col])
            pca_plot = yellowPCA(scale=True, projection=3, alpha=0.4, size=(700,700))
            pca_plot.fit_transform(features, target)
            pca_plot.show()
            plt.show()
            
        def line_plots(self, df_temp : pd.DataFrame, columns : typing.List) -> None:
            """
            Draw line plots of a shuffled sample of the given continous columns

            The rows are shuffled with a fixed seed, then the first 50 rows of at most
            the first 25 columns are plotted, one subplot per column on a 5 x 5 grid.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Dataframe to make line plots of
            columns : typing.List
                A list of continous columns to use
            """
            shuffled = df_temp[columns].sample(frac=1, random_state=42).reset_index(drop=True)
            window = shuffled.iloc[:50, :25]
            window.plot(subplots=True, layout=(5,5), figsize=(15,10))
            plt.show()
        
        def calculate_correlations(self, df_temp : pd.DataFrame, target_col : str, n_cols : int = 10, silent : bool = False, visualise : bool = False) -> typing.List:
            """
            Rank the columns by the absolute correlation with the target and return the strongest ones

            Every column of the dataframe is scored with Series.corr against the
            target column, the target column itself included. Columns whose
            correlation is NaN are left out.

            Parameters
            ----------
            df_temp : pd.DataFrame
                dataframe to examine
            target_col : str
                the target column to measure correlation against
            n_cols : int
                number of columns to return, this amount of columns with the highest correlation
            silent : bool
                whether to print to console
            visualise : bool
                whether to display a heatmap of correlations

            Returns
            -------
            typing.List
                Up to n_cols column names, strongest absolute correlation first
            """
            if not silent:
                print("Correlations with",target_col + ":")
            target = df_temp[target_col]
            # Score every column against the target
            scored = []
            for col in df_temp.columns:
                strength = abs(df_temp[col].corr(target))
                if pd.notna(strength):
                    scored.append((strength, col))
            # Stable sort, so equally correlated columns keep their dataframe order
            scored.sort(key=lambda pair: pair[0], reverse=True)
            cols = []
            # max() keeps a negative n_cols selecting nothing instead of slicing from the end
            for strength, col in scored[:max(n_cols, 0)]:
                cols.append(col)
                if not silent:
                    print(col, ":", strength)
            # The pairwise matrix is only needed for the heatmap, so only build it then
            if visualise:
                sns.heatmap(abs(df_temp[cols].corr()), annot=True, cmap="Blues")
            return cols

        def decision_tree(self, df_temp : pd.DataFrame, depth : int, target_col : str, class_names : typing.List = None) -> "graphviz.Source":
            """
            Fit a decision tree on the given dataframe to the given depth and return its graph

            The accuracy of the tree on the data it was fitted on is printed.
            As a side effect the global matplotlib figure size is set to (18, 18).

            Parameters
            ----------
            df_temp : pd.DataFrame
                The dataframe to create a decision tree from
            depth : int
                The depth of the decision tree to create
            target_col : str
                The target column to make a decision tree towards
            class_names : typing.List
                Names to show for the classes. When None, the sorted unique
                target values are used, converted to strings.

            Returns
            -------
            graphviz.Source
                The rendered tree, with the output format set to png
            """
            features = df_temp.drop(columns=[target_col])
            target = df_temp[target_col]
            tree_clf = DecisionTreeClassifier(max_depth=depth, random_state=1)
            tree_clf.fit(features, target)
            print("accuracy: " + str(tree_clf.score(features, target)))
            plt.rcParams["figure.figsize"] = (18,18)
            # Identity check: "== None" is evaluated element wise on an array and cannot be used in an if
            if class_names is None:
                class_names = np.sort(pd.unique(target.values.ravel())).astype('str')
            dot_data = tree.export_graphviz(tree_clf, out_file=None,
                                            feature_names=features.columns,
                                            class_names=class_names,
                                            filled=True)
            return graphviz.Source(dot_data, format="png")
        
        def manifold_embedding(self, df_temp : pd.DataFrame, target_col : str, manifold_type="tsne", classes=['very low', 'low', 'med', 'high', 'very high']) -> None:
            """
            Show a manifold embedding of the standard scaled features, labelled by the target

            Only the first 4000 rows are embedded. The target is used as given; any
            binning or label encoding has to be done by the caller.

            Parameters
            ----------
            df_temp : pd.DataFrame
                The features and target to create a manifold embedding from
            target_col : str
                The target column to create the manifold to. Must be label encoded. Default requires 5 unique values
            manifold_type : str
                The type of manifold
            classes : typing.List
                Class labels handed to the visualizer
            """
            row_limit = 4000
            target = df_temp[target_col]
            features = df_temp.drop(columns=[target_col], axis=1)
            scaled_features = preprocessing.StandardScaler().fit_transform(features)
            embedding = Manifold(manifold=manifold_type, classes=classes)

            # Fit the data to the visualizer
            embedding.fit_transform(scaled_features[:row_limit], target[:row_limit])
            # Finalize and render the figure
            embedding.show()
        
        def pair_grid_plot(self, df_temp : pd.DataFrame, cols : typing.List) -> None:
            """
            Draw a seaborn PairGrid of the given columns using the first 500 rows

            Histograms go above the diagonal and density plots go on and below it.

            Parameters
            ----------
            df_temp : pd.DataFrame
                Data to plot from
            cols : typing.List
                Columns to make a pairgrid from
            """
            frame = df_temp.copy()
            # Deal with seaborn bug on boolean datatype
            boolean_columns = [name for name in frame.columns if frame[name].dtype == "bool"]
            for name in boolean_columns:
                frame[name] = frame[name].astype(int)
            # Create PairGrid
            grid = sns.PairGrid(frame[cols].iloc[:500,:], diag_sharey=False)
            grid.map_upper(sns.histplot, multiple="stack")
            grid.map_lower(sns.kdeplot)
            grid.map_diag(sns.kdeplot, lw=2)
        
        def pivot_table(self, df_temp : pd.DataFrame, target_col : str, number_to_display : int = 15, display : bool = True) -> pd.DataFrame:
            """
            Create a pivot table based on the target variable and if requested plot it. So that the mean of each variable can be compared to others.

            The table has one row per feature and one column per target value. Only
            the rows whose means differ the most between target values are kept;
            rows whose means do not differ at all are always dropped.

            Parameters
            ----------
            df_temp : pd.DataFrame
                DataFrame that contains the features to create a pivot table from
            target_col : str
                The target column that will be pivoted on - Must be categorical
            number_to_display : int
                The maximum number of rows to keep in the pivot table
            display : bool
                Display a heatmap of the table

            Returns
            -------
            pd.DataFrame
                Pivot Table that was created from the features provided
            """
            # pd.pivot_table aggregates with the mean by default
            table = pd.pivot_table(data=df_temp, index=[target_col]).T
            # Largest minus smallest mean across the target values, per feature
            spread = table.max(axis=1) - table.min(axis=1)
            # Rank by the size of the difference (not by the feature name) and drop rows with none
            ranked = spread[spread != 0].sort_values(ascending=False, kind="mergesort")
            keep = ranked.index[:max(number_to_display, 0)]
            # Boolean mask keeps the surviving rows in the table's own order
            table = table.loc[table.index.isin(keep)]
            if display:
                sns.heatmap(table, annot=True, cmap="Blues")
            return table
        
        def pipeline(self, df_temp, target_col : str, fe : feature_engineering, kind="categorical"):
            """
            A pipeline to apply a bunch of different EDA procedures against the given dataframe

            The sections run in this order: class balance, dimensional reduction,
            box plots, line plots, correlations, pair grid plot, decision tree,
            anomalies, normality test (regression only), pivot table and manifold
            embedding. Each section prints a heading and shows its plots.

            Parameters
            ----------
            df_temp : pd.DataFrame
                The features and target to explore. The anomalies section reads a
                column named "anomaly_IsolationForest".
            target_col : str
                The column that is the aim for prediction
            fe : feature_engineering
                The feature engineering library, used for detecting continous columns and for quantile transforms
            kind : str
                "regression" bins the target into five quantile classes where a class
                is needed. "categorical" uses the target as it is.
            """
            def section(title):
                # Heading framed by asterisk rules
                print("***")
                cprint(title, attrs=['bold'])
                print("***")

            bin_labels = ["very low", "low","med","high","very high"]

            section("Class Balance")
            if kind == "categorical":
                self.class_balance(df_temp, target_col)
                plt.show()
            else:
                sns.displot(x=df_temp[target_col])
                plt.title("Class Balance")
                plt.show()

            section("Dimensional Reduction")
            self.pca_dimension_reduction_info(df_temp, target_col)
            plt.show()
            self.pca_four_component(df_temp, target_col, fe)
            plt.show()

            section("Box Plots")
            continous_columns = fe.detect_continous_columns(df_temp, 20, continous_columns=[])
            self.box_plots(df_temp, continous_columns)
            plt.show()

            section("Line Plots")
            continous_columns = fe.detect_continous_columns(df_temp, 20, continous_columns=[])
            self.line_plots(df_temp, continous_columns)
            plt.show()

            section("Correlations")
            scaled_df = fe.quantile_transform_column_wise(df_temp, target_col)
            self.calculate_correlations(scaled_df, target_col, 10, visualise=True)
            plt.show()

            section("Pair Grid Plot")
            self.pair_grid_plot(df_temp, self.calculate_correlations(df_temp, target_col, 5))
            plt.show()

            section("Decision Tree")
            tree_graph = None
            if kind == "regression":
                # A classifier tree needs classes, so bin the continous target into quantiles
                df_temp_copy = df_temp.copy()
                df_temp_copy["TargetBin"] = pd.qcut(df_temp_copy[target_col], 5, bin_labels).cat.codes
                tree_graph = self.decision_tree(df_temp_copy.drop(columns=target_col), 2, "TargetBin", bin_labels)
            else:
                tree_graph = self.decision_tree(df_temp, 2, target_col)
            if tree_graph is not None:
                # display is not imported in the visible part of the file; presumably the
                # notebook built-in - TODO confirm
                display(tree_graph)
                plt.show()

            section("Anomalies")
            plt.rcParams["figure.figsize"] = (9,9)
            # Call through self: a bare "eda" is not a name in scope inside this method
            self.pca_visualisation_2d(fe.quantile_transform_column_wise(df_temp, "anomaly_IsolationForest", "normal"), "anomaly_IsolationForest", "Anomalies in Data")
            plt.show()

            if kind == "regression":
                section("Normality Test")
                # Use the dataframe that was passed in, not a global named df
                stats.probplot(df_temp[target_col], dist="norm", plot=plt)
                plt.show()

            section("Pivot Table")
            if kind == "regression":
                scaled_df["TargetBin"] = pd.qcut(scaled_df[target_col], 5, bin_labels)
                self.pivot_table(scaled_df.drop(columns=target_col), "TargetBin")
            else:
                self.pivot_table(scaled_df, target_col)
            plt.show()

            section("Manifold Embedding")
            if kind == "regression":
                # Replace the labelled bins from the pivot table section with their integer codes
                scaled_df["TargetBin"] = pd.qcut(scaled_df[target_col], 5, bin_labels).cat.codes
                self.manifold_embedding(scaled_df.drop(columns=target_col), "TargetBin")
            else:
                scaled_df_copy = scaled_df.copy()
                le = preprocessing.LabelEncoder()
                unique_values = scaled_df_copy[target_col].unique()
                unique_values.sort()
                # NOTE(review): the class names are passed in descending order - confirm
                # this matches the label order the visualizer expects
                unique_values = unique_values[::-1]
                le.fit(scaled_df_copy[target_col].unique())
                scaled_df_copy[target_col] = le.transform(scaled_df_copy[target_col])
                self.manifold_embedding(scaled_df_copy, target_col, classes=unique_values.tolist())
            plt.show()
    
    class feature_selection:
        """
        Determine which features to use for training. 
        """
        def kendall_tau_feature_elimination(df_temp : pd.DataFrame, columns : typing.List, target_col : str, test_p_value : float = 0.001) -> typing.List:
            """
            Eliminate features that don't pass a Kendall Tau test in regards to the target variable. 
            This would tend to eliminate useless or unhelpful features from the dataset while retaining appropriate ones.
            A list of features the do pass the test is created and returned. 

            Should be used with continous columns only.
            
            Parameters
            ----------
            df_temp : pd.DataFrame
                DataFrame to conduct Kendall Tau on features 
            columns : typing.List
                A list of continous columns
            target_col : str
                Target column to conduct Kendall Tau towards
            test_p_value : float
                P value to test against. The sensitivity for 
                
            Returns
            -------
            typing.List
                features that passed the Kendall Tau features
            """
            new_features = []
            for feature in columns:
                tau, p_value = stats.kendalltau(df_temp[target_col], df_temp[feature])
                if p_value <= test_p_value:
                    new_features.append(feature)
            return new_features    

        @staticmethod
        def select_from_model_features(df_temp: pd.DataFrame, target_col : str, estimator : BaseEstimator, threshold : float = None) -> typing.Tuple[typing.List, np.ndarray]:
            """
            Use a compatible sklearn estimator for determining a list of features that are important to the dataset

            The estimator is fitted in place on every column except the target.
            Declared as a static method so it can be called on the class or on an instance.

            Parameters
            ----------
            df_temp : pd.DataFrame
                dataframe to find the features in
            target_col : str
                the name of the target column so it can be selected
            estimator : BaseEstimator
                The sklearn estimator to fit on. Must have coef_
            threshold : float
                Threshold to decide which features to keep or not

            Returns
            -------
            typing.Tuple[typing.List, np.ndarray]
                List of features selected, and the coef_ array of the fitted estimator
            """
            X = df_temp.drop(columns=target_col)
            y = df_temp[target_col]
            estimator.fit(X, y)
            # prefit=True: the selector reuses the estimator fitted above instead of refitting
            selector = SelectFromModel(estimator, threshold = threshold, prefit=True)
            feature_names = np.array(X.columns)
            return feature_names[selector.get_support()].tolist(), selector.estimator.coef_
    
    class prediction:
        """
        Create and compare predictors then analyse them
        """
        def categorical_model_analysis(self, clf : BaseEstimator, X : np.ndarray, y : np.ndarray):
            """
            Fit a classifier on a train split and show graphs and reports that increase model explainability

            Shows, in order: confusion matrix, precision recall curve, ROC AUC and
            the classification report, all evaluated on the held out test split.

            Parameters
            ----------
            clf : BaseEstimator
                The sklearn compatible estimator to investigate. It is fitted here.
            X : np.ndarray
                The features to use the estimator on
            y : np.ndarray
                The target to use the estimator on
            """
            def section(title):
                # Heading framed by asterisk rules
                print("***")
                cprint(title, attrs=['bold'])
                print("***")

            X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
            clf.fit(X_train, y_train)
            predicted = clf.predict(X_test)

            section("Confusion Matrix")
            matrix = confusion_matrix(y_test, predicted, labels=clf.classes_)
            ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=clf.classes_).plot()
            plt.grid(False)
            plt.show()

            section("Precision Recall Curve")
            precision_recall_curve(clf, X_train, y_train, X_test, y_test)

            section("ROC AUC")
            roc_auc(clf, X_train, y_train, X_test=X_test, y_test=y_test)

            section("Classification Report")
            print(classification_report(y_test, predicted, labels=clf.classes_, zero_division=False))
            
        def classification_compare(self, X : np.ndarray, y : np.ndarray) -> typing.List:
            """
            Score a number of classifiers on the data with 5 fold cross validation

            Each classifier is refitted on every fold, so the returned object holds
            the fit from the last fold. The mean score is printed per classifier.

            Parameters
            ----------
            X : np.ndarray
                The features to use for classification
            y : np.ndarray
                The target to use the classifiers on

            Returns
            -------
            typing.List
                A list of (classifier, mean fold score) tuples
            """
            candidates = [
                ("AdaBoostClassifier", AdaBoostClassifier()),
                ("DecisionTreeClassifier", DecisionTreeClassifier()),
                ("GaussianNB", GaussianNB()),
                #("GaussianProcessClassifier", GaussianProcessClassifier()),
                ("GradientBoostingClassifier", GradientBoostingClassifier()),
                ("HistGradientBoostingClassifier", HistGradientBoostingClassifier(random_state=1)),
                ("KNeighborsClassifier", KNeighborsClassifier()),
                ("RandomForestClassifier", RandomForestClassifier(random_state=1)),
                ("SVC", SVC())
            ]
            classification_results = []
            folds = KFold(n_splits=5)
            for name, classifier in candidates:
                fold_scores = []
                for train_rows, test_rows in folds.split(X, y):
                    classifier.fit(X[train_rows], y[train_rows])
                    fold_scores.append(classifier.score(X[test_rows], y[test_rows]))
                mean_score = sum(fold_scores) / len(fold_scores)
                classification_results.append((classifier, mean_score))
                print(name, "Score:", mean_score)
            return classification_results
        
        def confusion_matrix_display(self, clf : BaseEstimator, X : np.ndarray, y : np.ndarray) -> None:
            """
            Show the confusion matrix of an already fitted sklearn classifier on a held out split

            The classifier is not fitted here; only the test part of the split is used.

            Parameters
            ----------
            clf : BaseEstimator
                The classifier to use for the confusion matrix
            X : np.ndarray
                The numpy array of features
            y : np.ndarray
                The numpy array of the target variable
            """
            # Only the test part of the split is needed
            _, X_test, _, y_test = train_test_split(X, y, random_state=0)
            predicted = clf.predict(X_test)
            matrix = confusion_matrix(y_test, predicted, labels=clf.classes_)
            ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=clf.classes_).plot()
            plt.grid(False)
            plt.show()
            
        @ignore_warnings(category=ConvergenceWarning)
        def ensemble_regression_compare(self, X : np.ndarray, y : np.ndarray) -> typing.List:
            """
            Score a number of ensemble regressors on the data with 5 fold cross validation

            Each model is refitted on every fold, so the returned object holds the
            fit from the last fold. The mean score and mean absolute error are
            printed per model.

            Parameters
            ----------
            X : np.ndarray
                The features to use for regression
            y : np.ndarray
                The target to use the regression on

            Returns
            -------
            typing.List
                A list of (regressor, mean fold score) tuples
            """
            candidates = [
                ("GradientBoostingRegressor", GradientBoostingRegressor()),
                ("RandomForestRegressor", RandomForestRegressor()),
                ("ExtraTreesRegressor", ExtraTreesRegressor()),
                ("AdaBoostRegressor", AdaBoostRegressor()),
                ("HistGradientBoostingRegressor", HistGradientBoostingRegressor()),
            ]
            ensemble_results = []
            folds = KFold(n_splits=5)
            for name, model in candidates:
                fold_scores = []
                fold_errors = []
                for train_rows, test_rows in folds.split(X, y):
                    model.fit(X[train_rows], y[train_rows])
                    fold_scores.append(model.score(X[test_rows], y[test_rows]))
                    fold_errors.append(mean_absolute_error(y[test_rows], model.predict(X[test_rows])))
                mean_score = sum(fold_scores) / len(fold_scores)
                ensemble_results.append((model, mean_score))
                print(name, "Score:", mean_score)
                print(name, "MAE Accuracy:", sum(fold_errors) / len(fold_errors))
                print("***")
            return ensemble_results

        def linear_classification_compare(self, X : np.ndarray, y : np.ndarray) -> typing.List:
            """
            Score a number of linear classifiers on the data with 5 fold cross validation

            Each classifier is refitted on every fold, so the returned object holds
            the fit from the last fold. The mean score is printed per classifier.

            Parameters
            ----------
            X : np.ndarray
                The features to use for classification
            y : np.ndarray
                The target to use the classifiers on

            Returns
            -------
            typing.List
                A list of (classifier, mean fold score) tuples
            """
            candidates = [
                ("RidgeClassifier", linear_model.RidgeClassifier()),
                ("SGDClassifier", linear_model.SGDClassifier()),
                ("LogisticRegression", linear_model.LogisticRegression(solver="liblinear")),
            ]
            classification_results = []
            folds = KFold(n_splits=5)
            for name, classifier in candidates:
                fold_scores = []
                for train_rows, test_rows in folds.split(X, y):
                    classifier.fit(X[train_rows], y[train_rows])
                    fold_scores.append(classifier.score(X[test_rows], y[test_rows]))
                mean_score = sum(fold_scores) / len(fold_scores)
                classification_results.append((classifier, mean_score))
                print(name, "Score:", mean_score)
            return classification_results
        
        @ignore_warnings(category=ConvergenceWarning)
        def linear_regression_compare(self, X : np.ndarray, y : np.ndarray) -> typing.List:
            """
            Score a number of linear regressors on the data with 5 fold cross validation

            Each regressor is refitted on every fold, so the returned object holds
            the fit from the last fold. The mean score and mean absolute error are
            printed per regressor.

            Parameters
            ----------
            X : np.ndarray
                The features to use for regression
            y : np.ndarray
                The target to use the regression on

            Returns
            -------
            typing.List
                A list of (regressor, mean fold score) tuples
            """
            regressors = [
                ("LinearRegression", linear_model.LinearRegression()),
                ("RidgeRegression", linear_model.Ridge()),
                ("Lasso", linear_model.Lasso(max_iter=4000)),
                ("ElasticNet", linear_model.ElasticNet()),
                # No normalize argument: scikit-learn 1.2 removed it, and passing it
                # raises a TypeError there before any regressor gets to run
                ("LassoLARS", linear_model.LassoLars()),
                ("BayesianRidge", linear_model.BayesianRidge()),
                ("ARDRegression", linear_model.ARDRegression()),
                ("Stochastic Gradient Descent", linear_model.SGDRegressor()),
                ("Huber Regressor", linear_model.HuberRegressor()),
            ]
            regressor_results = []
            kfold = KFold(n_splits=5)
            for name, regressor in regressors:
                scores = []
                mae_scores = []
                for train_index, test_index in kfold.split(X, y):
                    regressor.fit(X[train_index], y[train_index])
                    scores.append(regressor.score(X[test_index], y[test_index]))
                    mae_scores.append(mean_absolute_error(y[test_index], regressor.predict(X[test_index])))
                mean_score = sum(scores) / len(scores)
                regressor_results.append((regressor, mean_score))
                print(name, "Score:", mean_score)
                print(name, "MAE Accuracy:", sum(mae_scores) / len(mae_scores))
                print("***")
            return regressor_results
        
        def residuals_plot(self, model : BaseEstimator, X_train : np.ndarray, y_train : np.ndarray, X_test : np.ndarray, y_test : np.ndarray) -> None:
            """
            Draw a yellowbrick residuals plot for a regression model

            Note that this changes the global matplotlib default figure size to 9x9.

            Parameters
            ----------
            model : BaseEstimator
                The regression estimator handed to the ResidualsPlot visualiser
            X_train : np.ndarray
                Features passed to the visualiser's fit step
            y_train : np.ndarray
                Target passed to the visualiser's fit step
            X_test : np.ndarray
                Features passed to the visualiser's score step
            y_test : np.ndarray
                Target passed to the visualiser's score step
            """
            plt.rcParams.update({"figure.figsize": (9, 9)})
            residuals_viz = ResidualsPlot(model)
            # Order matters: fit on the training data, score on the test data, then render
            residuals_viz.fit(X_train, y_train)
            residuals_viz.score(X_test, y_test)
            residuals_viz.show()
            
        def prediction_error(self, model : BaseEstimator, X_train : np.ndarray, y_train : np.ndarray, X_test : np.ndarray, y_test : np.ndarray) -> None:
            """
            Draw a yellowbrick prediction error plot for a regression model

            Note that this changes the global matplotlib default figure size to 9x9.

            Parameters
            ----------
            model : BaseEstimator
                The regression estimator handed to the PredictionError visualiser
            X_train : np.ndarray
                Features passed to the visualiser's fit step
            y_train : np.ndarray
                Target passed to the visualiser's fit step
            X_test : np.ndarray
                Features passed to the visualiser's score step
            y_test : np.ndarray
                Target passed to the visualiser's score step
            """
            plt.rcParams.update({"figure.figsize": (9, 9)})
            error_viz = PredictionError(model)
            # Order matters: fit on the training data, score on the test data, then render
            error_viz.fit(X_train, y_train)
            error_viz.score(X_test, y_test)
            error_viz.show()
        
        def regression_results(self, y_true : np.ndarray, y_pred : np.ndarray) -> None:
            """
            Print a read out of common regressor metrics

            Reports explained variance, R2, mean and median absolute error, mean
            squared error and its square root. Mean squared log error is added
            only when neither array contains a negative value.

            Parameters
            ----------
            y_true : np.ndarray
                actual results
            y_pred : np.ndarray
                predicted results
            """
            # Local names deliberately differ from the imported ``mean_absolute_error``
            # function so it is not shadowed inside this method.
            explained_variance = metrics.explained_variance_score(y_true, y_pred)
            mae = metrics.mean_absolute_error(y_true, y_pred)
            mse = metrics.mean_squared_error(y_true, y_pred)
            median_ae = metrics.median_absolute_error(y_true, y_pred)
            r2 = metrics.r2_score(y_true, y_pred)
            print('Explained Variance:     ', round(explained_variance,4))
            # The log-based error is only skipped for negative inputs; zeros are valid
            if np.amin(y_true) >= 0 and np.amin(y_pred) >= 0:
                msle = metrics.mean_squared_log_error(y_true, y_pred)
                print('Mean Squared Log Error: ', round(msle,4))
            print('R2:                     ', round(r2,4))
            print('Mean Absolute Error:    ', round(mae,4))
            print('Median Absolute Error:  ', round(median_ae,4))
            print('Mean Squared Error:     ', round(mse,4))
            print('Root Mean Square Error: ', round(np.sqrt(mse),4))
            
        def categorical_pipeline(self, train_df : pd.DataFrame, target_col : str) -> typing.List:
            """
            A pipeline that goes through and finds a list of classifiers and visualises the best one

            Parameters
            ----------
            train_df : pd.DataFrame
                The training data frame; every column except target_col is used as a feature
            target_col : str
                The name of the target column

            Returns
            -------
            typing.List
                (classifier, score) tuples from both comparison helpers, sorted by
                score with the highest first
            """
            # Define X and y
            X = train_df.drop(columns=target_col).values
            y = train_df[target_col].values
            # Compare Linear Classifiers, then add the results of the second comparison
            classifiers = self.linear_classification_compare(X, y)
            classifiers.extend(self.classification_compare(X, y))
            # Highest score first; ties keep the order in which the models were tried
            classifiers = sorted(classifiers, key=lambda tup: tup[1], reverse=True)
            best_model = classifiers[0][0]
            print("******")
            print("Best Classifier")
            print("******")
            print(best_model)
            # The analysis helper receives the full X / y, so no train/test split is made here
            self.categorical_model_analysis(best_model, X, y)
            return classifiers
            
        def regression_pipeline(self, train_df : pd.DataFrame, target_col : str) -> typing.List:
            """
            A pipeline that goes through and finds a list of regressors and visualises the best one

            The best-scoring regressor is refitted on an 80% training split before
            its metrics and plots are produced, so they describe held-out rows.

            Parameters
            ----------
            train_df : pd.DataFrame
                The training data frame; every column except target_col is used as a feature
            target_col : str
                The name of the target column

            Returns
            -------
            typing.List
                (regressor, score) tuples from both comparison helpers, sorted by
                score with the highest first. The first regressor is returned
                fitted on the training split made here.
            """
            # Define X and y
            X = train_df.drop(columns=target_col).values
            y = train_df[target_col].values
            # Compare Linear Regressors, then add the ensemble comparison results
            regressors = self.linear_regression_compare(X, y)
            regressors.extend(self.ensemble_regression_compare(X, y))
            # Highest score first; ties keep the order in which the models were tried
            regressors = sorted(regressors, key=lambda tup: tup[1], reverse=True)
            best_model = regressors[0][0]
            print("******")
            print("Best Regressor")
            print("******")
            print(best_model)
            # Create the train and test data
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            # linear_regression_compare leaves each model fitted on a cross-validation fold
            # whose rows can overlap this random test split, so refit on the training rows
            # only before measuring anything on the test rows.
            best_model.fit(X_train, y_train)
            # Print metrics
            self.regression_results(y_test, best_model.predict(X_test))
            # Plot prediction error of X and y of the data
            self.prediction_error(best_model, X_train, y_train, X_test, y_test)
            # Plot residuals of X and y of the data
            self.residuals_plot(best_model, X_train, y_train, X_test, y_test)
            return regressors

# Introduction

In this notebook I will be using an AutoML library of my own development to do feature engineering, EDA and prediction. This library is entirely contained in the hidden cell above. 

This has been an ongoing development and today I will add the LightGBM and XGBoost classifiers and work on feature selection.

Notebooks on the development of the RabbitML library are below: 

[AutoML from Scratch #1](https://www.kaggle.com/code/taranmarley/automl-from-scratch-1/notebook)

[AutoML from Scratch #2](https://www.kaggle.com/code/taranmarley/automl-from-scratch-2/notebook)

[AutoML from Scratch #3](https://www.kaggle.com/code/taranmarley/automl-from-scratch-3/notebook)

[AutoML from Scratch #4](https://www.kaggle.com/code/taranmarley/automl-from-scratch-4/notebook)

[AutoML from Scratch #5](https://www.kaggle.com/code/taranmarley/automl-from-scratch-5/notebook)

[AutoML from Scratch #6](https://www.kaggle.com/code/taranmarley/automl-from-scratch-6/notebook)

[AutoML from Scratch #7](https://www.kaggle.com/code/taranmarley/automl-from-scratch-7/notebook)

[AutoML from Scratch #8](https://www.kaggle.com/code/taranmarley/automl-from-scratch-8/notebook)

# Load Data

In [None]:
# Read the train and test CSV files from the Kaggle input directory.
df = pd.read_csv("../input/spaceship-titanic/train.csv")
test_df = pd.read_csv("../input/spaceship-titanic/test.csv")

Look at data

In [None]:
# Show the first rows of the training frame as the cell output.
df.head()

Describe all the data

In [None]:
# Summarise every column (include='all'), transposed so each column becomes a row.
df.describe(include='all').T

# Feature Engineering

In [None]:
# Run the library's feature-engineering pipeline over the train and test frames;
# both names are rebound to the frames it returns.
# NOTE(review): `rabbitml` is defined in an earlier cell; how `pipeline` uses
# `id_cols` and `break_up_cols` is not visible here -- presumably "Cabin" is split on "/".
fe = rabbitml.feature_engineering()
df, test_df = fe.pipeline(df, test_df, target_col="Transported", id_cols=["PassengerId"], break_up_cols=[("Cabin", "/")])
df.head(4)

# Exploratory Data Analysis

In [None]:
# Run the library's EDA pipeline on the engineered training frame with
# "Transported" as the target.
eda = rabbitml.eda()
eda.pipeline(df, "Transported", fe, "categorical")
# display() is called with no arguments -- presumably only so that the cell does not
# end on the pipeline call's return value; verify it is needed.
display()

# Observation Selection

It is useful to chuck out observations from training that are seen as anomalous based on the test dataframe.

In [None]:
def iforest_test_elimination(df_temp : pd.DataFrame, test_df_temp : pd.DataFrame, target_col : str) -> pd.DataFrame:
    """
    Eliminate observations from train that are seen as anomalous from the point of view of the test dataset

    An isolation forest is fitted on the test features and then used to label
    every training row; rows labelled -1 are left out of the result.

    Parameters
    ----------
    df_temp : pd.DataFrame
        Training Dataframe containing the training features. This is the dataframe we will be deleting from.
        It is not modified; a filtered copy is returned
    test_df_temp : pd.DataFrame
        Test DataFrame containing the test features. This is the dataframe the isolation forest will be trained on
    target_col : str
        Name of the target column in df_temp. It is dropped before prediction, so the
        remaining columns must line up, in order, with the columns of test_df_temp

    Returns
    -------
    pd.DataFrame
        A copy of df_temp, target column included, without the rows flagged as anomalies
    """
    iforest = IsolationForest(random_state=1, contamination=0.01)
    iforest.fit(test_df_temp.values)
    labels = iforest.predict(df_temp.drop(columns=target_col).values)
    # Filter by position with a boolean mask rather than dropping index labels: this
    # stays correct when the index holds duplicate labels and needs no helper column
    # that could overwrite an existing one.
    return df_temp[labels != -1].copy()

# Filter the engineered training frame using an isolation forest fitted on the test frame.
anomaly_removed_df = iforest_test_elimination(df, test_df, "Transported")

# Feature Selection

Detect the columns that have a similar distribution to the test set

In [None]:
from scipy.stats import ks_2samp

def check_distributions(df_temp : pd.DataFrame, df_test_temp : pd.DataFrame, p_threshold : float = 0.85) -> typing.List[str]:
    """
    Check the distributions between columns between two dataframes

    Each column present in both frames is compared with a two-sample
    Kolmogorov-Smirnov test; a column is kept when the test's p-value is
    strictly greater than p_threshold.

    Parameters
    ----------
    df_temp : pd.DataFrame
        The first dataframe to check the distributions by column
    df_test_temp : pd.DataFrame
        The second dataframe to check the distributions by column
    p_threshold : float
        The p-value a column has to exceed to be kept. Defaults to 0.85

    Returns
    -------
    typing.List[str]
        The shared columns whose p-value exceeds p_threshold, in the column order of df_temp
    """
    return [
        col
        for col in df_temp.columns
        # Columns that only exist in one frame cannot be compared and are skipped
        if col in df_test_temp.columns
        and ks_2samp(df_temp[col], df_test_temp[col]).pvalue > p_threshold
    ]

# Keep only the columns that passed the distribution check in both frames.
same_cols = check_distributions(anomaly_removed_df, test_df)
samed_df = anomaly_removed_df.copy()[same_cols]
# check_distributions only returns columns present in both frames, so the target is
# added back by hand here (presumably the test frame has no "Transported" column).
samed_df["Transported"] = anomaly_removed_df["Transported"]
samed_test_df = test_df[same_cols]
samed_df.head()

In [None]:
# Scale train and test with the library's quantile transform; the two frames are
# transformed by separate calls.
scaled_df = fe.quantile_transform_column_wise(samed_df, "Transported")
scaled_test_df = fe.quantile_transform_column_wise(samed_test_df, "Transported")
# Snapshots of the quantile-transformed frames: `scaled_df` / `scaled_test_df` are
# rebound in the Prediction cell, while the Submission cells use these copies.
final_scaled_df = scaled_df.copy()
final_scaled_test_df = scaled_test_df.copy()
# NOTE(review): the meaning of the 0.001 argument is defined in the library cell,
# not here -- presumably a threshold for the Kendall tau elimination.
new_features = rabbitml.feature_selection.kendall_tau_feature_elimination(scaled_df, scaled_df.columns, "Transported", 0.001)
cprint("Features to keep: " + str(new_features))
eliminated_df = scaled_df.copy()[new_features]
# The list is mutated in place so it can also select test columns; list.remove raises
# ValueError if "Transported" is not among the returned features.
new_features.remove("Transported")
eliminated_test_df = scaled_test_df.copy()[new_features]
# NOTE(review): eliminated_df / eliminated_test_df are not referenced again in the
# cells visible below.
eliminated_df.head(1)

scaled_test_df

# Prediction

In [None]:
predictor = rabbitml.prediction()
# `scaled_df` / `scaled_test_df` are rebound here to min-max scaled frames, replacing
# the quantile-transformed ones from the previous cell.
scaled_df = fe.min_max_column_wise(samed_df, "Transported")
scaled_test_df = fe.min_max_column_wise(samed_test_df, "Transported")
# (classifier, score) tuples sorted best-first, so classifiers[0][0] is the top model.
classifiers = predictor.categorical_pipeline(scaled_df, "Transported")
display()

# Add LightGBM

That's a good rundown of what we have so far but I'd like to add LightGBM to the mix and will do so below:

In [None]:
# LightGBM is an optional dependency: remember whether the import worked
try:
    import lightgbm
except ImportError:
    has_lightgbm = False
else:
    has_lightgbm = True

def lightgbm_classifier(X : np.ndarray, y : np.ndarray) -> typing.Tuple:
    """
    Cross-validate a LightGBM classifier on the given features and target

    The classifier is refitted on each of five KFold splits and scored on the
    held-out rows; the mean fold score is printed.

    Parameters
    ----------
    X : np.ndarray
        The features of the data as a numpy array
    y : np.ndarray
        The target to classify the given features against

    Returns
    -------
    typing.Tuple
        The lightgbm classifier (as left by its last fold fit) and its mean fold
        score. None is returned instead when lightgbm could not be imported
    """
    if not has_lightgbm:
        return None
    model = lightgbm.LGBMClassifier(random_state = 0)
    fold_scores = []
    for fit_rows, eval_rows in KFold(n_splits=5).split(X, y):
        model.fit(X[fit_rows], y[fit_rows])
        fold_scores.append(model.score(X[eval_rows], y[eval_rows]))
    mean_score = sum(fold_scores) / len(fold_scores)
    print("LightGBM", "Score:", mean_score)
    return (model, mean_score)
# Cross-validate LightGBM on the current `scaled_df` (min-max scaled in the Prediction cell).
# NOTE(review): lightgbm_classifier returns None when lightgbm is not installed, so the
# later `lightgbmclf[0]` lookup would fail in that case.
lightgbmclf = lightgbm_classifier(scaled_df.drop(columns="Transported").values, scaled_df["Transported"].values)


In [None]:
# Run the library's classifier analysis for the LightGBM model on the full scaled training data.
predictor.categorical_model_analysis(lightgbmclf[0], scaled_df.drop(columns="Transported").values, scaled_df["Transported"].values)

# Add XGBoost

In [None]:
# XGBoost is an optional dependency: remember whether the import worked
try:
    import xgboost
except ImportError:
    has_xgboost = False
else:
    has_xgboost = True

def xgboost_classifier(X : np.ndarray, y : np.ndarray) -> typing.Tuple:
    """
    Cross-validate an XGBoost classifier on the given features and target

    The classifier is refitted on each of five KFold splits and scored on the
    held-out rows; the mean fold score is printed.

    Parameters
    ----------
    X : np.ndarray
        The features of the data as a numpy array
    y : np.ndarray
        The target to classify the given features against

    Returns
    -------
    typing.Tuple
        The xgboost classifier (as left by its last fold fit) and its mean fold
        score. None is returned instead when xgboost could not be imported
    """
    if not has_xgboost:
        return None
    model = xgboost.XGBClassifier(use_label_encoder=False, eval_metric="logloss")
    fold_scores = []
    for fit_rows, eval_rows in KFold(n_splits=5).split(X, y):
        model.fit(X[fit_rows], y[fit_rows])
        fold_scores.append(model.score(X[eval_rows], y[eval_rows]))
    mean_score = sum(fold_scores) / len(fold_scores)
    print("XGBoost", "Score:", mean_score)
    return (model, mean_score)

# Cross-validate XGBoost on the current `scaled_df` (min-max scaled in the Prediction cell).
# NOTE(review): xgboost_classifier returns None when xgboost is not installed, so the
# later `xgbclassifier[0]` lookups would fail in that case.
xgbclassifier = xgboost_classifier(scaled_df.drop(columns="Transported").values, scaled_df["Transported"].values)

In [None]:
# Run the library's classifier analysis for the XGBoost model on the full scaled training data.
predictor.categorical_model_analysis(xgbclassifier[0], scaled_df.drop(columns="Transported").values, scaled_df["Transported"].values)

In [None]:
from sklearn.ensemble import StackingClassifier

def stacked_classifier(X : np.ndarray, y : np.ndarray, estimators : typing.List, final_estimator : BaseEstimator) -> typing.Tuple:
    """
    Cross-validate a StackingClassifier built from the given estimators

    The stack is refitted on each of five KFold splits and scored on the
    held-out rows; the mean fold score is printed.

    Parameters
    ----------
    X : np.ndarray
        The features of the data as a numpy array
    y : np.ndarray
        The target to classify the given features against
    estimators : typing.List
        The (name, estimator) pairs passed on as the stack's base estimators
    final_estimator : BaseEstimator
        The estimator passed on as the stack's final estimator

    Returns
    -------
    typing.Tuple
        The stacked classifier (as left by its last fold fit) and its mean fold score
    """
    stack = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
    fold_scores = []
    for fit_rows, eval_rows in KFold(n_splits=5).split(X, y):
        stack.fit(X[fit_rows], y[fit_rows])
        fold_scores.append(stack.score(X[eval_rows], y[eval_rows]))
    mean_score = sum(fold_scores) / len(fold_scores)
    print("Stacked Classifier", "Score:", mean_score)
    return (stack, mean_score)

# Stack the top-ranked model from the pipeline comparison with the XGBoost model.
# NOTE(review): the label "histgradientbooster" is attached to classifiers[0][0], which
# is simply whichever model scored best -- confirm that it is the intended estimator.
stacked = stacked_classifier(scaled_df.drop(columns="Transported").values, scaled_df["Transported"].values, estimators=[("histgradientbooster", classifiers[0][0]),("XGB", xgbclassifier[0])], final_estimator=linear_model.LogisticRegression())
display()

In [None]:
# Run the library's classifier analysis for the stacked model on the full scaled training data.
predictor.categorical_model_analysis(stacked[0], scaled_df.drop(columns="Transported").values, scaled_df["Transported"].values)

# Submission

In [None]:
# Show the first rows of the current (min-max scaled) training frame.
scaled_df.head()

In [None]:
# Show the first rows of the engineered test frame.
test_df.head()

In [None]:
# Refit the top-ranked classifier on all training rows.
# NOTE(review): this uses `final_scaled_df` (the quantile-transformed snapshot), whereas
# the ranking in `classifiers` was produced on the min-max scaled frame -- confirm the
# mismatch is intended. The LightGBM, XGBoost and stacked models are not used here.
classifiers[0][0].fit(final_scaled_df.drop(columns="Transported").values, final_scaled_df["Transported"].values)

In [None]:
# Load the sample submission file; its "Transported" column is overwritten in the next cell.
submission_df = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")

In [None]:
# Predict on the quantile-transformed test snapshot and write the submission file.
# Assumes the rows of final_scaled_test_df are in the same order as the sample
# submission -- TODO confirm.
submission_df["Transported"] = classifiers[0][0].predict(final_scaled_test_df.values)
submission_df.to_csv("submission.csv", index=False)

# Conclusion

I am happy with what I have achieved here. The feature selection and prediction for classification have improved markedly, and I will add today's work to the categorical pipeline. I can now expect reasonably good performance out of the box for categorical tasks. I think it would be interesting to work more on model interpretability in the future, potentially with work similar to what is being done with the EDA. A comprehensive feature selection pipeline would be a good addition as well.