# Module 4 Guidance

This notebook is a template for module 4b and 4c. It will be tested in Google Colab, so your code needs to run there.
The structure has been provided to improve consistency and make it easier for markers to understand your code but still give students the flexibility to be creative.  You need to populate the required functions to solve this problem.  All dependencies should be documented in the next cell.

You can:
    add further cells or text blocks to extend or further explain your solution
    add further functions

Don't:
    rename functions
   

# Nomenclature

In [1]:
# Object Nomenclature
# df        -     Data Frame
# lis       -     List
# dict      -     Dictionary
# arr       -     Numpy N-dim Array
# str       -     String
# func      -     Function
# int       -     Integer


# Variable Nomenclature
# cat       -     Category/Categorical
# uni       -     Unique
# feat      -     Feature
# mod       -     Modify
# dt        -     Date Time
# cont      -     Continuous
# num       -     Numerical
# ind       -     Independent
# dep       -     Dependent
# proc      -     Processed
# nm(s)     -     Name(s)

# Module Import

In [2]:
# Fixed dependencies - do not remove or change.
import pytest
import pandas as pd
import numpy as np
from google.colab import drive
# drive.mount('/content/gdrive/')

# Import your dependencies
!pip install xlrd==1.2.0
import datetime as dt
from plotly import graph_objects as go, express as px, subplots as subp
import math
from contextlib import suppress

#Sklearn classes - Preprocessing & Pipeline
from sklearn.model_selection import train_test_split as Tts
from sklearn.compose import make_column_transformer as Ct
from sklearn.preprocessing import OneHotEncoder as Ohe
from sklearn.preprocessing import OrdinalEncoder as Oe
from sklearn.preprocessing import StandardScaler as Ss
from sklearn.pipeline import make_pipeline as Mp

#sklearn classes - feature selection
from sklearn.feature_selection import RFECV

#sklearn classes - cross validation
from sklearn.model_selection import cross_val_score as Cvs

#Sklearn classes - Classification Models
from sklearn.linear_model import LogisticRegression as LogRe
from sklearn.neighbors import KNeighborsClassifier as KNe
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC

#Sklearn Classes - Metrics
from sklearn.metrics import confusion_matrix, accuracy_score




Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xlrd==1.2.0
  Downloading xlrd-1.2.0-py2.py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 12.1 MB/s 
[?25hInstalling collected packages: xlrd
  Attempting uninstall: xlrd
    Found existing installation: xlrd 1.1.0
    Uninstalling xlrd-1.1.0:
      Successfully uninstalled xlrd-1.1.0
Successfully installed xlrd-1.2.0


In [3]:
# Mount Google Drive at /content/drive so the data file can be read from this Colab runtime.
# NOTE(review): `drive` is already imported in the dependency cell above; this re-import is redundant but harmless.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Import data

def import_local_data(file_path):
    """Read the Excel workbook stored at ``file_path`` and hand it back as a pandas dataframe.

    Input:
    1) Path to the Excel file

    Output:
    1) Data frame with the contents returned by pd.read_excel
    """
    return pd.read_excel(file_path)

In [5]:
# Path of the breast-cancer workbook on the mounted Google Drive; adjust it to match your own Drive layout.
local_file_path = "/content/drive/MyDrive/Colab Notebooks/JHub/Module 4/breast-cancer.xls"

In [6]:
# Dont change
# Load the workbook into a data frame through import_local_data.
raw_data = import_local_data(local_file_path)

# Deep copy named according to the dataframe nomenclature (df_*), so raw_data itself is never modified
df_raw = raw_data.copy(deep=True)

# Exploratory Analysis

Conduct exploratory data analysis and explain your key findings - Examine the data, explain its key features and what they look like.  Highlight any fields that are anomalous.

Functions to generate data frame of unique values for each feature and a list of features containing date time or time values.

In [7]:
#function to return a data frame of unique categories for each feature - requires dataframe passing to it
def func_cat_uni(df):
  """Builds a data frame listing the unique values found in every feature.

  Input:
  1) Data frame with raw data

  Output:
  1) Data frame with one column per feature holding that feature's unique values;
     features with fewer unique values than the longest column are padded with NaN
  """

  # Wrapping each list in a Series lets columns of unequal length share a single frame
  dict_uni = {header: pd.Series(list(df[header].unique())) for header in df.columns}

  return pd.DataFrame(dict_uni)

#function to return a list of features containing date or date-time categories - assuming they encoded this way in error
def func_feat_time_date(df):
  """Lists the features that hold at least one datetime or date value.

  Input:
  1) Data frame

  Output:
  1) List of feature names (in column order) containing one or more datetime/date values
  """

  # A single datetime/date value is enough to flag the feature; any() stops at the first hit
  tup_dt_types = (dt.datetime, dt.date)

  return [header for header in df.columns
          if any(isinstance(value, tup_dt_types) for value in df[header])]

Function to convert date time/date values to text string

In [8]:
#function to convert date time values to string number range
def func_conv_dt_str(df, lis_dt_feat):
  """Converts any date time or date values to a string number range ("<lower>-<upper>")

  For every listed feature, the gaps (upper - lower) of the values that are already
  valid string ranges are collected. Each datetime/date value is then broken into
  two-digit numbers, and a pair of those numbers whose difference matches one of the
  collected gaps is used to rebuild the range string.

  Input:
  1) Data frame with raw data
  2) List of features with date time or time encoded values - compiled by func_feat_time_date

  Output:
  1) Copy of the data frame with date time encoded values converted to text string
     (values for which no matching pair is found are left unchanged)
  2) Text output (printed) detailing features containing date time or date encoded values
  """

  #Lists used only for the summary counts printed at the end:
  #datetime rows found, and datetime rows still present after conversion
  lis_total_dt = [];   lis_total_dt_remain = []

  #Generate a copy of the dataframe - this copy receives the converted values and is returned
  df_wk2 = df.copy(deep=True)

  #iterate through all features that have date time in them
  for feature in lis_dt_feat:

    # make working data frame for column header and remove any rows containing nan
    df_wk = pd.DataFrame(df[feature]).dropna(subset=[feature])

    # add lower and upper bound columns for value range and convert to numeric, then calculate difference and sort by lower value in each range
    # NOTE(review): assumes the split yields exactly two columns, i.e. no string value contains more than one "-" - confirm against the data
    # NOTE(review): datetime/date entries presumably come out of .str.split as NaN, leaving their L/R/diff empty until filled in below - confirm
    df_wk[['L','R']] = df_wk[feature].str.split("-" , expand=True).apply(pd.to_numeric)
    df_wk['diff'] = df_wk['R'] - df_wk['L']
    df_wk['map'] = np.nan
    # the index is reset here, so the row labels used below are positions in the sorted working frame, not labels of df
    df_wk = df_wk.sort_values(['L']).reset_index(drop = True)

    #form a list of differences between lower and upper values - when converting date times to ranges, it will check that the calculated difference matches a value in this array
    lis_diff = list(np.int_((df_wk['diff'].unique())[~np.isnan(df_wk['diff'].unique())]))

    #create empty dictionary to populate with key pair values of date time code and associated string format, that will be used to map datetime to string later in script
    dict_date_time_map = {}

    #Iterates through the Index of all date time values in df_wk
    for row in [a for a in df_wk.index if isinstance(df_wk[feature].loc[a], dt.datetime) | isinstance(df_wk[feature].loc[a], dt.date)]:

      #Appends row to lis_total_dt (only the length of this list is reported)
      lis_total_dt.append(row)

      #extracts 2 digit integers from datetime values in dataframe - row by row
      #the text before the first space of str(value) is split on "-" and each part is cut into 2-character chunks
      lis_nums = [int(val) for val in [val for vals in [[val[i:i+2] for i in range(0, len(val),2)] for val in str(df_wk[feature].loc[row]).split(" ")[0].split("-")] for val in vals]]

      #Calculates differences between all combinations of two digit integers in lis_nums, if the difference calculated is in lis_diff, the numbers are written to the dataframe
      #NOTE(review): there is no break, so when several pairs match the last pair in iteration order wins - confirm this is intended
      for val_1 in reversed(lis_nums):
        for val_2 in reversed(lis_nums):
          if val_2 - val_1 in lis_diff:
            df_wk.loc[row,['L','R','diff','map']] = [val_1,val_2,(val_2-val_1),("{}-{}").format(val_1,val_2)]
            dict_date_time_map[df_wk[feature].loc[row]] = ("{}-{}").format(val_1,val_2)

    #Print the feature and corresponding number of values formatted as datetime or date
    print('Feature: {} - Total datetime/date values: {}'.format(feature,len([a for a in df_wk.index if isinstance(df_wk[feature].loc[a], dt.datetime) | isinstance(df_wk[feature].loc[a], dt.date)])))

    #Map date time values to string formatted ranges in the returned copy
    df_wk2[feature] = df_wk2[feature].replace(dict_date_time_map.keys(),dict_date_time_map.values())

    #Collect the rows still formatted as date time after attempted conversion (list comprehension used only for its side effect)
    [lis_total_dt_remain.append(val) for val in [a for a in df_wk2.index if isinstance(df_wk2[feature].loc[a], dt.datetime) | isinstance(df_wk2[feature].loc[a], dt.date)]]
    
  #Summary across all features
  print('Features containing datetime/date values: {}'.format(len(lis_dt_feat)))
  print('Total datetime/date formatted values: {}'.format(len(lis_total_dt)))
  print('Total remaining datetime/date values: {}'.format(len(lis_total_dt_remain)))

  return(df_wk2)


Function to drop rows

In [9]:
def func_drop_dt(df):
  """Drops any rows that still contain a date time or date value, or a missing (NaN/None) value.

  Input:
  1) Data frame

  Output:
  1) Copy of the data frame with all datetime/date value/nan rows dropped
  2) List of index labels of the rows that were dropped
  """

  #Make copy of dataframe so the caller's frame is left untouched
  df_wk = df.copy(deep=True)

  #iterate through dataframe to check if a value in the row is date time, date or missing and append row index to list
  lis_row_drop = []

  #itertuples walks the rows positionally, so this also works when index labels are not unique
  for row, lis_elm in zip(df_wk.index, df_wk.itertuples(index=False, name=None)):
    for elm in lis_elm:
      #NaN never compares equal to anything (including itself), so `elm == np.nan` can never be True;
      #missing values have to be detected with pd.isna. The is_scalar guard keeps pd.isna from
      #returning an array if a cell ever holds a list-like object.
      if isinstance(elm, (dt.datetime, dt.date)) or (pd.api.types.is_scalar(elm) and pd.isna(elm)):
        lis_row_drop.append(row)
        break

  #drop rows
  df_wk = df_wk.drop(index=lis_row_drop)

  print("Datetime/date/ formatted rows or NaN rows dropped: {}".format(len(lis_row_drop)))

  return(df_wk, lis_row_drop)

Function to plot histograms

In [10]:
def func_plot_histo(df, str_dep_feat, lis_ind_feat_no_plot):
  """Plots a normalised histogram of each independent feature, split by recurrence class.

  Every feature is first classified as continuous ("cont") when all of its values are
  strings of the form "<number>-<number>", otherwise as categorical ("cat").
  Continuous features are ordered on the x axis by the lower bound of their range,
  categorical features by their own values.
  Will not work with date time/date encoded values, must either convert to string or drop date time/date values from data frame before plotting

  Input:
  1) Data frame
  2) Dependent feature - the column holding 'no-recurrence-events' / 'recurrence-events'
  3) Features to not include in the plot - if not relevant.

  Output:
  1) Return value of fig.show(); the figure is rendered as a side effect
  """

  #Generate copy of dataframe - a helper 'sort' column is added to it below
  df_wk = df.copy(deep=True)

  #Compile a dictionary of features with values stating whether the plot is numerical continuous or categorical
  dict_feat_cont_cat = {}

  for feature in df_wk.columns.drop(str_dep_feat):

    #checks that every value splits into exactly 2 numeric parts - then data is classified as numerical continuous, otherwise categorical
    try:
      is_range = all([len(elm.split("-")) == 2 and all([elm_split.isnumeric() for elm_split in elm.split("-")]) for elm in df_wk[feature]])
    except (AttributeError, TypeError):
      #non-string values (e.g. ints/floats) cannot be split, so the feature is treated as categorical
      is_range = False

    dict_feat_cont_cat[feature] = "cont" if is_range else "cat"

  #list of independent features to plot
  lis_ind_feat_plot = list(df_wk.columns.drop(lis_ind_feat_no_plot).drop(str_dep_feat))

  #calculate number of rows and columns required for a near-square grid of plots
  int_ind_feats = len(lis_ind_feat_plot)
  r, c = int(round(math.sqrt(int_ind_feats))), int(math.ceil(math.sqrt(int_ind_feats)))

  #Generate Plot space
  fig = subp.make_subplots(rows = r, cols = c, subplot_titles=lis_ind_feat_plot)
  fig.update_layout(height=1000, width=1500, title_text='Normalised Probability')

  for i, feature in enumerate(lis_ind_feat_plot):

    #grid position (1-based), filled row by row
    row, col = 1 + i // c, 1 + i % c

    #categorical plot - sort by the feature's own values
    if dict_feat_cont_cat[feature] == 'cat':
      sort_method = feature

    #continuous plot - sort by a helper column holding the lower end of each value range
    else:
      df_wk['sort'] = df_wk[feature].str.split("-" , expand=True).apply(pd.to_numeric)[0]
      sort_method = 'sort'

    df_sorted = df_wk.sort_values(by=sort_method)

    #the class mask uses the str_dep_feat argument (previously the column name 'Class' was hard-coded here)
    ser_class = df_sorted[str_dep_feat]
    fig.add_trace(go.Histogram(x=df_sorted[feature][ser_class == 'no-recurrence-events'], histnorm='probability',name="No Recurrence", marker=dict(color='green')),row = row, col=col)
    fig.add_trace(go.Histogram(x=df_sorted[feature][ser_class == 'recurrence-events'], histnorm='probability', name = "Recurrence", marker=dict(color='red')), row = row, col=col)

  return(fig.show())


Function to convert string number ranges to the mean value of each range

In [11]:
def func_conv_str_int(df):
  """Replaces "<lower>-<upper>" string ranges with the mean of their two bounds.

  A feature is converted only when every one of its unique, non-missing values is a
  string made of exactly two numeric parts separated by "-". All other features are
  left as they are.

  Input:
  1) Modified Data frame with string ranges/strings/floats only - no datetime/date values

  Output:
  1) Copy of the data frame with string ranges replaced by mean value of range.
  2) Dictionary {feature: {range string: mean value}} for the converted features
  """

  def _is_num_range(text):
    #True for strings such as "10-14": exactly two parts, both numeric
    lis_parts = text.split("-")
    return len(lis_parts) == 2 and all([part.isnumeric() for part in lis_parts])

  df_out = df.copy(deep=True)
  dict_int_cat = {}

  for feature in df_out.columns:

    #unique values of the feature with missing entries removed
    ser_uni = pd.Series(list(df[feature].unique())).dropna()

    #any error while checking/converting (e.g. a non-string value) means the feature is skipped
    with suppress(Exception):
      #list (not generator) so that every value is inspected before deciding
      if all([_is_num_range(val) for val in ser_uni]):
        dict_int_cat[feature] = {val: (int(val.split("-")[1]) + int(val.split("-")[0]))/2 for val in ser_uni}

  #swap the range strings for their mean values in the working copy
  for feature, dict_map in dict_int_cat.items():
    df_out[feature] = df_out[feature].replace(dict_map.keys(), dict_map.values())

  return(df_out, dict_int_cat)

## Plot Data

In [12]:
#Generate data frame of unique categories for each feature - allows to inspect categorical/continuous data
df_uni = func_cat_uni(df_raw)

#Generate list of features where there are date values - passed to function to convert to string from datetime/date
lis_dt_feat = func_feat_time_date(df_raw)

#Generate data frame of converted date time to string ranges 
df_mod = func_conv_dt_str(df_raw, lis_dt_feat)

#Generate data frame with all rows containing date time values removed - any rows unsuccessfully converted to str to be removed
df_drop, lis_row_drop = func_drop_dt(df_mod)

#Determine dependent feature - in this case class - i.e. recurrent or non recurrent
str_dep_feat = 'Class'

#compile list of independent features not to plot - i.e. if assessed that they have no effect on the dependent feature
lis_ind_feat_no_plot = []

#Plot function - produces normalised histograms of each independent feature - split between recurrent and non-recurrent
#NOTE(review): df_mod is plotted here rather than df_drop; the two only differ if func_drop_dt removed rows - confirm which is intended
func_plot_histo(df_mod, str_dep_feat, lis_ind_feat_no_plot)

#Convert string number range features to numerical mean value
#df_mod2, dict_str_int_map = func_conv_str_int(df_mod)
#func_plot_histo(df_mod2,str_dep_feat, lis_ind_feat_no_plot)



Feature: tumor-size - Total datetime/date values: 32
Feature: inv-nodes - Total datetime/date values: 66
Features containing datetime/date values: 2
Total datetime/date formatted values: 98
Total remaining datetime/date values: 0
Datetime/date/ formatted rows or NaN rows dropped: 0


In [13]:
#Convert string number range features to numerical mean value, keeping the range -> mean mapping for reference
df_mod2, dict_str_int_map = func_conv_str_int(df_mod)
#Re-plot the histograms using the converted (numerical) features
func_plot_histo(df_mod2,str_dep_feat, lis_ind_feat_no_plot)

Findings from histogram analysis:
1. The breast and breast-quad features are likely not related to the dependent variable
2. An increase in deg-malig correlates with an increased probability of recurrence

# Data Preprocessing

Create any data pre-processing that you will conduct on seen and unseen data.  Regardless of the model you use, this dataframe must contain only numeric features and have a strategy for any expected missing values. Any objects that are needed to handle the test data and that depend on the training data can be stored in the model class.  You are recommended to use sklearn Pipelines or similar functionality to ensure reproducibility.

In [14]:
class Module4_Model:
    """Wraps data splitting, preprocessing, training and evaluation of one classifier.

    Typical call order: model_selection -> split_data -> preprocess_training_data
    -> preprocess_test_data -> train_predict -> metrics.
    The remaining methods (cross_validate, feature_selection) work on the complete
    data set stored by split_data and the column transformer built by
    preprocess_training_data.
    """
    
    def __init__(self):  
      #Estimator to train; assigned through model_selection
      self.model = None
    
    def split_data(self, df, str_dep_feat, lis_ind_feat_no_analyse, test_size, strategy):
      """Cleans the data, drops unwanted features and splits into training and test sets

      Input:
      1) Data frame
      2) String of dependent feature - in this case class
      3) List of features that will not be considered in the analysis
      4) Test size for Training Test Split
      5) Strategy - if 'int', it will call the function to convert str ranges to mean values of numerical string ranges

      Output:
      1) Training data frame (independent features joined with the dependent feature)
      2) Test data frame (independent features joined with the dependent feature)

      Also stores the complete independent/dependent data as self.df_x / self.df_y and
      records the arguments on the instance.
      """
      df_wk = df.copy(deep=True)

      #Generate list of features where there are date values - passed to function to convert to string from datetime/date
      lis_dt_feat = func_feat_time_date(df_wk)

      #Generate data frame of converted date time to string ranges 
      df_wk = func_conv_dt_str(df_wk, lis_dt_feat)

      #drop any remaining time date rows
      df_wk = func_drop_dt(df_wk)[0] 

      #convert string number range features to numerical mean value (the mapping dictionary is not needed here)
      if strategy == 'int':
        df_wk, _ = func_conv_str_int(df_wk)

      #store dataframes for whole independent and dependent feature data sets - used by cross_validate and feature_selection
      self.df_x = df_wk.drop(list(lis_ind_feat_no_analyse)+[str_dep_feat],axis=1)
      self.df_y = pd.DataFrame(df_wk[str_dep_feat])

      #split using sklearn; fixed random_state keeps the split reproducible
      df_x_train, df_x_test, df_y_train, df_y_test = Tts(self.df_x, df_wk[str_dep_feat], test_size = test_size, random_state=1)

      #Assign attributes to the class instance
      self.dependent_feature = str_dep_feat
      self.features_not_analysed = lis_ind_feat_no_analyse
      self.test_size = test_size
      self.strategy = strategy

      #Return respective y and x dataframes joined to meet requirements of Module4_Model, the outputs of which will be training_df and test_df
      return(df_x_train.join(df_y_train), df_x_test.join(df_y_test))       

    def model_selection(self, model):
      """Stores the estimator to be trained. A later call replaces the earlier choice.

      Must be called before preprocess_training_data, because that method places the
      current self.model into the pipeline.
      """
      self.model = model

    def preprocess_training_data(self, training_df):
      """Builds the column transformer and pipeline, fits the transformer on the training data

      Input:
      1) Training data frame including the dependent feature (as returned by split_data)

      Output:
      1) Transformed independent training features

      Features whose values are all strings are one-hot encoded, features whose values
      are all ints/floats are standard scaled, anything else is passed through unchanged.
      Stores self.ct_x, self.pipe, self.df_training_y, self.arr_training_x and
      self.arr_training_x_nms.
      """
      #Generates copy of data frame
      df_wk = training_df.copy(deep=True) 
      
      #determine encoding type for independent features: One Hot Encoder for strings and standard scaler for integers and floats
      lis_ohe = [feature for feature in df_wk.columns.drop(self.dependent_feature) if all([isinstance(elm, str) for elm in df_wk[feature]])]
      lis_ss = [feature for feature in df_wk.columns.drop(self.dependent_feature) if all([isinstance(elm, (int,float)) for elm in df_wk[feature]])]     

      #make column transformer object; handle_unknown='ignore' so categories unseen in training do not raise at transform time
      self.ct_x = Ct((Ohe(sparse=False, drop=None, handle_unknown='ignore'), lis_ohe), (Ss(),lis_ss), remainder='passthrough' )

      #make 2 step pipeline object - column transformer and the currently selected model
      self.pipe = Mp(self.ct_x, self.model)

      #generate data frame for dependent feature - no encoding required
      self.df_training_y = pd.DataFrame(df_wk[self.dependent_feature])

      #fit the transformer on the training features only and keep the transformed result
      self.arr_training_x = self.pipe[0].fit_transform(df_wk.drop(self.dependent_feature, axis=1))
      self.arr_training_x_nms = self.pipe[0].get_feature_names_out()
      
      return self.arr_training_x   

    def preprocess_test_data(self,test_df):
      """Applies the transformer fitted on the training data to the test data

      Input:
      1) Test data frame including the dependent feature (as returned by split_data)

      Output:
      1) Transformed independent test features

      Requires preprocess_training_data to have been called first.
      Stores self.arr_test_x, self.arr_test_x_nms and self.df_test_y.
      """
      #Generates copy of data frame
      df_wk = test_df.copy(deep=True)  
      
      #Apply the already fitted column transformer to Independent features (transform only - no re-fit)
      self.arr_test_x = self.pipe[0].transform(df_wk.drop(self.dependent_feature, axis=1))
      self.arr_test_x_nms = self.ct_x.get_feature_names_out()

      #Generate dataframe of dependent feature
      self.df_test_y = pd.DataFrame(df_wk[self.dependent_feature])

      return self.arr_test_x

    def train_predict(self):
      """Fits the pipeline's model on the transformed training data and predicts the test data

      Requires both preprocess methods to have been called. Stores the predictions as
      the data frame self.df_predict_y.
      """
      #ravel turns the single column dependent data frame into the 1-d array expected by fit
      self.pipe[1].fit(self.arr_training_x, np.array(self.df_training_y).ravel())
      self.df_predict_y = pd.DataFrame(self.pipe[1].predict(self.arr_test_x))
    
    def cross_validate(self,model):
      """Runs 10 fold cross validation (accuracy) of the given model on the complete data set

      Stores the pipeline as self.pipe_cv and the array of fold scores as self.cvs.
      """
      #remake pipeline with the model under test
      #NOTE(review): cross_val_score is documented to clone the estimator per fold, so the fitted self.ct_x should not be altered here - confirm
      self.pipe_cv = Mp(self.ct_x, model)
      #10 fold cross validation
      self.cvs = Cvs(self.pipe_cv, self.df_x, np.array(self.df_y).ravel(), cv=10, scoring='accuracy')

    def metrics(self):
      """Prints the confusion matrix and accuracy of the test predictions (true values first, predictions second)"""
      cm = confusion_matrix(self.df_test_y, self.df_predict_y)
      print(cm)
      print(accuracy_score(self.df_test_y, self.df_predict_y))

    def tune(self):
      """Placeholder for model tuning - not implemented"""
      pass

    def feature_selection(self, model):
      """Runs recursive feature elimination with 10 fold cross validation on the complete data set

      Stores the pipeline as self.pipe_rfecv and the fitted RFECV object as self.rfecv.
      """
      #local import - only this method needs it
      from sklearn.base import clone

      #remake pipeline with an unfitted copy of the column transformer: fitting self.ct_x itself on the
      #complete data set would overwrite the training-only fit used by preprocess_test_data
      self.pipe_rfecv = Mp(clone(self.ct_x), model)
      
      #create recursive feature elimination object
      self.rfecv = RFECV(estimator=model, step=1, min_features_to_select=1, cv=10,scoring='accuracy')
      self.rfecv.fit(self.pipe_rfecv[0].fit_transform(self.df_x), np.array(self.df_y).ravel())
      

## Single Model Case

Splitting the data requires a dependent feature (in this case 'Class'), a list of features to exclude from the analysis if required, a strategy for value conversion, and a test size for the train/test split.

In [25]:
# Dont change - initialise the model
my_model = Module4_Model()

#Determine dependent feature - in this case class - i.e. recurrent or non recurrent
str_dep_feat = 'Class'

#compile list of independent features not to analyse
lis_ind_feat_no_analyse = []#['breast', 'breast-quad']

#setting strategy to 'int' will call the function func_conv_str_int; any other value keeps the string ranges
strategy = 'str'

#test size is passed to the training test split function
test_size = 0.2

#Assign Models to the analysis
#NOTE: model_selection simply stores the model, so the second call replaces the first - only the KNeighbors model is trained below
my_model.model_selection(LogRe(random_state=1))
my_model.model_selection(KNe(n_neighbors=10))

#Split the data into training and test data
x_train, x_test = my_model.split_data(df_raw, str_dep_feat, lis_ind_feat_no_analyse, test_size, strategy)

# Dont change
x_train_processed = my_model.preprocess_training_data(x_train)

# Dont change
x_test_processed = my_model.preprocess_test_data(x_test)

# Predict outcome
my_model.train_predict()

#analyse metrics - prints the confusion matrix followed by the accuracy
my_model.metrics()


Feature: tumor-size - Total datetime/date values: 32
Feature: inv-nodes - Total datetime/date values: 66
Features containing datetime/date values: 2
Total datetime/date formatted values: 98
Total remaining datetime/date values: 0
Datetime/date/ formatted rows or NaN rows dropped: 0
[[38  0]
 [17  3]]
0.7068965517241379


## Cross validate multiple models

In [16]:
#loop through multiple models:
#candidate classifiers to compare; random_state is fixed wherever the constructor call sets it so runs are repeatable

model_list = [LogRe(random_state=1), 
              KNe(n_neighbors=10), 
              SVC(kernel = 'linear', random_state = 1), 
              SVC(kernel = 'rbf', random_state = 1), 
              GNB(), 
              DTC(criterion = 'entropy', random_state = 0),
              RFC(n_estimators = 10, criterion = 'entropy', random_state = 1) ]
              
#cross validate each model on the data stored by split_data and print strategy, model and mean fold accuracy
for model in model_list:
  my_model.cross_validate(model)
  print(my_model.strategy, model, my_model.cvs.mean())

str LogisticRegression(random_state=1) 0.705911330049261
str KNeighborsClassifier(n_neighbors=10) 0.7307881773399014
str SVC(kernel='linear', random_state=1) 0.6785714285714286
str SVC(random_state=1) 0.7447044334975369
str GaussianNB() 0.636576354679803
str DecisionTreeClassifier(criterion='entropy', random_state=0) 0.6607142857142858
str RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=1) 0.7131773399014778


## Feature Selection

## Model Tuning

# Unit tests:

Checking training and test data for null values. This will work for both pd dataframes and np arrays, and ensures no null values exist.

In [18]:
def test_no_nulls(data):
    """ Assert no null values within pd dataframe or np array """
    
    # if data is numpy array, handle accordingly
    if isinstance(data, (np.ndarray)):
        assert not np.isnan(np.min(data))
    
    # if not np array, assume data is pandas dataframe
    else:
        assert data.isna().sum().sum() == 0

In [19]:
# run null data unit test on both training and test data
# (both are the transformed feature sets returned by the preprocess methods of my_model)
test_no_nulls(x_train_processed)
test_no_nulls(x_test_processed)

# Unused Code

Function to split data

In [20]:
def func_split(df, str_dep_feat, lis_ind_feat_no_analyse, test_size, strategy):
  """Cleans the data, drops unwanted features and splits into training and test sets

  Input:
  1) Data frame
  2) String of dependent feature - in this case class
  3) List of features that will not be considered in the analysis
  4) Test size for Training Test Split
  5) Strategy - if 'int', it will call the function to convert str ranges to mean values of numerical string ranges

  Output:
  1) Training data frame (independent features joined with the dependent feature)
  2) Test data frame (independent features joined with the dependent feature)

  """
  #Generate list of features where there are date values - passed to function to convert to string from datetime/date
  lis_dt_feat = func_feat_time_date(df)

  #Generate data frame of converted date time to string ranges 
  df_wk = func_conv_dt_str(df, lis_dt_feat)

  #Generate data frame with all rows containing date time values or nan removed - any rows unsuccessfully converted to str to be removed
  df_wk = func_drop_dt(df_wk)[0] 

  #convert string number range features to numerical mean value (the mapping dictionary is not needed here)
  if strategy == 'int':
    df_wk, _ = func_conv_str_int(df_wk)

  #split using sklearn and drop features not being considered.
  #Both X and y are taken from the cleaned frame df_wk: taking y from the original df would give
  #a different number of rows than X whenever func_drop_dt removed rows
  df_x = df_wk.drop(list(lis_ind_feat_no_analyse)+[str_dep_feat],axis=1)
  df_x_train, df_x_test, df_y_train, df_y_test = Tts(df_x, df_wk[str_dep_feat], test_size = test_size, random_state=1)

  #Return respective y and x dataframes joined to meet requirements of Module4_Model, the outputs of which will be training_df and test_df

  return(df_x_train.join(df_y_train), df_x_test.join(df_y_test))


In [21]:
def func_pre_process(df, str_dep_feat):
  """Encodes the independent and the dependent features of a data frame

  Features whose values are all strings are one-hot encoded, features whose values
  are all ints/floats are standard scaled, anything else is passed through unchanged.

  Input:
  1) Data frame including the dependent feature
  2) String of dependent feature

  Output:
  1) Transformed independent features
  2) Output feature names of the independent transformer
  3) Transformed dependent feature
  4) Output feature names of the dependent transformer
  """
  df_enc = df.copy(deep=True)

  def _all_of(feature, types):
    #True when every value of the feature is an instance of the given type(s)
    return all([isinstance(elm, types) for elm in df_enc[feature]])

  #string features take priority; a feature is only considered numerical if it is not a string feature
  lis_ohe = [feature for feature in df_enc if _all_of(feature, str)]
  lis_ss = [feature for feature in df_enc if feature not in lis_ohe and _all_of(feature, (int, float))]

  def _make_ct(keep):
    #column transformer restricted to the features for which keep(feature) holds
    return Ct((Ohe(sparse=False, drop=None), [elm for elm in lis_ohe if keep(elm)]),
              (Ss(), [elm for elm in lis_ss if keep(elm)]),
              remainder='passthrough')

  #independent features: everything except the dependent feature
  ct_x = _make_ct(lambda elm: elm != str_dep_feat)
  arr_x_proc = ct_x.fit_transform(df_enc.drop(str_dep_feat, axis=1))
  arr_x_feat_nms = ct_x.get_feature_names_out()

  #dependent feature: encoded by its own transformer
  ct_y = _make_ct(lambda elm: elm == str_dep_feat)
  arr_y_proc = ct_y.fit_transform(pd.DataFrame(df_enc[str_dep_feat]))
  arr_y_feat_nms = ct_y.get_feature_names_out()

  return(arr_x_proc, arr_x_feat_nms, arr_y_proc, arr_y_feat_nms)

In [22]:
      # #column transformer for independent features
      # ct_x = Ct((Ohe(sparse=False, drop=None), lis_ohe), (Ss(),lis_ss), remainder='passthrough' )
      # self.training_x = ct_x.fit_transform(df_wk.drop(self.dependent_feature, axis=1))
      # self.training_x_nms = ct_x.get_feature_names_out()

      # #dependent feature
      # self.training_y = pd.DataFrame(df_wk[self.dependent_feature])

      # #Assign attributes to the instance - column transformer objects
      # self.ct_x = ct_x

      #       #rewrite for generator
      # for feature in df_wk.columns.drop(self.dependent_feature):

      #   # check all string - One Hot Encoder
      #   if all([isinstance(elm, str) for elm in df_wk[feature]]):
      #     lis_ohe.append(feature)
          
      #   # check all floats or integers - standard scaler
      #   elif all([isinstance(elm, (int, float)) for elm in df_wk[feature]]):
      #     lis_ss.append(feature)

      #   else:
      #     pass

