### Data Validation

##### Import Libraries

In [None]:
import os
import json
from pyspark.sql.types import *
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  
from random import sample 

#### Config

(Provide the data paths, the directory path for the results, and the data format if it is not "delta")

In [None]:
# Define Data Paths

base_data_path = '/mnt/data_timepoint1'

data_to_validate_path = '/mnt/data_timepoint2'

# Define Data Format (delta/csv/..)

data_format = "delta"

# Define directory to store results

dir_path = "/mnt"

###----------------------------------- Optional ------------------------------------------

## Change criterion & threshold values as needed.
## Criteria recognised by the comparison logic in measure_drift:
##   "equality", "less_than", "difference", "reverse_difference", "ratio", "inRange"
##   and "within" (the entry carries nested per-key thresholds, see "Presence").
## Any other criterion string is silently ignored by the comparison, so spelling matters.

threshold_setting = {  "Data Type" : {'criterion': "equality"},
                     
                       "Data size" : {'criterion': "equality"},
                     
                       "No of Features" : {'criterion': "equality"},
                     
                       "Presence"  : {'criterion': "within", 
                                      "Min_fraction" : {'criterion': "reverse_difference",
                                                        'value' : 0.05}},
                     
                       'isComplete' : {'criterion': "equality"},
                     
                       'isPositive' : {'criterion': "equality"},
                     
                       # Was "less than" (with a space), which matches no comparison branch.
                       'Minimum' : {'criterion': "less_than"},
                     
                       "Average"   : {'criterion': "ratio",
                                      'value': 1.5},
                     
                       'Standard Deviation' : {'criterion': "ratio",
                                               'value': 2},
                     
                       # NOTE: the "inRange" comparison in measure_drift does not read 'value'.
                       'Range' : {'criterion': "inRange",
                                  'value': 1.5},
                     
                       'Skewness' : {'criterion': "ratio",
                                     'value': 1.5},
                     
                       "Mininum_categories" : {'criterion': "less_than"}}


## Provide the columns/features of interest (if any); an empty list means "all features"

drift_for_particular_features_only = []


## To update any rules (either standard values for a feature or the threshold used for the feature)

# Expected values per feature, keyed by the statistic names used in the rules.
# Eg - {"Age": {"Average": 23, ...}}
change_default_standard_values_for_features = {}

# Thresholds per feature and statistic; each entry needs a lower-case 'criterion' and, where the criterion uses one, a 'value'.
# Eg - {"Age": {"Average": {"criterion": 'ratio', 'value': 1.5}}, "Height" : {...}}
change_default_threshold_for_features = {}

#### Read Data

In [None]:
# Load the baseline dataset that the rules are derived from.
# NOTE(review): `spark` is not defined in this notebook — presumably the session object injected by the runtime; confirm.
sdf = spark.read.format(data_format).load(base_data_path)

### Create Rules

In [None]:
def create_rules(statistics, default_threshold_setting = None):
  """Build the validation rules (expected values plus thresholds) from base statistics.

  Parameters
  ----------
  statistics : dict
      Statistics with "Dataset", "Numerical" and "Categorical" sections, where the
      feature sections map feature name -> statistics dict.
  default_threshold_setting : dict, optional
      Mapping statistic name -> threshold spec. When omitted, the notebook-level
      `threshold_setting` is looked up at call time, so re-running the config cell
      takes effect without re-defining this function.

  Returns
  -------
  dict
      {"Dataset": ..., "Numerical": {feature: ...}, "Categorical": {feature: ...},
       "Threshold": {"Dataset": ..., "Numerical": {"Feature-wise", "Global"},
                     "Categorical": {"Feature-wise", "Global"}}}
      Threshold entries are independent copies of the settings.
  """
  import copy  # local import: only needed to decouple the rules from the settings dict

  if default_threshold_setting is None:
    default_threshold_setting = threshold_setting

  dataset_stat_keys = ["Data size",
                       "No of Features"]

  numerical_stat_keys = ["Data Type",
                         "Presence",
                         "Average",
                         'Standard Deviation',
                         'Range',
                         'Skewness',
                         'isPositive',
                         'isComplete']

  categorical_stat_keys = ["Data Type",
                           "Presence",
                           "Mininum_categories",
                           'isComplete']

  #-----------------------------------------Function to compute other required Statistics -----------------------------------------------

  def generate_stat(stats, key):
    """Derive a rule value that is not stored directly in the statistics."""

    if key == "Range":
      return [stats["Minimum"], stats['Maximum']]

    elif key == "isComplete":
      # Compare the percentage itself: int() would truncate e.g. 0.5 % down to 0.
      return bool(stats["Missing Value Percentage"] == 0)

    elif key == "Presence":
      return {"Min_fraction" : (1 - stats["Missing Value Percentage"] / 100)}

    elif key == "isPositive":
      minimum = stats["Minimum"]
      # No minimum means no values at all, hence no negative values either.
      # Comparing the raw value avoids int() truncating -0.5 to 0.
      return True if minimum is None else bool(minimum >= 0)

    elif key == "Mininum_categories":
      return stats["No of Distinct Categories"]

    elif key == "isContainedIn":
      return stats["distinct_categories"]

  def thresholds_for(stat_keys):
    """Independent copy of the threshold specs for the given statistic names."""
    return {k : copy.deepcopy(default_threshold_setting[k]) for k in stat_keys}

  def feature_rules(feature_stats, stat_keys):
    """Take each rule value from the statistics when present, otherwise derive it."""
    return {k : (feature_stats[k] if k in feature_stats else generate_stat(feature_stats, k))
            for k in stat_keys}

  # Key order matters downstream: the drift dictionary follows the order of these sections.
  rules_dic = {"Dataset": {}, "Numerical": {}, "Categorical": {}}

  #----------------------------------------------------------Define Threshold---------------------------------------------------------------------

  rules_dic["Threshold"] = {"Dataset": thresholds_for(dataset_stat_keys),

                            "Numerical" : {"Feature-wise": {feat: thresholds_for(numerical_stat_keys)
                                                            for feat in statistics['Numerical'].keys()},
                                           "Global": thresholds_for(numerical_stat_keys)},

                            "Categorical" : {"Feature-wise": {feat: thresholds_for(categorical_stat_keys)
                                                              for feat in statistics['Categorical'].keys()},
                                             "Global": thresholds_for(categorical_stat_keys)}}

  #--------------------------------------- Generate Rules from Statistics ----------------------------------------------

  for key in statistics.keys():

    if key == "Numerical":

      for feature, feature_stats in statistics["Numerical"].items():
        rules_dic["Numerical"][feature] = feature_rules(feature_stats, numerical_stat_keys)

    elif key == "Categorical":

      for feature, feature_stats in statistics["Categorical"].items():
        rules_dic["Categorical"][feature] = feature_rules(feature_stats, categorical_stat_keys)

    else:

      rules_dic["Dataset"] = {k : statistics["Dataset"][k] for k in dataset_stat_keys}

  return rules_dic

#### Update Rules Function

In [None]:
def update_rules(rules, feature_name, dic, threshold = False):
  """Overwrite rule values (or thresholds) of a single feature in place.

  With `threshold` equal to False the expected values under rules["Numerical"] /
  rules["Categorical"] are updated; with `threshold` equal to True the feature-wise
  entries under rules["Threshold"] are updated instead. Numerical features are
  looked up before categorical ones. An unknown feature name only prints a notice.
  Every key of `dic` is copied onto the matching feature entry.

  Returns the same `rules` object that was passed in.
  """
  if threshold == False:
    resolve_section = lambda group: rules[group]
    not_found_text = "Wrong Feature Name provided to update the Rules"
  elif threshold == True:
    resolve_section = lambda group: rules['Threshold'][group]['Feature-wise']
    not_found_text = "Wrong Feature Name provided to update the Threshold Rules"
  else:
    # Neither True nor False: nothing is updated and nothing is reported.
    return rules

  # Sections are resolved lazily so the categorical one is only touched when needed.
  for group in ("Numerical", "Categorical"):
    section = resolve_section(group)
    if feature_name in section:
      section[feature_name].update(dic)
      return rules

  print(not_found_text)
  print("Name provided:", feature_name)
  return rules

#### Anomaly Message Function

In [None]:
def anomaly_message(dic, Type, detailed_msg = False, default_setting = None):
  """Print a human-readable message for one anomaly entry of a drift dictionary.

  Parameters
  ----------
  dic : dict
      Drift entries of one feature (or of the dataset), keyed by anomaly type.
      Most entries look like {"Expected": ..., "Observed": ...}; "Presence" nests
      them one level deeper and "Extra Features"/"Missing Features" map a feature
      group to a list of feature names.
  Type : str
      The anomaly type (key of `dic`) to report.
  detailed_msg : bool, optional
      When True, the expected/observed values are printed as well.
  default_setting : dict, optional
      Known anomaly types. When omitted, the notebook-level `threshold_setting`
      is looked up at call time. Types not listed here are ignored, except for
      the feature-set differences mentioned above.

  Returns
  -------
  None. Output goes to stdout.
  """
  if default_setting is None:
    default_setting = threshold_setting

  # Feature-set differences are part of the dataset drift but have no threshold entry;
  # without this branch they would never be reported.
  feature_set_headlines = {
    "Extra Features": "\t** Columns/Features found in the data that are not part of the Rules",
    "Missing Features": "\t** Columns/Features expected by the Rules are missing from the data",
  }

  if Type in feature_set_headlines and Type not in default_setting:
    groups = dic.get(Type) or {}
    # An entry may be present with empty lists only; that is not an anomaly.
    listed = {group: names for group, names in groups.items() if names}
    if listed:
      print(feature_set_headlines[Type])
      if detailed_msg == True:
        for group, names in listed.items():
          print("\t\t- {} : {}\n".format(group, ", ".join(str(name) for name in names)))
    return

  if Type not in default_setting.keys():
    return

  details = dic.get(Type)
  if not isinstance(details, dict):
    details = {}

  # "Presence" entries (and malformed ones) carry no top-level Expected/Observed pair;
  # in that case only the headline is printed instead of failing on undefined values.
  exp = details.get("Expected")
  obs = details.get("Observed")
  show_details = (detailed_msg == True) and ("Expected" in details) and ("Observed" in details)

  count_labels = {"Data size": "No of Rows/Data Points",
                  "No of Features": "No of Columns/Features"}

  # Type -> (headline, detail template or None when there is no detailed variant)
  simple_messages = {
    "Data Type": ("\t** Data Type of the Column/Feature has changed",
                  "\t\t-{obs} Type when expected {exp} Type\n"),
    "isComplete": ("\t** The feature/columnn has Missing values whereas there should be no Missing value",
                   None),
    "isPositive": ("\t** The feature is supposed to have positive/zero values but there are some Negative values",
                   None),
    "Mininum_categories": ("\t** The Feature has less no of Categories than expected",
                           "\t\t- Expected atleast {exp} Categories but {obs} Categories present\n"),
    "Average": ("\t** The Feature Mean has changed Significantly",
                "\t\t- The Feature Mean has changed from {exp} to {obs}\n"),
    "Standard Deviation": ("\t** The Feature Standard Deviation has changed Significantly",
                           "\t\t- The Feature SD has changed from {exp} to {obs}\n"),
    "Skewness": ("\t** The Skewness of the Feature has changed Significantly",
                 "\t\t- The Skewness has changed from {exp} to {obs}\n"),
    "Range": ("\t** Some values found to be outside the Range of Feature",
              None),
  }

  if Type in count_labels:

    label = count_labels[Type]
    print("\t** Significant Change in  {}".format(label))

    if show_details:
      direction = "increased" if obs > exp else "decreased"
      print("\t\t- {} has {} from {} to {}\n".format(label, direction, exp, obs))

  elif Type == "Presence":

    for k, entry in details.items():

      if k == "Min_fraction":

        print("\t** The fraction of examples containing the feature is too small")

        if detailed_msg == True:
          print("\t\t- Expected atleast {} fraction of Examples but observed {}\n".format(entry["Expected"], entry["Observed"]))

  elif Type in simple_messages:

    headline, detail_template = simple_messages[Type]
    print(headline)

    if detail_template is not None and show_details:
      print(detail_template.format(exp = exp, obs = obs))

  else:

    print("Anomaly is there. Detailed msg is not there")

#### Function for Numerical Statistics

In [None]:
def num_stat_fn(df, num_features):
  """Compute summary statistics for every numerical feature.

  Parameters
  ----------
  df : Spark DataFrame
  num_features : iterable of str
      Column names to summarise.

  Returns
  -------
  dict
      feature name -> {"Feature Type", "Data Type", "Count", "Average",
      "Standard Deviation", "Minimum", "Maximum", "Skewness", "Kurtosis",
      "Missing Value Percentage"}. "Count" is the number of non-null values;
      the missing percentage counts both null and NaN values.
  """
  stat_dic = {}
  dtype_dic = dict(df.dtypes)

  # The row count is the same for every feature: compute it once instead of
  # re-running df.count() inside the loop.
  total_rows = df.count()

  for feature in num_features:

    feature_row = df.select(
      F.mean(F.col(feature)).alias('mean'),
      F.stddev(F.col(feature)).alias('std'),
      F.min(F.col(feature)).alias('min'),
      F.max(F.col(feature)).alias('max'),
      F.count(F.col(feature)).alias('count'),
      F.skewness(F.col(feature)).alias('skewness'),
      F.kurtosis(F.col(feature)).alias('kurtosis'),
      F.count(F.when(F.isnan(feature) | F.col(feature).isNull(), feature)).alias("missing_val")
    ).collect()[0]

    # An empty frame has nothing missing; this also avoids a division by zero.
    missing_percentage = (feature_row["missing_val"] * 100 / total_rows) if total_rows else 0.0

    stat_dic[feature] = {"Feature Type": "Numerical",
                         "Data Type": dtype_dic[feature],
                         "Count": feature_row["count"],
                         "Average": feature_row["mean"],
                         "Standard Deviation": feature_row["std"],
                         "Minimum": feature_row["min"],
                         "Maximum": feature_row["max"],
                         "Skewness": feature_row["skewness"],
                         "Kurtosis": feature_row["kurtosis"],
                         "Missing Value Percentage": missing_percentage}

  return stat_dic

#### Function for Categorical Statistics

In [None]:
def cat_stat_fn(df, cat_features):
  """Compute summary statistics for every categorical feature.

  Parameters
  ----------
  df : Spark DataFrame
  cat_features : iterable of str
      Column names to summarise.

  Returns
  -------
  dict
      feature name -> {"Feature Type", "Data Type", "Count",
      "No of Distinct Categories", "Missing Value Percentage"}. "Count" is the
      number of non-null values.
  """
  stat_dic = {}
  dtype_dic = dict(df.dtypes)

  # The row count is the same for every feature: compute it once instead of
  # re-running df.count() inside the loop.
  total_rows = df.count()

  for feature in cat_features:

    # NOTE(review): F.isnan is applied to categorical columns as well; whether that is
    # valid for every column type reaching this function is not visible here — confirm.
    feature_row = df.select(
      F.count(F.col(feature)).alias('count'),
      F.count(F.when(F.isnan(feature) | F.col(feature).isNull(), feature)).alias("missing_val"),
      F.countDistinct(F.col(feature)).alias("categories")
    ).collect()[0]

    # An empty frame has nothing missing; this also avoids a division by zero.
    missing_percentage = (feature_row["missing_val"] * 100 / total_rows) if total_rows else 0.0

    stat_dic[feature] = {"Feature Type": "Categorical",
                         "Data Type": dtype_dic[feature],
                         "Count": feature_row["count"],
                         "No of Distinct Categories": feature_row["categories"],
                         "Missing Value Percentage": missing_percentage}

  return stat_dic

#### Separate Numerical & Categorical Features

In [None]:
def num_cat_features(df, category = 10):
  """Split the columns of `df` into categorical and numerical features.

  A column is categorical when its type starts with 'string' or when it holds at
  most `category` distinct values; every other column is numerical.

  Returns a tuple (categorical column names, numerical column names); the order
  inside each list is not defined.
  """
  every_column = set(df.columns)

  string_columns = {name for name, dtype in df.dtypes if dtype.startswith('string')}

  # Non-string columns with few distinct values are treated as categories too.
  low_cardinality_columns = {
    name for name in every_column - string_columns
    if df.select(F.countDistinct(F.col(name))).collect()[0][0] <= category
  }

  categorical = string_columns | low_cardinality_columns

  return (list(categorical), list(every_column - categorical))

##### Data Stat

In [None]:
def dataset_statistics(df, num_count, cat_count):
  """Collect dataset-level statistics.

  Parameters
  ----------
  df : DataFrame
      Only `df.count()` and `df.columns` are used.
  num_count : int
      Number of numerical features.
  cat_count : int
      Number of categorical features.

  Returns
  -------
  dict with "Data size", "No of Features", "No of Numerical Features" and
  "No of Categorical Features".
  """
  # The feature split is supplied by the caller; the previous implementation
  # recomputed it here (one distinct-count query per column) and discarded the result.
  return {"Data size": df.count(),
          "No of Features": len(df.columns),
          "No of Numerical Features": num_count,
          "No of Categorical Features": cat_count}

### Generate Statistics Function

In [None]:
def generate_statistics(df):
  """Compute dataset-level and per-feature statistics for `df`.

  Returns
  -------
  dict
      {"Dataset": dataset statistics,
       "Numerical": {feature: statistics},
       "Categorical": {feature: statistics}}
  """
  # Columns with at most 4 distinct values are treated as categorical.
  categoricalColumns, numericalColumns = num_cat_features(df, category = 4)

  feature_stat = {"Numerical": num_stat_fn(df, numericalColumns),
                  "Categorical": cat_stat_fn(df, categoricalColumns)}

  # Keyword arguments keep the two counts from being swapped
  # (they used to be passed positionally in the wrong order).
  data_stat = dataset_statistics(df,
                                 num_count = len(numericalColumns),
                                 cat_count = len(categoricalColumns))

  stat = {"Dataset": data_stat, **feature_stat}

  return stat

#### Generate Statistics & Rules

In [None]:
rules_path = dir_path + "/rules.json"
# NOTE(review): this path is only handed to data_validation below; nothing in this notebook writes the base statistics to it.
base_stat_path = dir_path + "/base_statistics.json"

base_stat = generate_statistics(sdf)

rules_dic = create_rules(base_stat)

# The context manager closes (and flushes) the file even if serialisation fails.
with open(rules_path, "w") as out_file:
  json.dump(rules_dic, out_file)

##### Print Rules (Json File)

In [None]:
# Reload the rules from disk so that what is displayed is exactly what was persisted.
# The context manager closes the file handle instead of leaking it.
with open(rules_path) as rules_file:
  rules = json.load(rules_file)
rules

#### Data Validation

##### Drift Function

In [None]:
def measure_drift(rules, statistics, include_features = []):
  """Compare current statistics against the stored rules and collect every violation.

  Parameters
  ----------
  rules : dict
      Expected values under "Dataset"/"Numerical"/"Categorical" plus a "Threshold"
      section with "Dataset" thresholds and, for "Numerical"/"Categorical",
      "Feature-wise" and "Global" thresholds.
  statistics : dict
      Statistics of the data being validated, with "Dataset", "Numerical" and
      "Categorical" sections.
  include_features : list, optional
      When non-empty, only these features are compared; otherwise every feature
      present in the rules is compared.

  Returns
  -------
  dict
      One entry per rules section. "Dataset" maps statistic -> {"Expected", "Observed"}
      and may carry "Extra Features"/"Missing Features"; "Numerical"/"Categorical" map
      feature -> statistic -> {"Expected", "Observed"} (nested one level deeper for
      "within" thresholds). Features without violations are omitted.
  """

  #------------------------------------------ Helper functions -------------------------------------

  def is_missing(value):
    """True for None/NaN, or for a list/tuple (e.g. a range) containing one."""
    if isinstance(value, (list, tuple)):
      return any(is_missing(item) for item in value)
    return value is None or (isinstance(value, float) and value != value)

  def generate_stat(stats, key):
    """Derive a value that is not stored directly in the statistics."""

    if key == "Range":
      return [stats["Minimum"], stats['Maximum']]

    elif key == "isComplete":
      # Compare the percentage itself: int() would truncate e.g. 0.5 % down to 0.
      return bool(stats["Missing Value Percentage"] == 0)

    elif key == "Presence":
      return {"Min_fraction" : (1 - stats["Missing Value Percentage"] / 100)}

    elif key == "isPositive":
      minimum = stats["Minimum"]
      # No minimum means no values at all, hence no negative values either.
      # Comparing the raw value avoids int() truncating -0.5 to 0.
      return True if is_missing(minimum) else bool(minimum >= 0)

    elif key == "Mininum_categories":
      return stats["No of Distinct Categories"]

    elif key == "isContainedIn":
      return stats["distinct_categories"]

  def intersection(lst1, lst2):
    return [value for value in lst1 if value in lst2]

  def ratio_exceeds(expected, observed, limit):
    """True when the magnitudes differ by more than a factor of `limit`.

    Magnitudes are used so that negative values are handled (max/min of two
    negative numbers is below 1 and would never exceed the limit). A sign flip,
    or a change from/to zero, has no finite positive ratio and counts as exceeded.
    """
    if (expected < 0) != (observed < 0):
      return True
    larger = max(abs(expected), abs(observed))
    smaller = min(abs(expected), abs(observed))
    if smaller == 0:
      return larger != 0
    return (larger / smaller) > limit

  #------------------------------------------ Extract Stat Function  -------------------------------------

  def extract_stat_fn(statistics, numerical_stat_keys, categorical_stat_keys, dataset_stat_keys):
    """Reduce the raw statistics to the keys the rules know, deriving absent ones."""

    def feature_values(feature_stats, stat_keys):
      return {k : (feature_stats[k] if k in feature_stats else generate_stat(feature_stats, k))
              for k in stat_keys}

    dic = {"Dataset": {}, "Numerical": {}, "Categorical": {}}

    for key in statistics.keys():

      if key == "Numerical":

        for feature, feature_stats in statistics["Numerical"].items():
          dic["Numerical"][feature] = feature_values(feature_stats, numerical_stat_keys)

      elif key == "Categorical":

        for feature, feature_stats in statistics["Categorical"].items():
          dic["Categorical"][feature] = feature_values(feature_stats, categorical_stat_keys)

      else:

        dic["Dataset"] = {k : statistics["Dataset"][k] for k in dataset_stat_keys}

    return dic

  #------------------------------------------ Sub Compare Function  -------------------------------------

  numeric_criteria = ("less_than", "difference", "reverse_difference", "ratio", "inRange")

  def sub_compare(dic1, dic2, threshold):
    """Compare expected (dic1) with observed (dic2) values key by key."""

    change_dic = {}

    for key in dic1.keys():

      # A rule without a threshold, or without an observed counterpart, cannot be
      # evaluated; skip it instead of losing the other findings of this feature.
      if key not in threshold or key not in dic2:
        continue

      criterion = threshold[key]['criterion']
      expected, observed = dic1[key], dic2[key]

      if criterion == "within":

        # The nested thresholds are evaluated in one recursive pass.
        nested = sub_compare(expected, observed, threshold[key])

        if nested != {}:
          change_dic[key] = nested

        continue

      drifted = False

      if criterion == "equality":

        drifted = expected != observed

      elif criterion in numeric_criteria:

        if is_missing(expected) or is_missing(observed):

          # An undefined statistic cannot be compared numerically: it is drift
          # exactly when it is defined on one side only.
          drifted = is_missing(expected) != is_missing(observed)

        elif criterion == "less_than":

          drifted = observed < expected

        elif criterion == "difference":

          drifted = (observed - expected) > threshold[key]['value']

        elif criterion == "reverse_difference":

          drifted = (expected - observed) > threshold[key]['value']

        elif criterion == "ratio":

          drifted = ratio_exceeds(expected, observed, threshold[key]['value'])

        elif criterion == "inRange":

          drifted = not ((observed[0] >= expected[0]) and (observed[1] <= expected[1]))

      if drifted:

        change_dic[key] = {"Expected": expected,
                           "Observed": observed}

    return change_dic

  #------------------------------------------------- Compare function ---------------------------------------

  def compare_stat(rules, current_dic):

    drift_dic = {}
    base_dic = {key: rules[key] for key in rules.keys() if key != "Threshold"}
    threshold = rules['Threshold']

    for k1 in base_dic.keys():

      if k1 == "Dataset":

        drift_dic[k1] = sub_compare(base_dic[k1], current_dic[k1], threshold[k1])
        continue

      drift_dic[k1] = {}

      if not include_features:
        new_list = list(base_dic[k1].keys())
      else:
        new_list = intersection(base_dic[k1].keys(), include_features)

      for k2 in new_list:

        # Features absent from the current data are reported under "Missing Features".
        if k2 not in current_dic[k1]:
          continue

        feature_threshold = threshold[k1]["Feature-wise"].get(k2, threshold[k1]["Global"])

        try:

          temp_dic = sub_compare(base_dic[k1][k2], current_dic[k1][k2], feature_threshold)

        except (KeyError, TypeError, ValueError) as error:

          # Keep validating the remaining features, but make the problem visible.
          print("Warning: could not compare {} feature '{}': {!r}".format(k1, k2, error))
          continue

        if temp_dic != {}:
          drift_dic[k1][k2] = temp_dic

    return drift_dic

  #------------------------------------------ Extra / Missing features -------------------------------------

  temp = {"Extra Features": {"Numerical" : list(set(statistics["Numerical"].keys()) - set(rules["Numerical"].keys())),
                             "Categorical": list(set(statistics["Categorical"].keys()) - set(rules["Categorical"].keys()))},

          "Missing Features": {"Numerical" : list(set(rules["Numerical"].keys()) - set(statistics["Numerical"].keys())),
                               "Categorical": list(set(rules["Categorical"].keys()) - set(statistics["Categorical"].keys()))}}

  if temp["Extra Features"]["Numerical"] == [] and temp["Extra Features"]["Categorical"] == []:

    if temp["Missing Features"]["Numerical"] == [] and temp["Missing Features"]["Categorical"] == []:

      temp = {}

    else:

      temp = {"Missing Features" : temp["Missing Features"]}

  num_stat_keys = rules['Threshold']['Numerical']['Global'].keys()
  cat_stat_keys = rules['Threshold']['Categorical']['Global'].keys()
  data_stat_keys = rules['Threshold']['Dataset'].keys()

  extracted_stat = extract_stat_fn(statistics, num_stat_keys, cat_stat_keys, data_stat_keys)

  drift_dic = compare_stat(rules, extracted_stat)

  drift_dic['Dataset'] = {**drift_dic['Dataset'], **temp}

  return drift_dic

##### Data Validation Class

In [None]:
class data_validation:
  """Validates a dataset against persisted rules and reports the detected drift.

  The heavy lifting (statistics, rule creation, drift measurement, messages) is
  done by functions passed into the methods; this class wires them to the
  configured paths.
  """

  # Top-level keys of a full (non feature-filtered) drift dictionary.
  _DRIFT_GROUPS = ("Dataset", "Numerical", "Categorical")

  def __init__(self, current_data_path, rules_path, feature_statistics_path):
    """Store the paths of the data to validate, the rules file and the statistics file."""

    self.current_data_path = current_data_path
    self.rules_path = rules_path
    self.feature_statistics_path = feature_statistics_path

  def _is_grouped(self, drift_dic):
    """True when `drift_dic` is a full drift dictionary (as opposed to a feature -> drift mapping)."""

    return set(drift_dic.keys()) == set(self._DRIFT_GROUPS)

  def _display_entry(self, name, entry):
    """Render one drift entry as a table; fall back to per-key tables or plain text."""

    try:
      display(pd.DataFrame(entry))

    except (ValueError, TypeError):

      print("{}:".format(name))

      for k in entry.keys():

        try:
          display(pd.DataFrame(entry[k]))

        except (ValueError, TypeError):
          print("{}: {}".format(k, entry[k]))

  def read_data(self, data_format):
    """Load the dataset at `current_data_path` with the notebook's `spark` session."""

    sdf = spark.read.format(data_format).load(self.current_data_path)
    return sdf

  def generate_statistics_fn(self, data, generate_statistics):
    """Apply the given statistics function to `data` and return its result."""

    return generate_statistics(data)

  def detect_drift(self, base_stat, drift_fn, include_features = [], create_rules_fn= None, Type = None):
    """Measure drift of `base_stat` against the rules stored at `rules_path`.

    Parameters
    ----------
    base_stat : dict
        Statistics of the data being validated.
    drift_fn : callable
        Called as drift_fn(rules, base_stat, include_features).
    include_features : list, optional
        When non-empty, the numerical and categorical results are merged into a
        single feature -> drift mapping and returned (`Type` is then ignored).
    create_rules_fn : callable, optional
        Fallback used to build rules from `base_stat` when the rules file cannot
        be read. Without it, the read error is raised.
    Type : str, optional
        Return only this section of the drift dictionary.
    """

    try:

      with open(self.rules_path) as rules_file:
        rules = json.load(rules_file)

    except (OSError, ValueError):

      # Unreadable or invalid rules file (JSON errors are ValueErrors).
      if create_rules_fn is None:
        raise

      rules = create_rules_fn(base_stat)

    drift = drift_fn(rules, base_stat, include_features)

    if include_features:

      return {**drift["Numerical"], **drift["Categorical"]}

    if Type != None:

      return drift[Type]

    return drift

  def display_drift(self, drift_dic, Type = None, limit = None):
    """Display drift as tables.

    A feature -> drift mapping is shown completely. For a full drift dictionary,
    `Type` selects the section to show and `limit` the number of randomly chosen
    entries (all of them when None); without `Type` the dictionary is returned
    unchanged.
    """

    if not self._is_grouped(drift_dic):

      for k in drift_dic.keys():

        print("\033[1m Feature Name:  {} -".format(k) + '\033[0m')
        display(pd.DataFrame(drift_dic[k]))

      return

    if Type == None:

      return drift_dic

    # Use the dictionary that was passed in, not a notebook-level variable.
    selected = drift_dic[Type]
    names = list(selected.keys())

    if limit == None or limit > len(names):

      # sample() cannot draw more entries than exist.
      limit = len(names)

    for key in sample(names, limit):

      print("\033[1m Feature Name:  {} -".format(key) + '\033[0m')
      self._display_entry(key, selected[key])
      print("_"*100)

    return

  def display_textual_msg(self, anomaly_msg_fn, drift_dic, detailed_msg = True):
    """Print one message per anomaly via anomaly_msg_fn(entries, anomaly_type, detailed_msg=...)."""

    if not self._is_grouped(drift_dic):

      for k in drift_dic.keys():

        print("\033[1m Feature/Column Name : {} -\033[0m\n".format(k))

        for l in drift_dic[k].keys():

          anomaly_msg_fn(drift_dic[k], l, detailed_msg = detailed_msg )

      print("------------------------------------------------------"*2)
      return

    for key in drift_dic.keys():

      if key == "Dataset":

        print("\033[1m For Dataset (Not Feature wise) - \033[0m\n")

        for k in drift_dic[key].keys():

          anomaly_msg_fn(drift_dic[key], k, detailed_msg = detailed_msg )

        print("\n\n----------------------No more Dataset Anomaly------------------- \n")

      else:

        print("\033[1m For {} Features -\033[0m\n".format(key))

        for k in drift_dic[key].keys():

          print("\033[1m Feature/Column Name : {} -\033[0m\n".format(k))

          for l in drift_dic[key][k].keys():

            # Honour the caller's choice (this used to be hard-coded to True).
            anomaly_msg_fn(drift_dic[key][k], l, detailed_msg = detailed_msg)

        print("\n\n----------------------No more Anomaly for {} Feature------------------- \n".format(key))
    return

  def update_rules(self, update_rules_fn, update_standard_values_dic, update_threshold_dic):
    """Apply per-feature overrides to the stored rules and write them back.

    `update_rules_fn` is called as update_rules_fn(rules, feature, values) for the
    standard values and with threshold=True for the thresholds.
    """

    with open(self.rules_path) as rules_file:
      rules = json.load(rules_file)

    for key in update_standard_values_dic.keys():

      rules = update_rules_fn(rules, key, update_standard_values_dic[key])

    for key in update_threshold_dic.keys():

      rules = update_rules_fn(rules, key, update_threshold_dic[key], threshold = True)

    with open(self.rules_path, "w") as out_file:
      json.dump(rules, out_file)

  def drift_flag(self, drift):
    """Return a one-line Success/Failure summary for a drift result."""

    if self._is_grouped(drift):

      if {**drift["Dataset"], **drift["Numerical"], **drift["Categorical"]} == {}:

        return("Success! No Drift.")

      else:

        return("Failure! There is Drift")

    else:

      if drift  == {}:

        return("Success! No Drift.")

      else:

        return("Failure! There is Drift.")

#### Run Data Validation

In [None]:
## Data Validation
# Point the validator at the dataset to check, the persisted rules file and the base-statistics path.
validation = data_validation(data_to_validate_path, rules_path, base_stat_path)
val_df = validation.read_data(data_format = data_format)

# Update Rules
# Applies the optional per-feature overrides from the config cell and rewrites the rules file.
validation.update_rules(update_rules, change_default_standard_values_for_features, change_default_threshold_for_features)

## Statistics for Current Dataset
current_stat = validation.generate_statistics_fn(val_df, generate_statistics)

## Calculate Drift
# With an empty feature list the full drift dictionary is returned; otherwise a feature -> drift mapping.
drift = validation.detect_drift(current_stat, measure_drift, include_features = drift_for_particular_features_only)

#### Display Results

##### Check there is any Drift or not

In [None]:
# One-line Success/Failure summary of the computed drift.
validation.drift_flag(drift)

##### Check Drift

In [None]:
## Drift Json 
# Raw drift result: expected vs. observed values for every violated rule.
drift

##### Drift(Textual Message)

In [None]:
# Print one human-readable message per detected anomaly.
validation.display_textual_msg(anomaly_message, drift, detailed_msg = True)

#### Other ways to Display Drift (Visualize)

In [None]:
# Tabular view of the numerical features with drift; limit = None shows all of them (in random order).
validation.display_drift(drift, Type = "Numerical", limit = None)

Range,Skewness
"List(-84.0, 0.0)",0.1357880372709621
"List(-94.0, 0.0)",0.5260179062076012


Range
"List(0, 16)"
"List(0, 33)"


Average,Standard Deviation,Range,Skewness
954844.59,1996573.1652504208,"List(22, 13122628)",4.066695812881047
4916504.73125,49184215.03960988,"List(38, 620017927)",12.348214619277217


Skewness
4.422620521496445
2.327260327064081


Range
"List(148, 17096048)"
"List(158, 20741467)"


Range,Skewness
"List(1, 12)",2.68561443691425
"List(1, 19)",4.458950265331163


In [None]:
# Tabular view of the categorical features with drift; limit = None shows all of them (in random order).
validation.display_drift(drift, Type = "Categorical", limit = None)

Mininum_categories
32
28


Mininum_categories
2
1


Mininum_categories
3
2


Mininum_categories,isComplete
38,True
34,False


Mininum_categories
3
2


In [None]:
## Run the following if particular features are provided to detect drift

#validation.display_drift(drift)