#### 1. Installing the required Python packages

In [0]:
!python -m pip install --upgrade pip

Collecting pip
  Downloading pip-22.3.1-py3-none-any.whl (2.1 MB)
[?25l[K     |▏                               | 10 kB 3.3 MB/s eta 0:00:01[K     |▎                               | 20 kB 1.2 MB/s eta 0:00:02[K     |▌                               | 30 kB 1.8 MB/s eta 0:00:02[K     |▋                               | 40 kB 711 kB/s eta 0:00:03[K     |▉                               | 51 kB 770 kB/s eta 0:00:03[K     |█                               | 61 kB 923 kB/s eta 0:00:03[K     |█▏                              | 71 kB 974 kB/s eta 0:00:03[K     |█▎                              | 81 kB 1.0 MB/s eta 0:00:02[K     |█▍                              | 92 kB 1.2 MB/s eta 0:00:02[K     |█▋                              | 102 kB 901 kB/s eta 0:00:03[K     |█▊                              | 112 kB 901 kB/s eta 0:00:03[K     |██                              | 122 kB 901 kB/s eta 0:00:03[K     |██                              | 133 kB 901 kB/s eta 0:00:03[K     |██

In [0]:
!pip install -r requirements38.txt

Collecting absl-py==0.7.1
  Downloading absl-py-0.7.1.tar.gz (99 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/99.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/99.9 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m61.4/99.9 kB[0m [31m838.9 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.9/99.9 kB[0m [31m981.6 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting astor==0.8.0
  Downloading astor-0.8.0-py2.py3-none-any.whl (27 kB)
Collecting boto==2.49.0
  Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [0]:
!pip install azure-storage-file-datalake
!pip install adlfs
!pip install fsspec



#### 2. Importing the required Python packages

In [0]:
import multiprocessing as mp
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ast import literal_eval



In [0]:
# Log the wall-clock time at which this notebook run started.
print(f'Started at {datetime.now()}')

Started at 2023-01-17 05:01:16.221169


#### 3. Connect to Azure Data Storage

In [0]:
import os, uuid, sys
from azure.storage.filedatalake import DataLakeServiceClient
from azure.core._match_conditions import MatchConditions
from azure.storage.filedatalake._models import ContentSettings
from azure.storage.blob import BlobServiceClient

In [0]:
def initialize_storage_account(storage_account_name, storage_account_key):
    """Create the module-level ``service_client`` for a Data Lake account.

    Builds a ``DataLakeServiceClient`` against
    ``https://<storage_account_name>.dfs.core.windows.net`` with the given
    account key and binds it to the global name ``service_client``.

    Any exception raised while building the client is printed rather than
    propagated; in that case this call does not assign ``service_client``.
    """
    global service_client

    try:
        endpoint = "{}://{}.dfs.core.windows.net".format("https", storage_account_name)
        service_client = DataLakeServiceClient(
            account_url=endpoint,
            credential=storage_account_key,
        )
    except Exception as err:
        print(err)

In [0]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. AZURE_STORAGE_ACCOUNT is optional (defaults to the project account);
# AZURE_STORAGE_ACCOUNT_KEY is required and raises KeyError when missing.
# Any account key that was previously committed in this cell should be treated
# as compromised and rotated.
storage_account = os.environ.get("AZURE_STORAGE_ACCOUNT", "legoaistorage")
storage_account_key = os.environ["AZURE_STORAGE_ACCOUNT_KEY"]
initialize_storage_account(storage_account, storage_account_key)

In [0]:
### Spark Configuration
# Register the account key for abfss:// access and switch on Arrow for PySpark.
account_key_conf = "fs.azure.account.key.{}.dfs.core.windows.net".format(storage_account)
spark.conf.set(account_key_conf, storage_account_key)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [0]:
# Root of the `datascience-dataset` container; the account name comes from
# `storage_account` so it stays consistent with the Spark key configuration.
prefix_path = f"abfss://datascience-dataset@{storage_account}.dfs.core.windows.net/"

In [0]:
# Load the feature table. The CSV is read without a header, so the first row
# of the pandas frame is promoted to column names and then dropped.
# NOTE(review): assumes the first row returned by Spark is the header row -- confirm.
filepath = 'Model_Data/model_feats_data.csv'
filepath_upd = prefix_path + filepath
feats_data_spark = (
    spark.read.format("csv")
    .option("header", "false")
    .load(filepath_upd)
)
feats_data = feats_data_spark.toPandas()
header_row = feats_data.iloc[0]
feats_data.columns = header_row
feats_data = feats_data.drop(feats_data.index[0])

In [0]:
# Load the labelled data (this CSV carries its own header row).
filepath = 'Model_Data/labelled_data.csv'
filepath_upd = prefix_path + filepath
labelled_data_spark = (
    spark.read.format("csv")
    .option("header", "true")
    .load(filepath_upd)
)
labelled_data = labelled_data_spark.toPandas()

In [0]:
# Sanity check: (rows, columns) of the labelled data.
labelled_data.shape

Out[13]: (49790, 4)

In [0]:
# Attach the `datatype` label to each feature row via an inner join on master_id.
model_data = feats_data.merge(
    labelled_data[['master_id', 'datatype']],
    on=['master_id'],
    how='inner',
)

In [0]:
# Sanity check: (rows, columns) after joining features with labels.
model_data.shape

Out[15]: (49790, 2059)

#### 4. Data Subset & Analysis

In [0]:
# Split the feature columns into families based on their naming pattern.
all_columns = list(model_data.columns)
char_features = [name for name in all_columns if name.startswith('n_[')]
par_features = [name for name in all_columns if name.startswith('par_vec_')]
word_features = [name for name in all_columns if '_word_embedding_avg' in name]

In [0]:
# Remaining (non char / paragraph / word-embedding) feature columns.
rest_features = [
    "col_entropy", "frac_unique_sample", "uniq_values_sample",
    "numeric_cell_nz_count", "text_cell_nz_count", "alphanum_cell_nz_count",
    "frac_numcells", "frac_textcells", "frac_alphanumcells",
    "avg_num_cells", "std_num_cells",
    "avg_text_cells", "std_text_cells",
    "avg_alphanum_cells", "std_alphanum_cells",
    "avg_spec_cells", "std_spec_cells",
    "avg_word_cells", "std_word_cells",
    "length-agg-any", "length-agg-all", "length-agg-mean", "length-agg-var",
    "length-agg-min", "length-agg-max", "length-agg-median", "length-agg-sum",
    "length-agg-kurtosis", "length-agg-skewness",
    "none-agg-has_sample", "none-agg-percent_sample",
    "none-agg-num_sample", "none-agg-all_sample",
    "dateRatio", "intRatio", "floatRatio", "rangeRatio", "zero_flag",
    "mean_before_float", "mean_after_float",
    "mean_uppercase", "mean_lowercase",
    "wordlen_mean", "wordlen_variance",
    "table_sample",
    "none-agg-has_population", "none-agg-percent_population",
    "none-agg-num_population", "none-agg-all_population",
    "uniq_samp_pop_ratio", "samp_pop_ratio",
]

In [0]:
def group_labels(datatype):
    """Collapse a fine-grained datatype label into its coarse group.

    The short/long variants of the integer, float, alphanumeric and open-ended
    text labels map to one group name each, ``Close_ended_short_text`` maps to
    ``Close_ended_text``, and every other value is returned unchanged.
    """
    label_groups = (
        (('Short_Integer', 'Long_Integer'), 'Integer'),
        (('Short_Float', 'Long_Float'), 'Float'),
        (('Short_Alphanumeric', 'Long_Alphanumeric'), 'Alphanumeric'),
        (('Open_ended_long_text', 'Open_ended_short_text'), 'Open_ended_text'),
        (('Close_ended_short_text',), 'Close_ended_text'),
    )
    for members, grouped_name in label_groups:
        if datatype in members:
            return grouped_name
    return datatype

In [0]:
# Derive the coarse label column from the fine-grained `datatype` labels.
model_data['grouped_datatype'] = model_data['datatype'].map(group_labels)

In [0]:
# label = 'datatype'
label = 'grouped_datatype'
# Keep the feature columns, the chosen label and the two identifier columns.
# Take an explicit copy: later cells assign into this frame, and assigning
# into a bare selection of `model_data` raises SettingWithCopyWarning.
subset_cols = char_features + par_features + word_features + rest_features + [label] + ['file_name', 'column_id']
model_data_subset = model_data[subset_cols].copy()
print(model_data_subset.shape)

In [0]:
# Number of rows per label value.
model_data_subset[label].value_counts()

In [0]:
# Distinct label values and how many there are.
model_data_subset[label].unique(),model_data_subset[label].nunique()

In [0]:
### Label Frequency Distribution
# Labels are categorical, so draw one horizontal bar per label instead of
# binning them with a numeric histogram. Counts run along x, labels along y.
label_counts = model_data_subset[label].value_counts()
plt.barh(label_counts.index, label_counts.values)
plt.title('DataType Distribution')
plt.xlabel('Count')
plt.ylabel('Label')
plt.show()

In [0]:
### Label Frequency Distribution
# Same distribution as a pie chart; the counts are computed once and reused.
label_share = model_data_subset[label].value_counts()
plt.pie(label_share, labels=label_share.index, labeldistance=1.2)
plt.show()

#### 5. Train-Test Data Split and Subset

In [0]:
# Full model feature list, in the order char -> paragraph -> word -> rest.
feats_cols = [*char_features, *par_features, *word_features, *rest_features]
print(len(feats_cols))

In [0]:
### Features conversion to Float type
features_as_float = model_data_subset[feats_cols].astype(np.float32)
model_data_subset[feats_cols] = features_as_float

In [0]:
# Re-check the distinct label values after the feature dtype conversion.
model_data_subset[label].unique(),model_data_subset[label].nunique()

#### Train Test Data Split

In [0]:
### Train/Test Data
# Rows coming from these source files form the train/validation pool; rows
# from every other file are held out as the test set.
train_file_name = ['sherlock_data_feats','web_data_common_feats','kaggle_sql_data_feats']
is_train_source = model_data_subset['file_name'].isin(train_file_name)
# Explicit copies: later cells add columns to both frames, which would raise
# SettingWithCopyWarning on bare boolean-mask selections.
train_val_data = model_data_subset[is_train_source].copy()
test_data = model_data_subset[~is_train_source].copy()

In [0]:
# Sizes of the train/validation pool and of the held-out test set.
train_val_data.shape,test_data.shape

In [0]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Fit the encoder on the train/validation labels, then store their integer codes.
le = LabelEncoder()
le.fit(train_val_data[label])
train_val_data['le_encoded'] = le.transform(train_val_data[label])

In [0]:
### New train/Valid Split
# train_data,valid_data = train_test_split(train_val_data,train_size=0.8,stratify=train_val_data['le_encoded'])

### Existing train/Valid Split
# Reuse the validation column_ids stored with the 20230111_01_XGB results.
filepath = 'Model_Results/20230111_01_XGB/valid_data.csv'
filepath_upd = prefix_path + filepath
valid_data_spark = spark.read.format("csv").option("header","true").load(filepath_upd)
valid_data_df = valid_data_spark.toPandas()
valid_uniq_id = valid_data_df['column_id'].unique()

# Explicit copies: prediction columns are added to both frames later, which
# would raise SettingWithCopyWarning on bare boolean-mask selections.
in_validation = train_val_data['column_id'].isin(valid_uniq_id)
train_data = train_val_data[~in_validation].copy()
valid_data = train_val_data[in_validation].copy()

In [0]:
# Encode the test labels with the encoder fitted on the train/validation pool.
# NOTE(review): presumably fails if the test set holds a label the encoder never saw -- confirm.
test_data['le_encoded'] = le.transform(test_data[label])

In [0]:
# Sizes of the final train / validation / test frames.
train_data.shape, valid_data.shape, test_data.shape

#### Rule-based prediction

In [0]:
def ruleBasedPrediction(data,param_dict):
    """Assign a datatype label to one feature row using threshold rules.

    The rules are checked in a fixed order and the first one whose feature
    value exceeds its threshold decides the label.

    Parameters
    ----------
    data
        Mapping-like row providing ``none-agg-percent_sample``, ``dateRatio``,
        ``rangeRatio``, ``frac_alphanumcells``, ``frac_textcells``,
        ``frac_unique_sample``, ``floatRatio``, ``zero_flag`` and ``intRatio``.
    param_dict
        Thresholds keyed by ``none-agg_percent-sample``, ``dateRatio``,
        ``rangeRatio``, ``frac_alphanumcells``, ``frac_textcells``,
        ``frac_unique_sample``, ``floatRatio`` and ``intRatio``.

    Returns
    -------
    The predicted label string; ``'Others'`` when no rule fires.
    """
    def exceeds(feature, threshold_key=None):
        # The threshold key equals the feature name unless given explicitly.
        return data[feature] > param_dict[threshold_key or feature]

    if exceeds('none-agg-percent_sample', 'none-agg_percent-sample'):
        return 'Others'
    if exceeds('dateRatio'):
        return 'Date & Time'
    if exceeds('rangeRatio'):
        return 'Range_Type'
    if exceeds('frac_alphanumcells'):
        return 'Alphanumeric'
    if exceeds('frac_textcells'):
        below_unique_threshold = data['frac_unique_sample'] < param_dict['frac_unique_sample']
        return 'Close_ended_text' if below_unique_threshold else 'Open_ended_text'
    if exceeds('floatRatio'):
        return 'Integer' if data['zero_flag'] == 1 else 'Float'
    if exceeds('intRatio'):
        return 'Integer'
    return 'Others'

In [0]:
### Rule Simulation for better accuracy in precision
import itertools

# Candidate thresholds 0.0, 0.1, ..., 0.9, repeated for 8 rule parameters.
range_vals = list(np.arange(0, 1, 0.1))
comb_vals = [range_vals] * 8
# NOTE(review): this materialises all 10**8 threshold tuples in memory at once;
# consider iterating itertools.product lazily if memory becomes a problem.
combinations = list(itertools.product(*comb_vals))

In [0]:
# Grid-search the rule thresholds: score each combination on the validation
# split with weighted F1 and keep the best-scoring threshold set.
# NOTE(review): every combination needs a row-wise apply over valid_data, so
# the full grid is very expensive; cap it with MAX_RULE_COMBINATIONS.
from sklearn.metrics import f1_score

MAX_RULE_COMBINATIONS = None  # set to an int to score only the first N combinations

best_rule_score, best_param_dict = -1.0, None
for comb in itertools.islice(combinations, MAX_RULE_COMBINATIONS):
    param_dict = {
        'none-agg_percent-sample': comb[0],
        'dateRatio': comb[1],
        'rangeRatio': comb[2],
        'frac_alphanumcells': comb[3],
        'frac_textcells': comb[4],
        'frac_unique_sample': comb[5],
        'floatRatio': comb[6],
        'intRatio': comb[7],
    }

    rule_pred = valid_data.apply(lambda row: ruleBasedPrediction(row, param_dict), axis=1)
    rule_score = f1_score(valid_data[label], rule_pred, average='weighted')
    if rule_score > best_rule_score:
        best_rule_score, best_param_dict = rule_score, param_dict

# Leave `param_dict` bound to the best thresholds found, not the last ones tried.
param_dict = best_param_dict
print(best_rule_score, best_param_dict)

#### 6. Model Building

In [0]:
from sklearn.metrics import f1_score, classification_report

In [0]:
start = datetime.now()
print(f'Started at {start}')

# Training features with positional column names (feats_col_0, feats_col_1, ...)
# and the integer-encoded labels as a flat array.
X_train = train_data[feats_cols]
X_train.columns = [f'feats_col_{idx}' for idx in range(len(X_train.columns))]
y_train = train_data['le_encoded'].to_numpy().flatten()

print(f'Load data (train) process took {datetime.now() - start} seconds.')

In [0]:
# Verify that the float32 conversion applied to every feature column.
print('Distinct types for columns in the Dataframe (should be all float32):')
print(set(X_train.dtypes))

In [0]:
start = datetime.now()
print(f'Started at {start}')

# Validation features with the same positional column names as the training set.
X_validation = valid_data[feats_cols]
X_validation.columns = [f'feats_col_{idx}' for idx in range(len(X_validation.columns))]
y_validation = valid_data['le_encoded'].to_numpy().flatten()

print(f'Load data (validation) process took {datetime.now() - start} seconds.')

In [0]:
start = datetime.now()
print(f'Started at {start}')

# Test features with the same positional column names as the training set.
X_test = test_data[feats_cols]
X_test.columns = [f'feats_col_{idx}' for idx in range(len(X_test.columns))]
y_test = test_data['le_encoded'].to_numpy().flatten()

print(f'Load data (test) process took {datetime.now() - start} seconds.')

In [0]:
# Shapes of the train / test / validation feature matrices (in that order).
print(X_train.shape,X_test.shape,X_validation.shape)

In [0]:
# Distinct encoded classes present in each split.
set(y_train),set(y_test),set(y_validation)

#### 6.1. XGBoost Model

In [0]:
!pip install xgboost

In [0]:
from xgboost import XGBClassifier
# NOTE(review): `re` is only referenced by a commented-out line at the end of the notebook.
import re

# XGBoost classifier with library-default hyperparameters.
clf=XGBClassifier()

# Print all the parameters of the XGBoost classifier.
print(clf)

In [0]:
# Fit the classifier on the training split; `XGB` holds the value returned by fit().
XGB=clf.fit(X_train,y_train)

##### Model Performance Metrics

In [0]:
### Test Dataset Prediction
# Encoded class predictions for the test split.
prediction_test=XGB.predict(X_test)

In [0]:
### Validation Dataset Prediction
# Encoded class predictions for the validation split.
prediction_validation=XGB.predict(X_validation)

In [0]:
# Decode the integer class codes back to label strings, one vectorised
# inverse_transform call per split instead of one call per element.
y_test_label = le.inverse_transform(y_test).tolist()
y_validation_label = le.inverse_transform(y_validation).tolist()
y_train_label = le.inverse_transform(y_train).tolist()

In [0]:
# Decode predicted class codes to label strings with one vectorised call per split.
prediction_test_label = le.inverse_transform(prediction_test).tolist()
prediction_validation_label = le.inverse_transform(prediction_validation).tolist()

In [0]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test_label, prediction_test_label))

In [0]:
# Per-class precision / recall / F1 on the validation split.
print(classification_report(y_validation_label, prediction_validation_label))

In [0]:
### Train Dataset Prediction
# Encoded class predictions for the training split.
prediction_train=XGB.predict(X_train)

In [0]:
# Decode predicted class codes to label strings with a single vectorised call.
prediction_train_label = le.inverse_transform(prediction_train).tolist()

In [0]:
# Per-class precision / recall / F1 on the training split.
print(classification_report(y_train_label, prediction_train_label))

In [0]:
# Prediction confidence = highest class probability in each row, taken with a
# vectorised row-wise maximum instead of a Python-level loop.
train_data['predicted_prob'] = XGB.predict_proba(X_train).max(axis=1)
test_data['predicted_prob'] = XGB.predict_proba(X_test).max(axis=1)
valid_data['predicted_prob'] = XGB.predict_proba(X_validation).max(axis=1)

In [0]:
# Store the decoded predictions next to the true labels in each split.
train_data['predicted_datatype'] = prediction_train_label
test_data['predicted_datatype'] = prediction_test_label
valid_data['predicted_datatype'] = prediction_validation_label

In [0]:
# Importances indexed by the original feature names; plot the 20 largest.
feature_importances = pd.Series(data=XGB.feature_importances_, index=feats_cols)
top_importances = feature_importances.nlargest(20)
top_importances.plot.barh()

In [0]:
### Getting the data and other required information from each source
container_name = 'datascience-dataset'
# Blob-service connection string built from the account name and key.
connect_str = f'DefaultEndpointsProtocol=https;AccountName={storage_account};AccountKey={storage_account_key}'

In [0]:
def save_df_to_blob(df,output_file):
    """Upload ``df`` as a UTF-8 CSV block blob named ``output_file``.

    The blob is created through the module-level ``container_client``, which
    must be defined before this function is called.

    Parameters
    ----------
    df
        DataFrame to serialise with ``to_csv(index=False)``.
    output_file
        Blob name passed to ``container_client.get_blob_client``.

    Returns
    -------
    ``1`` when the upload succeeded, ``0`` when any exception was raised. The
    exception is printed so that a failed upload is not silent.
    """
    try:
        csv_payload = df.to_csv(index=False, encoding="utf-8")

        # Instantiate a new BlobClient
        blob_client = container_client.get_blob_client(output_file)

        # upload data
        blob_client.upload_blob(csv_payload, blob_type="BlockBlob")
    except Exception as err:
        # Catch Exception rather than everything so KeyboardInterrupt/SystemExit
        # still propagate; report the reason before signalling failure.
        print(f"Failed to upload {output_file!r}: {err}")
        return 0

    return 1

In [0]:
# One-column frame listing the model features in training order.
feature_names = pd.DataFrame({'feature_list': feats_cols})

##### Writing the results to ADLS

In [0]:
from datetime import datetime
# Today's date as YYYYMMDD, used to name the result folder and the model file.
datetime_str = datetime.now().strftime('%Y%m%d')#+'_01'

In [0]:
# Show the date stamp that will be used in the output paths.
datetime_str

In [0]:
## Blob storage based configurations
# NOTE(review): the "container" passed below also carries the result folder
# path, and `container_name` defined earlier is not used -- confirm this is intended.
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
container_client = blob_service_client.get_container_client('datascience-dataset/Model_Results/'+datetime_str+'_XGB')

## Train Data
# filepath = "train_data.csv"
# save_df_to_blob(train_data[['column_id','file_name','datatype','predicted_datatype','predicted_prob']],filepath)

# ## Test Data
# filepath = "test_data.csv"
# save_df_to_blob(test_data[['column_id','file_name','datatype','predicted_datatype','predicted_prob']],filepath)

# ## Valid Data
# filepath = "valid_data.csv"
# save_df_to_blob(valid_data[['column_id','file_name','datatype','predicted_datatype','predicted_prob']],filepath)

## Feature Names
# save_df_to_blob returns 1 on success and 0 on failure; as the last
# expression of the cell, that value is what the cell displays.
filepath = 'feature_names.csv'
save_df_to_blob(feature_names,filepath)

In [0]:
# feature_names = pd.DataFrame(model_data.columns,columns=['feature_list'])
# filepath = 'feature_names.csv'
# save_df_to_blob(feature_names,filepath)

#### Writing the model object to DBFS

In [0]:
import pickle
# Persist the fitted classifier; the context manager guarantees the file is
# flushed and closed even if pickling fails.
filepath = '/dbfs/'+datetime_str+'_XGB_model.pkl'
with open(filepath, 'wb') as model_file:
    pickle.dump(XGB, model_file)

#### 6.2. Keras based Multi Input Model

In [0]:
!pip install tensorflow

In [0]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam, RMSprop
import numpy as np

In [0]:
# Multi-input dense classifier: three feature groups are concatenated and
# passed through a stack of fully connected layers.
# - Hidden layers use ReLU; without an activation the stack is one linear map.
# - The output uses softmax because the model is compiled with
#   categorical_crossentropy, which expects class probabilities.
# - The output width follows the fitted label encoder instead of a fixed 12.
# NOTE(review): the input widths 960 / 400 / 81 are hard-coded -- confirm they
# match the feature groups that are fed to the model.
num_classes = len(le.classes_)

input1 = Input(shape=(960,))
input2 = Input(shape=(400,))
input3 = Input(shape=(81,))
merged_inputs = Concatenate()([input1, input2, input3])  # avoids shadowing builtin input()
x = Dense(500, activation='relu')(merged_inputs)
x = Dense(250, activation='relu')(x)
x = Dense(100, activation='relu')(x)
x = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=[input1, input2, input3], outputs=x)
model.summary()

In [0]:
# Compile with Adam and categorical cross-entropy (targets are one-hot encoded).
model.compile(optimizer = Adam(learning_rate=0.001),loss = 'categorical_crossentropy',metrics=['accuracy'])

In [0]:
# One-hot encode the targets. The width is pinned to the number of encoder
# classes so both splits have the same shape even when a split is missing the
# highest class code.
y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=len(le.classes_))
y_validation_cat = tf.keras.utils.to_categorical(y_validation, num_classes=len(le.classes_))

In [0]:
# Train for 10 epochs on three column groups of X_train, validating on the same groups of X_validation.
# NOTE(review): `column_mapping`, `char_col`, `par_col` and `rest_col` are not
# defined anywhere in the visible notebook -- this cell depends on hidden kernel state.
model.fit([X_train[[column_mapping[col] for col in char_col]], X_train[[column_mapping[col] for col in par_col]], X_train[[column_mapping[col] for col in rest_col]]], y_train_cat,epochs=10,validation_data=([X_validation[[column_mapping[col] for col in char_col]], X_validation[[column_mapping[col] for col in par_col]], X_validation[[column_mapping[col] for col in rest_col]]],y_validation_cat))

In [0]:
# Model outputs for the test split, using the same three column groups as training.
# NOTE(review): `column_mapping`, `char_col`, `par_col` and `rest_col` are not
# defined anywhere in the visible notebook -- this cell depends on hidden kernel state.
predicted = model.predict([X_test[[column_mapping[col] for col in char_col]], X_test[[column_mapping[col] for col in par_col]], X_test[[column_mapping[col] for col in rest_col]]])

In [0]:
# Predicted class = index of the largest output per row (vectorised argmax).
pred_labels = np.argmax(predicted, axis=1).tolist()

In [0]:
# Per-class metrics on the test split, reported on the encoded class ids.
print(classification_report(y_test,pred_labels))

#### 6.3. Sherlock Model

In [0]:
!pip install tensorflow

In [0]:
from sherlock.deploy.model import SherlockModel

In [0]:
# Identifier of the retrained model. It must be defined before the
# SherlockModel class below, which uses it as a default argument value.
model_id = "retrained_sherlock"

In [0]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import (
    Input,
    Dense,
    Dropout,
    BatchNormalization,
    concatenate,
)
from tensorflow.keras.models import Model, model_from_json

from sherlock.deploy import helpers


class SherlockModel:
    """Multi-input Keras classifier over char / word / par / rest feature groups.

    Defining this class in the notebook rebinds the name ``SherlockModel`` that
    an earlier cell imports from ``sherlock.deploy.model``.

    After ``fit`` or ``initialize_model_from_json`` the compiled Keras model is
    available as ``self.model``.
    """

    def __init__(self):
        self.lamb = 0.0001  # L2 kernel-regularisation factor of the Dense layers
        self.do = 0.35  # dropout rate
        self.lr = 0.0001  # Adam learning rate

        # Directory holding ``<model_id>_model.json`` and ``<model_id>_weights.h5``.
        self.model_files_directory = "model_files/"

    def fit(
        self, X_train: pd.DataFrame, y_train, X_val: pd.DataFrame, y_val, model_id: str
    ):
        """Build, compile and train the network; the result is kept in ``self.model``.

        Parameters
        ----------
        X_train, X_val
            Feature frames containing every column listed by
            ``helpers.categorize_features()``.
        y_train, y_val
            Class labels for the two frames. The label encoder is fitted on
            ``y_train`` only.
        model_id
            Identifier passed on to ``helpers._get_categorical_label_encodings``.

        Raises
        ------
        ValueError
            If ``model_id`` is ``"sherlock"``.
        """
        if model_id == "sherlock":
            raise ValueError(
                "`model_id` cannot be `sherlock` to avoid overwriting the original model weights."
            )
        num_classes = len(set(y_train))

        encoder = LabelEncoder()
        encoder.fit(y_train)

        feature_cols = helpers.categorize_features()

        X_train_char = X_train[feature_cols["char"]]
        X_train_word = X_train[feature_cols["word"]]
        X_train_par = X_train[feature_cols["par"]]
        X_train_rest = X_train[feature_cols["rest"]]
        X_val_char = X_val[feature_cols["char"]]
        X_val_word = X_val[feature_cols["word"]]
        X_val_par = X_val[feature_cols["par"]]
        X_val_rest = X_val[feature_cols["rest"]]

        y_train_int = encoder.transform(y_train)
        y_val_int = encoder.transform(y_val)
        # Pin the one-hot width to the number of training classes so the
        # validation targets match the output layer even when the validation
        # split does not contain the highest class code.
        y_train_cat = tf.keras.utils.to_categorical(y_train_int, num_classes=num_classes)
        y_val_cat = tf.keras.utils.to_categorical(y_val_int, num_classes=num_classes)

        callbacks = [EarlyStopping(monitor="val_loss", patience=5)]

        char_model_input, char_model = self._build_char_submodel(X_train_char.shape[1])
        word_model_input, word_model = self._build_word_submodel(X_train_word.shape[1])
        par_model_input, par_model = self._build_par_submodel(X_train_par.shape[1])
        rest_model_input, rest_model = self._build_rest_submodel(X_train_rest.shape[1])

        # Merge submodels and build main network
        merged_model1 = concatenate([char_model, word_model, par_model, rest_model])

        merged_model_output = self._add_main_layers(merged_model1, num_classes)

        model = Model(
            [char_model_input, word_model_input, par_model_input, rest_model_input],
            merged_model_output,
        )

        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr),
            loss="categorical_crossentropy",
            metrics=["categorical_accuracy"],
        )

        model.fit(
            [
                X_train_char.values,
                X_train_word.values,
                X_train_par.values,
                X_train_rest.values,
            ],
            y_train_cat,
            validation_data=(
                [
                    X_val_char.values,
                    X_val_word.values,
                    X_val_par.values,
                    X_val_rest.values,
                ],
                y_val_cat,
            ),
            callbacks=callbacks,
            epochs=10,
            batch_size=256,
        )

        self.model = model

        _ = helpers._get_categorical_label_encodings(y_train, y_val, model_id)

    def custom_predict(self, X: pd.DataFrame, model_id: str = model_id) -> np.array:
        """Use sherlock model to generate predictions for X.

        The default ``model_id`` is the value the module-level ``model_id``
        variable had when this class was defined.

        Parameters
        ----------
        X
            Featurized dataframe to generate predictions for.
        model_id
            ID of the model used for generating predictions.
        Returns
        -------
        Array with predictions for X.
        """
        y_pred = self.predict_proba(X, model_id)
        y_pred_classes = helpers._proba_to_classes(y_pred, model_id)

        return y_pred_classes

    def predict_proba(self, X: pd.DataFrame, model_id: str = "sherlock") -> np.array:
        """Use sherlock model to generate predictions for X.

        ``model_id`` is accepted for interface compatibility; the prediction
        always comes from ``self.model``.

        Parameters
        ----------
        X
            Featurized data set to generate predictions for.
        model_id
            Identifier of a trained model to use for generating predictions.
        Returns
        -------
        Array with predictions for X.
        """
        feature_cols_dict = helpers.categorize_features()

        y_pred = self.model.predict(
            [
                X[feature_cols_dict["char"]].values,
                X[feature_cols_dict["word"]].values,
                X[feature_cols_dict["par"]].values,
                X[feature_cols_dict["rest"]].values,
            ]
        )

        return y_pred

    def initialize_model_from_json(
        self, with_weights: bool, model_id: str = "sherlock"
    ):
        """Load model architecture and populate with pretrained weights.

        Parameters
        ----------
        with_weights
            Whether to populate the model with trained weights.
        model_id
            The ID of the model file to build, defaults to `sherlock` for using the
            sherlock model with the original weights.

        Raises
        ------
        ValueError
            If the architecture file, or the weights file when requested,
            does not exist in ``self.model_files_directory``.
        """
        model_filename = os.path.join(
            self.model_files_directory, f"{model_id}_model.json"
        )
        if not os.path.exists(model_filename):
            raise ValueError(
                f"""
                No model file associated with this ID: {model_id}, was found.
                The desired model should be specified and stored first before it can be used.
                """
            )

        # Context manager closes the file even if parsing the JSON fails.
        with open(model_filename, "r") as file:
            model = model_from_json(file.read())

        if with_weights:
            weights_filename = os.path.join(
                self.model_files_directory, f"{model_id}_weights.h5"
            )
            if not os.path.exists(weights_filename):
                raise ValueError(
                    f"""
                    There are no weights associated with this model ID: {model_id}.
                    The desired model should be trained first before it can be initialized.
                    """
                )
            model.load_weights(weights_filename)

        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr),
            loss="categorical_crossentropy",
            metrics=["categorical_accuracy"],
        )

        self.model = model

    def store_weights(self, model_id: str):
        """Save the weights of ``self.model`` as ``<model_id>_weights.h5``.

        Raises
        ------
        ValueError
            If ``model_id`` is ``"sherlock"``.
        """
        if model_id == "sherlock":
            raise ValueError(
                "Cannot save model weights with `sherlock` model ID. Choose an alternative."
            )

        weights_filename = os.path.join(
            self.model_files_directory, f"{model_id}_weights.h5"
        )

        self.model.save_weights(weights_filename)

    def _build_dense_submodel(self, n_inputs, n_weights):
        """Build Input -> BatchNorm -> Dense -> Dropout -> Dense; return (input, output)."""
        submodel_input = Input(shape=(n_inputs,))
        normalised = BatchNormalization(axis=1)(submodel_input)
        hidden = Dense(
            n_weights,
            activation=tf.nn.relu,
            kernel_regularizer=tf.keras.regularizers.l2(self.lamb),
        )(normalised)
        dropped = Dropout(self.do)(hidden)
        submodel_output = Dense(
            n_weights,
            activation=tf.nn.relu,
            kernel_regularizer=tf.keras.regularizers.l2(self.lamb),
        )(dropped)

        return submodel_input, submodel_output

    def _build_char_submodel(self, char_shape):
        """Sub-network for the char features (300 units per Dense layer)."""
        return self._build_dense_submodel(char_shape, 300)

    def _build_word_submodel(self, word_shape):
        """Sub-network for the word features (200 units per Dense layer)."""
        return self._build_dense_submodel(word_shape, 200)

    def _build_par_submodel(self, par_shape):
        """Sub-network for the par features (400 units per Dense layer)."""
        return self._build_dense_submodel(par_shape, 400)

    def _build_rest_submodel(self, rest_shape):
        """Sub-network for the remaining features: batch normalisation only."""
        rest_model_input = Input(shape=(rest_shape,))
        rest_model1 = BatchNormalization(axis=1)(rest_model_input)

        return rest_model_input, rest_model1

    def _add_main_layers(self, merged_model1, num_classes):
        """Stack the shared head on the merged sub-networks.

        BatchNorm -> Dense(500) -> Dropout -> Dense(500) -> softmax over
        ``num_classes``; returns the output tensor.
        """
        n_weights = 500

        merged_model2 = BatchNormalization(axis=1)(merged_model1)
        merged_model3 = Dense(
            n_weights,
            activation=tf.nn.relu,
            kernel_regularizer=tf.keras.regularizers.l2(self.lamb),
        )(merged_model2)
        merged_model4 = Dropout(self.do)(merged_model3)
        merged_model5 = Dense(
            n_weights,
            activation=tf.nn.relu,
            kernel_regularizer=tf.keras.regularizers.l2(self.lamb),
        )(merged_model4)
        merged_model_output = Dense(
            num_classes,
            activation=tf.nn.softmax,
            kernel_regularizer=tf.keras.regularizers.l2(self.lamb),
        )(merged_model5)

        return merged_model_output

In [0]:
# Train the notebook-local SherlockModel on the train/validation split.
# NOTE(review): X_train's columns were renamed to feats_col_<i> earlier, while
# fit() selects columns by the names from helpers.categorize_features() --
# confirm the names match.
start = datetime.now()
print(f'Started at {start}')

# NOTE(review): this rebinds `model`, which earlier held the Keras multi-input model.
model = SherlockModel()
# model.initialize_model_from_json(with_weights=True, model_id="sherlock");

# Model will be stored with ID `model_id`
model.fit(X_train, y_train, X_validation, y_validation, model_id=model_id)

# NOTE(review): fit() does not write weights itself; they are written by the
# separate store_weights() call in the next cell.
print('Trained and saved new model.')
print(f'Finished at {datetime.now()}, took {datetime.now() - start} seconds')

In [0]:
# Save the trained weights as model_files/<model_id>_weights.h5.
model.store_weights(model_id=model_id)

In [0]:
# Predict classes for the test split, then lower-case each predicted label.
# NOTE(review): .lower() assumes the predicted classes are strings, but the
# model was fitted on the integer-encoded y_train -- confirm.
predicted_labels = model.custom_predict(X_test)
predicted_labels = np.array([x.lower() for x in predicted_labels])

In [0]:
# Distinct labels the model predicted on the test split.
set(predicted_labels)

In [0]:
print(f'prediction count {len(predicted_labels)}, type = {type(predicted_labels)}')

# `size` equals the full length of y_test, so the slices below select everything.
size=len(y_test)

# Should be fully deterministic too.
# NOTE(review): y_test holds integer class codes while predicted_labels holds
# lower-cased strings -- confirm both are in the same label space before scoring.
f1_score(y_test[:size], predicted_labels[:size], average="weighted")

In [0]:
# Per-class metrics for the Sherlock model on the test split (3 decimal places).
print(classification_report(y_test, predicted_labels, digits=3))

In [0]:
# Evaluate the rule-based classifier on the test split.
# ruleBasedPrediction requires the threshold dictionary as its second
# argument; `param_dict` is the threshold set built in the rule-simulation cell.
test_data['rule_datatype'] = test_data.apply(lambda x: ruleBasedPrediction(x, param_dict), axis=1)
print(classification_report(test_data['grouped_datatype'], test_data['rule_datatype']))

In [0]:
# Evaluate the rule-based classifier on the validation split.
# ruleBasedPrediction requires the threshold dictionary as its second
# argument; `param_dict` is the threshold set built in the rule-simulation cell.
valid_data['rule_datatype'] = valid_data.apply(lambda x: ruleBasedPrediction(x, param_dict), axis=1)
print(classification_report(valid_data['grouped_datatype'], valid_data['rule_datatype']))

In [0]:
# regex = re.compile(r"\[|\]|<", re.IGNORECASE)