Follow the Sherlock data repo for complete installation instructions: https://github.com/mitmedialab/sherlock-project/tree/8d6411d793dfcfacae0bd300b806e023d0644e95
        

In [None]:
import pandas as pd
import numpy as np
import sys
import tensorflow as tf
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import re
sys.path.append("..")

In [None]:
from src.features.build_features import build_features
from src.features.build_features import _get_data
from src.deploy.train_sherlock import train_sherlock
from src.deploy.predict_sherlock import predict_sherlock

In [None]:
# Run the Sherlock project's private data helper before building features.
# NOTE(review): presumably fetches/prepares the Sherlock data and model files —
# confirm against src/features/build_features.py.
_get_data()

In [None]:
# Read the labelled benchmark test split and the metadata CSV.
testdf = pd.read_csv('../../Benchmark-Labeled-Data/data_test.csv')
test_metadata = pd.read_csv('../../RawCSV/Metadata/meta_data.csv')

# Join the two tables on their shared 'Record_id' key (inner join by default),
# then keep the ground-truth labels as a plain Python list.
test_merged = testdf.merge(test_metadata, on='Record_id')
y_true = test_merged['y_act'].values.tolist()

In [None]:
# Create an (object-dtype) placeholder column; a later cell overwrites each
# cell with the list of raw values for that row's column.
test_merged['list_vals'] = ""
# Bare expression: displays the frame when run as a notebook cell.
test_merged

In [None]:
# For every benchmark row, open the raw CSV it refers to and store the values
# of the referenced column in 'list_vals' (empty list if the column is absent).
for record in test_merged.itertuples():
    position = record.Index
    if position % 100 == 0:
        print(position)  # coarse progress indicator

    wanted_column = record.Attribute_name
    source_path = '../../RawCSV/RawCSVFiles/' + record.name

    raw_frame = pd.read_csv(source_path, encoding='latin1')

    try:
        column_values = raw_frame[wanted_column].tolist()
    except KeyError:
        # The attribute name has no matching column in the raw file.
        column_values = []
    test_merged.at[position, 'list_vals'] = column_values

In [None]:
def _fill_missing(values):
    """Return the column values with missing entries made explicit.

    An empty list becomes ``[""]``; otherwise every NaN entry is replaced by
    the string ``"0"`` and all other entries are passed through unchanged.
    """
    if values == []:
        return [""]
    # NaN is the only value that compares unequal to itself.
    return ["0" if value != value else value for value in values]


# Take an explicit copy so the writes below never target a slice of
# test_merged (avoids pandas' chained-assignment warning/ambiguity).
sherlock_df = test_merged[["list_vals", "y_act"]].copy()

# The previous implementation also probed every value with int()/float()
# under bare excepts, but both outcomes of that probe stored the value
# unchanged, so the probe was dead code and has been removed.
sherlock_df["list_vals"] = sherlock_df["list_vals"].apply(_fill_missing)

sherlock_df

In [None]:
# Keep only the Series of per-column value lists; this is what gets passed to
# build_features in the next cell.
sherlock_df1 = sherlock_df['list_vals']
# Bare expression: displays the Series when run as a notebook cell.
sherlock_df1

In [None]:
# Build the model input from the per-column value lists.
# NOTE(review): build_features is a Sherlock project helper; its exact output
# format is not visible here — confirm in src/features/build_features.py.
X_test = build_features(sherlock_df1)

In [None]:
# Run the Sherlock predictor identified by nn_id='sherlock' on the features.
# NOTE(review): presumably returns one semantic-type label per input column —
# confirm in src/deploy/predict_sherlock.py.
predicted_labels = predict_sherlock(X_test, nn_id='sherlock')
# Bare expression: displays the predictions when run as a notebook cell.
predicted_labels

In [None]:
# Put the predictions in a one-column frame so they can be attached by index.
label_df = pd.DataFrame(predicted_labels, columns=['label'])

sample_cols = ['sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5']
profile_cols = ['y_act', 'Attribute_name', *sample_cols,
                'total_vals', 'num_nans', '%_nans', 'num_of_dist_val', '%_dist_val']

# Select the ground truth, samples and descriptive statistics; missing cells
# are replaced by 0.
df = test_merged[profile_cols].fillna(0)

df['label'] = label_df['label']
# Concatenate the five sample values of each row into one '$#$'-separated string.
df['ColumnA'] = df[sample_cols].apply(
    lambda samples: '$#$'.join(samples.dropna().astype(str)),
    axis=1,
)


In [None]:
# Load the semantic-type -> feature-type mapping table. The next cell reads
# its 'type' column and the candidate columns 'l0'..'l3'.
curdf = pd.read_csv('Semantic2FeatureType_Mapping.csv') # Load Semantic Types to Feature Type Mappings

In [None]:
# Map each semantic type to its list of candidate feature types, taken from
# the non-NaN cells of l0..l3. Only the first row seen for a type is used.
curdic = {}
for _, mapping_row in curdf.iterrows():
    semantic_type = mapping_row['type']
    if semantic_type in curdic:
        continue
    curdic[semantic_type] = []
    for level in ('l0', 'l1', 'l2', 'l3'):
        candidate = mapping_row[level]
        # NaN != NaN, so this keeps only populated cells.
        if candidate == candidate:
            curdic[semantic_type].append(candidate)

In [None]:
# Matches a single list delimiter: comma, semicolon or pipe.
delimeters = re.compile(r"(,|;|\|)")

# Matches digits immediately followed by letters, spaces, '%' or '$'
# (e.g. "12kg", "5 %"), used to spot numbers embedded in text.
del_pattern = r"\b[0-9]+[a-zA-Z \% \$]+"
del_reg = re.compile(del_pattern)

def func(lst):
    """Return 1 if every '$#$'-separated token of *lst* parses as a float, else 0.

    Args:
        lst: A string of sample values joined with the '$#$' separator.

    Returns:
        1 when ``float()`` accepts every token, 0 as soon as one token is
        rejected. Note that ``float()`` also accepts spellings such as
        "nan" and "inf", so those count as numeric here.
    """
    for token in lst.split('$#$'):
        try:
            float(token)
        except ValueError:
            # A single non-numeric sample makes the whole column non-numeric.
            return 0
    return 1

# Flag rows whose five joined samples are all parseable as numbers (1) or not (0).
df['isNumeric'] = df['ColumnA'].apply(func)


In [None]:
# Translate each predicted semantic type into a feature type.
# Semantic types with exactly one candidate in curdic map directly; the
# ambiguous ones are resolved with per-label rules over the sampled values.
# Rows whose label matches no rule contribute to neither list.
ysherlock = []
yact = []

SAMPLE_COLS = ('sample_1', 'sample_2', 'sample_3', 'sample_4', 'sample_5')


def _has_embedded_number(record):
    """Return True if any sample starts with digits followed by text/units."""
    return any(del_reg.match(str(record[col])) for col in SAMPLE_COLS)


for _, record in df.iterrows():
    label = record['label']
    candidates = curdic[label]

    if len(candidates) == 1:
        guess = candidates[0]

    elif label in ('age', 'result', 'plays', 'ranking'):
        if record['isNumeric'] == 1:
            guess = 'Numeric'
        elif _has_embedded_number(record):
            guess = 'Embedded Number'
        else:
            guess = 'Categorical'

    elif label in ('sales', 'rank', 'elevation', 'weight'):
        guess = 'Numeric' if record['isNumeric'] == 1 else 'Embedded Number'

    elif label in ('area', 'position', 'depth'):
        guess = 'Numeric' if record['isNumeric'] == 1 else 'Categorical'

    elif label in ('command',):
        # Average number of space-separated words across the five samples.
        word_counts = [len(str(record[col]).split(' ')) for col in SAMPLE_COLS]
        guess = 'Sentence' if np.mean(word_counts) > 3 else 'Categorical'

    elif label in ('code',):
        # (Almost) all-distinct, constant, or all-NaN columns.
        if (record['%_dist_val'] > 99.99
                or record['num_of_dist_val'] == 1
                or record['total_vals'] == record['num_nans']):
            guess = 'Not-Generalizable'
        else:
            guess = 'Categorical'

    elif label in ('day', 'duration', 'year'):
        # Only the first sample is probed for being parseable as a timestamp.
        try:
            pd.Timestamp(record['sample_1'])
            guess = 'Datetime'
        except ValueError:
            guess = 'Categorical'

    elif label in ('order',):
        guess = 'Context-Specific' if record['isNumeric'] == 1 else 'Categorical'

    elif label in ('range',):
        guess = 'Embedded Number' if _has_embedded_number(record) else 'Categorical'

    elif label in ('genre', 'collection'):
        # More than one delimiter in the first sample suggests a list value.
        if len(delimeters.findall(str(record['sample_1']))) > 1:
            guess = 'List'
        else:
            guess = 'Categorical'

    else:
        # No rule for this label: skip the row in both lists.
        continue

    ysherlock.append(guess)
    yact.append(record['y_act'])

In [None]:
# Number of rows that received a feature-type prediction (rows whose label
# matched no rule were skipped); displayed when run as a notebook cell.
len(ysherlock)

In [None]:
# Integer code of each feature type is its position in this list.
feature_type_names = [
    'Numeric',
    'Categorical',
    'Datetime',
    'Sentence',
    'URL',
    'Embedded Number',
    'List',
    'Not-Generalizable',
    'Custom Object',
]
dict_label = {name: code for code, name in enumerate(feature_type_names)}
# 'Context-Specific' shares code 8 with 'Custom Object'.
dict_label['Context-Specific'] = dict_label['Custom Object']

# Encode the predicted feature-type names as their integer codes.
ysherlock1 = [dict_label[feature_type] for feature_type in ysherlock]

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Score the integer-encoded predictions (ysherlock1) against the benchmark
# labels collected alongside them (yact).
# NOTE(review): assumes y_act uses the same integer coding as dict_label — confirm.
print(accuracy_score(yact, ysherlock1))
# scikit-learn convention: rows are true labels, columns are predicted labels.
print(confusion_matrix(yact, ysherlock1))