## Train weighted LR model on t2dv2 data
- Features used in this model: 'aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized'
- The trained model is saved to disk so that it can be loaded later
- This part is not included in the actual table linker pipeline

## Dataset 
- Train: https://drive.google.com/drive/u/2/folders/1bUDGEI4gNEw6VY4Bquy0zy10ga-7Mb2M 
- Dev: https://drive.google.com/drive/u/2/folders/1RgxmFI7QGVh53Y_2OwvBk1Uyzu0Y2ftX

In [1]:
import numpy as np
import pandas as pd

In [22]:
# helper method we use
def clean_dataset(df, feature_columns=None):
    """Return the rows of ``df`` whose feature values are all finite.

    A row is discarded when any of the checked feature columns holds NaN,
    ``inf`` or ``-inf``. All columns of the surviving rows are kept.

    Args:
        df: DataFrame that contains at least the feature columns.
        feature_columns: Names of the columns to check. Defaults to the four
            features used by the weighted LR model ('aligned_pagerank',
            'smallest_qnode_number', 'monge_elkan',
            'des_cont_jaccard_normalized').

    Returns:
        The subset of ``df`` whose checked columns contain no NaN / inf values.

    Raises:
        AssertionError: If ``df`` is not a ``pd.DataFrame``.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    if feature_columns is None:
        feature_columns = ['aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized']

    print(f"length of df: {len(df)}")
    # Nothing has been dropped at this point; the message is kept unchanged so
    # the log stays comparable with earlier runs.
    print(f"length of df: {len(df)} after dropping na")

    invalid_cells = df.loc[:, list(feature_columns)].isin([np.nan, np.inf, -np.inf])
    # axis must be given by keyword: pandas 2.x no longer accepts it positionally.
    indices_to_keep = ~invalid_cells.any(axis=1)
    print(f"length of indices_to_keep: {indices_to_keep.sum()}")
    return df[indices_to_keep]

# merge all eval files in one df
def merge_df(file_names: list):
    """Read every CSV in ``file_names`` and stack them into one DataFrame.

    Each frame gets a ``table_id`` column holding the file's base name cut at
    its first ".csv". The row index of each file is kept as-is, so index
    values repeat across tables in the result.
    """
    def _load_with_table_id(path):
        frame = pd.read_csv(path)
        base_name = path.split('/')[-1]
        frame['table_id'] = base_name.split('.csv')[0]
        return frame

    return pd.concat([_load_with_table_id(path) for path in file_names])

#### Generate training data with 4 required features

In [18]:
!which tl

/Users/summ7t/dev/novartis/novartis_env/bin/tl


##### Script I used for generating vote files - voting_test_pipline.sh
```
# Usage: bash voting_test_pipline.sh <table-id>
# $1 is the table file name without the ".csv" extension.
filename=$1

# Run a chain of tl commands on <table-id>.csv from the candidates directory
# and redirect the result to votes/<table-id>.csv. The steps, in order:
#   smallest-qnode-number, align-page-rank,
#   string-similarity (symmetric_monge_elkan, word tokenizer) -> monge_elkan,
#   string-similarity (jaccard, word tokenizer) on kg_descriptions and
#   context -> des_cont_jaccard,
#   normalize-scores on des_cont_jaccard.
# NOTE(review): both paths below point at the t2dv2-dev-* directories, while
# the notebook cell that calls this script checks for output under
# t2dv2-train-score/votes/ -- confirm which directories were used when the
# training data was generated.
tl smallest-qnode-number /Users/summ7t/dev/novartis/table-linker/t2dv2-dev-candidates-dwd-v2/$filename.csv \
/ align-page-rank \
/ string-similarity -i --method symmetric_monge_elkan:tokenizer=word -o monge_elkan \
/ string-similarity -i --method jaccard:tokenizer=word -c kg_descriptions context -o des_cont_jaccard \
/ normalize-scores -c des_cont_jaccard \
> /Users/summ7t/dev/novartis/table-linker/t2dv2-dev-score/votes/$filename.csv
```

In [21]:
# List all train dataset files.
# file_names collects the path of every non-empty CSV found under the
# training candidates directory; file_ids holds the matching table ids
# (file name cut at ".csv") in the same order.
import os

file_names = []
file_ids = []

for (dirpath, dirnames, filenames) in os.walk('/Users/summ7t/dev/novartis/table-linker/t2dv2-train-candidates-dwd-v2/'):
    for fn in filenames:
        # Test the extension, not a substring: "csv" in fn would also accept
        # names such as "csv_notes.txt" or "table.csv.bak".
        if not fn.endswith('.csv'):
            continue
        # os.walk reports sub-directories without a trailing separator, so
        # plain string concatenation would build a wrong path for them.
        abs_fn = os.path.join(dirpath, fn)
        assert os.path.isfile(abs_fn)
        # Skip empty files; pd.read_csv cannot parse them.
        if os.path.getsize(abs_fn) == 0:
            continue
        file_names.append(abs_fn)
        file_ids.append(fn.split('.csv')[0])
len(file_names), file_ids[:3]

(44,
 ['58891288_0_1117541047012405958',
  '39173938_0_7916056990138658530',
  '10579449_0_1681126353774891032'])

In [None]:
# Run the vote/feature pipeline script once per training table and check that
# it produced the expected output file.
import os
import subprocess

# Iterate the training ids (file_ids): the message and the output check below
# both refer to the training set, whereas eval_file_ids is the dev-set list.
for idx, fid in enumerate(file_ids):
    print(f"Generating {idx}th training dataset: {fid}")
    # Argument list instead of a shell string: the id is passed verbatim, and
    # check=True raises CalledProcessError if the script exits non-zero.
    subprocess.run(
        ['bash', '/Users/summ7t/dev/novartis/table-linker/voting/voting_test_pipline.sh', fid],
        check=True,
    )
    assert os.path.isfile(f'/Users/summ7t/dev/novartis/table-linker/t2dv2-train-score/votes/{fid}.csv')

In [20]:
!ls /Users/summ7t/dev/novartis/table-linker/t2dv2-train-score/votes/ | wc -l

      44


In [None]:
# Concatenate all files listed in file_names into a single DataFrame; merge_df
# tags every row with the table id of the file it came from.
# NOTE(review): file_names is collected from t2dv2-train-candidates-dwd-v2/,
# but the feature columns used for training come from the scored files under
# t2dv2-train-score/votes/ -- confirm which directory this list should hold.
all_data = merge_df(file_names)

In [None]:
# Persist the merged training data without the index column.
# NOTE(review): this path is relative to the notebook's working directory,
# while the following cell reads the file back through an absolute path.
all_data.to_csv('./t2dv2-train-score/training_all_data_v2.csv', index=False)

In [5]:
# get training data
# Reload the merged training CSV from disk (replacing any in-memory all_data)
# and display its first rows.
all_data = pd.read_csv('/Users/summ7t/dev/novartis/table-linker/t2dv2-train-score/training_all_data_v2.csv')
all_data.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,aligned_pagerank,monge_elkan.1,des_cont_jaccard.1,des_cont_jaccard_normalized,vote_aligned_pagerank,vote_smallest_qnode_number,vote_monge_elkan,vote_des_cont_jaccard,votes,table_id
0,1,0,The Godfather,1|1972|Francis Ford Coppola|1,The Godfather,Q13033687,The Godfather,,exact-match,Grimm,...,7.11985e-09,1.0,0.0,0.0,0,0,1,0,1,58891288_0_1117541047012405958
1,1,0,The Godfather,1|1972|Francis Ford Coppola|1,The Godfather,Q1158135,The Godfather,,exact-match,soundtrack of the 1972 crime film of the same ...,...,5.141773e-09,1.0,0.125,0.25,0,0,1,0,1,58891288_0_1117541047012405958
2,1,0,The Godfather,1|1972|Francis Ford Coppola|1,The Godfather,Q1139696,The Godfather,The Godfather: The Game|The Godfather: Mob War...,exact-match,2006 open world action-adventure video game,...,5.155339e-09,1.0,0.0,0.0,0,0,1,0,1,58891288_0_1117541047012405958
3,1,0,The Godfather,1|1972|Francis Ford Coppola|1,The Godfather,Q1066512,Charles Wright,The Godfather|Papa Shango,exact-match,American professional wrestler,...,3.811774e-09,0.499339,0.0,0.0,0,0,0,0,0,58891288_0_1117541047012405958
4,1,0,The Godfather,1|1972|Francis Ford Coppola|1,The Godfather,Q20655440,The Godfather,,exact-match,novel series,...,8.133001e-09,1.0,0.0,0.0,0,0,1,0,1,58891288_0_1117541047012405958


#### Prepare features for training

In [6]:
# Keep only the four model features plus the ground-truth label.
training_data = all_data.loc[:, ['aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized', 'evaluation_label']]
# Binarise the label: every row whose label (cast to int) is not 1 becomes -1.
training_data.loc[training_data['evaluation_label'].astype(int) != 1, 'evaluation_label'] = -1
# Drop rows that have NaN or infinite values in any of the feature columns.
training_data = clean_dataset(training_data)

length of df: 774370
length of df: 774370 after dropping na
length of indices_to_keep: 774292


In [7]:
# balance the positive / negative cases
# Keep every positive row and add a random sample of at most 9500 negatives.
positive_rows = training_data[training_data['evaluation_label'] == 1]
negative_rows = training_data[training_data['evaluation_label'] != 1]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows in
# the same order. The sample size is capped so the cell does not raise when
# fewer than 9500 negatives exist, and the seed makes the draw repeatable.
balanced_training_data = pd.concat([
    positive_rows,
    negative_rows.sample(min(9500, len(negative_rows)), random_state=0),
])

In [8]:
# Split the balanced data into the feature matrix and the target.
training_features = balanced_training_data.loc[:, ['aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized']]
# The label is selected with a one-element list, so training_labels is a
# single-column DataFrame rather than a Series.
training_labels = balanced_training_data.loc[:, ['evaluation_label']]

#### Train classifier using training data

In [9]:
# train model
from sklearn.linear_model import LogisticRegression

# Class weights: label 1 counts 0.7 and label -1 counts 0.3 in the loss.
lr = LogisticRegression(class_weight={-1:0.3, 1:0.7})
# Train our classifier.
# fit() expects a 1-D target, but training_labels is a single-column
# DataFrame; flatten it so scikit-learn does not have to convert (and warn).
model = lr.fit(training_features, np.ravel(training_labels))

  return f(*args, **kwargs)


In [10]:
# dump trained model to disk
import pickle

# Writes weighted_lr.pkl into the current working directory, overwriting any
# existing file of that name.
with open('weighted_lr.pkl', 'wb') as fid:
    pickle.dump(model, fid)

In [11]:
# load model from disk
# NOTE: unpickling can execute arbitrary code; only load a weighted_lr.pkl
# that was produced by this notebook or comes from a trusted source.
with open('weighted_lr.pkl', 'rb') as fid:
    model_loaded = pickle.load(fid)

#### Test using dev dataset

In [12]:
# List all dev (evaluation) vote files.
# eval_file_names collects the path of every non-empty CSV found under the dev
# votes directory; eval_file_ids holds the matching table ids (file name cut
# at ".csv") in the same order.
import os

eval_file_names = []
eval_file_ids = []

for (dirpath, dirnames, filenames) in os.walk('/Users/summ7t/dev/novartis/table-linker/t2dv2-dev-score/votes/'):
    for fn in filenames:
        # Test the extension, not a substring: "csv" in fn would also accept
        # names such as "csv_notes.txt" or "table.csv.bak".
        if not fn.endswith('.csv'):
            continue
        # os.walk reports sub-directories without a trailing separator, so
        # plain string concatenation would build a wrong path for them.
        abs_fn = os.path.join(dirpath, fn)
        assert os.path.isfile(abs_fn)
        # Skip empty files; pd.read_csv cannot parse them.
        if os.path.getsize(abs_fn) == 0:
            continue
        eval_file_names.append(abs_fn)
        eval_file_ids.append(fn.split('.csv')[0])
len(eval_file_names), eval_file_ids[:3]

(9,
 ['39759273_0_1427898308030295194',
  '45073662_0_3179937335063201739',
  '29414811_2_4773219892816395776'])

In [13]:
# generate testing data
# Merge all dev vote files; each row carries the table_id of its source file.
testing_data = merge_df(eval_file_names)
# Binarise the label the same way as for training: anything that is not 1
# (after casting to int) becomes -1.
testing_data.loc[testing_data['evaluation_label'].astype(int) != 1, 'evaluation_label'] = -1
# Drop rows that have NaN or infinite values in any of the feature columns.
testing_data = clean_dataset(testing_data)
# Feature matrix and label column over the whole dev set.
testing_features = testing_data.loc[:, ['aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized']]
testing_labels = testing_data.loc[:, ['evaluation_label']]

length of df: 101127
length of df: 101127 after dropping na
length of indices_to_keep: 101125


In [14]:
# Score every dev table separately with the reloaded model. res maps each
# table id to a copy of that table's rows extended with three columns:
# pred (predicted label) and prob_0 / prob_1 (first and second column of
# predict_proba).
res = {}
for tid in eval_file_ids:
    table_rows = testing_data.loc[testing_data['table_id'] == tid]
    table_features = table_rows[['aligned_pagerank', 'smallest_qnode_number', 'monge_elkan', 'des_cont_jaccard_normalized']]

    predicted_labels = model_loaded.predict(table_features)
    # Sanity check: exactly one prediction per candidate row.
    assert len(predicted_labels) == len(table_rows)

    class_probabilities = model_loaded.predict_proba(table_features)

    # assign() works on a copy, so testing_data itself stays untouched.
    res[tid] = table_rows.assign(
        pred=predicted_labels,
        prob_0=class_probabilities[:, 0],
        prob_1=class_probabilities[:, 1],
    )

In [17]:
res['39759273_0_1427898308030295194']

Unnamed: 0,column,row,label,context,label_clean,kg_id,kg_labels,kg_aliases,method,kg_descriptions,...,des_cont_jaccard_normalized,vote_aligned_pagerank,vote_smallest_qnode_number,vote_monge_elkan,vote_des_cont_jaccard,votes,table_id,pred,prob_0,prob_1
0,1,0,The Social Network,1|2010|David Fincher|8.3|45993,The Social Network,Q16353709,The Social Network,,exact-match,Wikimedia disambiguation page,...,0.000000,0,0,1,0,1,39759273_0_1427898308030295194,1,0.216134,0.783866
1,1,0,The Social Network,1|2010|David Fincher|8.3|45993,The Social Network,Q185888,The Social Network,Social Network,exact-match,2010 film by David Fincher,...,0.800000,0,1,1,1,3,39759273_0_1427898308030295194,1,0.000005,0.999995
2,1,0,The Social Network,1|2010|David Fincher|8.3|45993,The Social Network,Q1952928,The Social Network,,exact-match,2010 soundtrack album by Trent Reznor and Atti...,...,0.222222,1,0,1,0,2,39759273_0_1427898308030295194,1,0.033429,0.966571
3,1,0,The Social Network,1|2010|David Fincher|8.3|45993,The Social Network,Q185888,The Social Network,Social Network,fuzzy-augmented,2010 film by David Fincher,...,0.800000,0,1,1,1,3,39759273_0_1427898308030295194,1,0.000005,0.999995
4,1,0,The Social Network,1|2010|David Fincher|8.3|45993,The Social Network,Q1952928,The Social Network,,fuzzy-augmented,2010 soundtrack album by Trent Reznor and Atti...,...,0.222222,0,0,1,0,1,39759273_0_1427898308030295194,1,0.033429,0.966571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11988,1,99,Casablanca,100|1942|Michael Curtiz|8.8|182487,Casablanca,Q1046658,Casa Blanca,,fuzzy-augmented,human settlement,...,0.000000,0,0,0,0,0,39759273_0_1427898308030295194,-1,0.695099,0.304901
11989,1,99,Casablanca,100|1942|Michael Curtiz|8.8|182487,Casablanca,Q2160914,,,fuzzy-augmented,,...,0.000000,0,0,0,0,0,39759273_0_1427898308030295194,-1,0.999802,0.000198
11990,1,99,Casablanca,100|1942|Michael Curtiz|8.8|182487,Casablanca,Q649078,Casablanca Prefecture,Casablanca Province,fuzzy-augmented,prefecture of Morocco,...,0.000000,0,0,0,0,0,39759273_0_1427898308030295194,-1,0.551121,0.448879
11991,1,99,Casablanca,100|1942|Michael Curtiz|8.8|182487,Casablanca,Q844124,Government House of the Russian Federation,White House (Moscow),fuzzy-augmented,government building in Moscow,...,0.000000,0,0,0,0,0,39759273_0_1427898308030295194,-1,0.984839,0.015161


In [None]:
# for each dev table: write out to votes-model/
import os

output_dir = '/Users/summ7t/dev/novartis/table-linker/t2dv2-dev-score/pipeline/votes-model'
# Create the target directory up front so the first write cannot fail on a
# missing folder.
os.makedirs(output_dir, exist_ok=True)

# to_csv returns None when given a path, so its result is not kept.
for tid, scored_df in res.items():
    scored_df.to_csv(f'{output_dir}/{tid}.csv', index=False)