### Reference: adapted from https://github.com/google-research/tapas/blob/master/notebooks/sqa_predictions.ipynb

## Clone the TAPAS GitHub repository and install it (run on a GPU: change the runtime type to GPU)

In [1]:
# Fetch the TAPAS sources into ./tapas; later cells install from this checkout and run its scripts.
! git clone https://github.com/google-research/tapas.git

Cloning into 'tapas'...
remote: Enumerating objects: 221, done.[K
remote: Counting objects: 100% (221/221), done.[K
remote: Compressing objects: 100% (150/150), done.[K
remote: Total 221 (delta 113), reused 170 (delta 64), pack-reused 0[K
Receiving objects: 100% (221/221), 201.70 KiB | 11.21 MiB/s, done.
Resolving deltas: 100% (113/113), done.


In [2]:
# Install the TAPAS package (and the dependencies it declares) from the local ./tapas checkout.
!pip install ./tapas

Processing ./tapas
Collecting apache-beam[gcp]==2.20.0
[?25l  Downloading https://files.pythonhosted.org/packages/4b/0d/0979ad626578a52887f7df60492ac6759089a9da261ac4c88b112b3f6a5a/apache_beam-2.20.0-cp36-cp36m-manylinux1_x86_64.whl (3.5MB)
[K     |████████████████████████████████| 3.5MB 4.9MB/s 
[?25hCollecting frozendict==1.2
  Downloading https://files.pythonhosted.org/packages/4e/55/a12ded2c426a4d2bee73f88304c9c08ebbdbadb82569ebdd6a0c007cfd08/frozendict-1.2.tar.gz
Collecting tensorflow~=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/3d/be/679ce5254a8c8d07470efb4a4c00345fae91f766e64f1c2aece8796d7218/tensorflow-2.2.0-cp36-cp36m-manylinux2010_x86_64.whl (516.2MB)
[K     |████████████████████████████████| 516.2MB 30kB/s 
[?25hCollecting tf-models-official~=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/99/8e/6db83bab2f86475fa69289848379f642746314131527d8a4ced47a6396af/tf_models_official-2.2.2-py2.py3-none-any.whl (711kB)
[K     |█████████████

## Fetch the pretrained model from Google Cloud Storage

In [1]:
# Download the SQA base model archive and unzip it; later cells read the model files from tapas_sqa_base/.
!gsutil cp gs://tapas_models/2020_04_21/tapas_sqa_base.zip . && unzip tapas_sqa_base.zip

Copying gs://tapas_models/2020_04_21/tapas_sqa_base.zip...
/ [0 files][    0.0 B/  1.0 GiB]                                                Download already complete for ./tapas_sqa_base.zip component 1, skipping download but will run integrity checks.
Couldn't read download tracker file (/root/.gsutil/tracker-files/download_component_TRACKER_5ca8f4bd9c5c0e2c4409321719711beb1d2c2c7a.ip__JSON__2.etag): Expecting value: line 1 column 1 (char 0). Restarting download from scratch.
Couldn't read download tracker file (/root/.gsutil/tracker-files/download_component_TRACKER_74156aa3b89ec914b2490b153218b7c49beb7c40.ip__JSON__0.etag): Expecting value: line 1 column 1 (char 0). Restarting download from scratch.
Download already complete for ./tapas_sqa_base.zip component 3, skipping download but will run integrity checks.
- [1 files][  1.0 GiB/  1.0 GiB]                                                
Operation completed over 1 objects/1.0 GiB.                                      
Archive:  tap

## Imports

In [2]:
import tensorflow.compat.v1 as tf
import os 
import shutil
import csv
import pandas as pd
import IPython

# Limit TensorFlow's Python logger to records at ERROR level and above.
tf.get_logger().setLevel('ERROR')

In [3]:
from tapas.utils import tf_example_utils
from tapas.protos import interaction_pb2
from tapas.utils import number_annotation_utils
from tapas.scripts import prediction_utils

## Load checkpoint

In [4]:

# Create the output directories used by the prediction step.
for directory in ('results/sqa/tf_examples', 'results/sqa/model'):
  os.makedirs(directory, exist_ok=True)

# Write a checkpoint index file that names "model.ckpt-0" as the checkpoint.
with open('results/sqa/model/checkpoint', 'w') as checkpoint_index:
  checkpoint_index.write('model_checkpoint_path: "model.ckpt-0"')

# Copy the three files of the downloaded checkpoint under the "model.ckpt-0" name.
for suffix in ('.data-00000-of-00001', '.index', '.meta'):
  source_file = f'tapas_sqa_base/model.ckpt{suffix}'
  target_file = f'results/sqa/model/model.ckpt-0{suffix}'
  shutil.copyfile(source_file, target_file)

## Load custom data

In [None]:
# Colab only: mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [6]:
# Location of the placement dataset inside the mounted Drive.
path = "/content/drive/My Drive/Placement_Data.csv"
# Read the CSV into a DataFrame.
df = pd.read_csv(path)

In [7]:
# Replace missing salary values with 0 so the column holds no NaN before the string conversion.
df['salary'] = df['salary'].fillna(0)

In [8]:
# Convert every value to str: the table cells are later assigned to text fields of the interaction proto.
df = df.astype(str)

In [9]:
# Keep only the first 20 rows as the table to query
# (presumably to stay within the model's sequence-length limit - TODO confirm).
df = df.head(20)
df.head()
# Placement data downloaded from Kaggle.

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [10]:
#rename columns
col_names = ['Sr No', 'Gender','SSC Score','SSC Board','HSC Score','HSC Board','HSC Stream','Degree Score','Degree Stream',
             'Work Experience','Test Score','MBA Stream','MBA Score','Placement Status','Salary']
df.columns  = col_names

In [11]:
# Preview the frame again to confirm the new column headers.
df.head()

Unnamed: 0,Sr No,Gender,SSC Score,SSC Board,HSC Score,HSC Board,HSC Stream,Degree Score,Degree Stream,Work Experience,Test Score,MBA Stream,MBA Score,Placement Status,Salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [12]:
# Show (rows, columns) of the table that will be queried.
df.shape

(20, 15)

In [13]:
# Table as a list of rows: the header row first, then one list of cell values per record.
complete_list = [list(df.columns), *df.values.tolist()]


In [14]:
#complete_list

In [15]:
# Sequence length; the same value is reused below as the column-id and row-id limits.
max_seq_length = 512
# Vocabulary file shipped inside the downloaded model archive.
vocab_file = "tapas_sqa_base/vocab.txt"

# Conversion settings for the classifier example converter.
config = tf_example_utils.ClassifierConversionConfig(
    add_aggregation_candidates=False,
    strip_column_names=False,
    max_row_id=max_seq_length,
    max_column_id=max_seq_length,
    max_seq_length=max_seq_length,
    vocab_file=vocab_file,
)

# Converter used by convert_interactions_to_examples to build one example per question.
converter = tf_example_utils.ToClassifierTensorflowExample(config)

def convert_interactions_to_examples(tables_and_queries):
  """Calls the Tapas converter to turn each (table, queries) pair into examples.

  Args:
    tables_and_queries: iterable of `(table, queries)` pairs. `table` is a list
      of rows whose first row holds the column headers; every header and cell
      is assigned to a proto text field. `queries` is the list of question
      strings asked, in order, about that table.

  Yields:
    The value returned by `converter.convert(interaction, i)` for the i-th
    question of each interaction, in question order. A question whose
    conversion raises ValueError is reported on stdout and skipped.
  """
  for idx, (table, queries) in enumerate(tables_and_queries):
    interaction = interaction_pb2.Interaction()
    for position, query in enumerate(queries):
      question = interaction.questions.add()
      question.original_text = query
      question.id = f"{idx}-0_{position}"
    # The first row of the table carries the headers, the rest are data rows.
    for header in table[0]:
      interaction.table.columns.add().text = header
    for line in table[1:]:
      row = interaction.table.rows.add()
      for cell in line:
        row.cells.add().text = cell
    number_annotation_utils.add_numeric_values(interaction)

    for i, question in enumerate(interaction.questions):
      try:
        yield converter.convert(interaction, i)
      except ValueError as e:
        # interaction.id is never assigned in this function, so identify the
        # failing conversion by the id of the question instead.
        print(f"Can't convert interaction: {question.id} error: {e}")
        
def write_tf_example(filename, examples):
  """Writes each example to a TFRecord file at `filename`.

  Args:
    filename: path of the TFRecord file to create.
    examples: iterable of objects exposing `SerializeToString()`; consumed
      lazily, one item at a time. An empty iterable still opens the writer.
  """
  with tf.io.TFRecordWriter(filename) as record_writer:
    for serialized in (item.SerializeToString() for item in examples):
      record_writer.write(serialized)

def predict(table_data, queries):
  table = table_data # <----- pass the list directly
  examples = convert_interactions_to_examples([(table, queries)])
  write_tf_example("results/sqa/tf_examples/test.tfrecord", examples)
  write_tf_example("results/sqa/tf_examples/random-split-1-dev.tfrecord", [])
  
  ! python tapas/tapas/run_task_main.py \
    --task="SQA" \
    --output_dir="results" \
    --noloop_predict \
    --test_batch_size={len(queries)} \
    --tapas_verbosity="ERROR" \
    --compression_type= \
    --init_checkpoint="tapas_sqa_base/model.ckpt" \
    --bert_config_file="tapas_sqa_base/bert_config.json" \
    --mode="predict" 2> error


  results_path = "results/sqa/model/test_sequence.tsv"
  all_coordinates = []
  df = pd.DataFrame(table[1:], columns=table[0])
  display(IPython.display.HTML(df.to_html(index=False)))
  print()
  with open(results_path) as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in reader:
      coordinates = prediction_utils.parse_coordinates(row["answer_coordinates"])
      all_coordinates.append(coordinates)
      answers = ', '.join([table[row + 1][col] for row, col in coordinates])
      position = int(row['position'])
      print(">", queries[position])
      print(answers)
  return all_coordinates

In [16]:
# Ask four questions in sequence; the last two are follow-ups phrased with "it".
result = predict(
    complete_list,
    [
        "what is highest test score?",
        "what is highest Salary?",
        "What is MBA stream for it?",
        "What is degree stream for it",
    ],
)

i= 0
i= 1
i= 2
i= 3
is_built_with_cuda: True
is_gpu_available: True
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Training or predicting ...
Evaluation finished after training step 0.


Sr No,Gender,SSC Score,SSC Board,HSC Score,HSC Board,HSC Stream,Degree Score,Degree Stream,Work Experience,Test Score,MBA Stream,MBA Score,Placement Status,Salary
1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0
5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0
6,M,55.0,Others,49.8,Others,Science,67.25,Sci&Tech,Yes,55.0,Mkt&Fin,51.58,Not Placed,0.0
7,F,46.0,Others,49.2,Others,Commerce,79.0,Comm&Mgmt,No,74.28,Mkt&Fin,53.29,Not Placed,0.0
8,M,82.0,Central,64.0,Central,Science,66.0,Sci&Tech,Yes,67.0,Mkt&Fin,62.14,Placed,252000.0
9,M,73.0,Central,79.0,Central,Commerce,72.0,Comm&Mgmt,No,91.34,Mkt&Fin,61.29,Placed,231000.0
10,M,58.0,Central,70.0,Central,Commerce,61.0,Comm&Mgmt,No,54.0,Mkt&Fin,52.21,Not Placed,0.0



> what is highest test score?
96.8
> what is highest Salary?
0.0, 425000.0
> What is MBA stream for it?
Mkt&Fin
> What is degree stream for it
Comm&Mgmt
