In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding

In [2]:
questions = pd.read_csv('data/test_questions.csv')
answers = pd.read_csv('data/test_answers.csv')

In [34]:
number_of_qa = len(questions)

# Create a range from 1 to 100
range_ids = np.arange(number_of_qa)

# Create a meshgrid
col1, col2 = np.meshgrid(range_ids, range_ids)

# Flatten the arrays and create a DataFrame
df = pd.DataFrame({'question_row_id': col1.flatten(), 'answer_row_id': col2.flatten()})

# Display the DataFrame
df

Unnamed: 0,question_row_id,answer_row_id
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
266251,511,515
266252,512,515
266253,513,515
266254,514,515


In [4]:
questions.head()

Unnamed: 0,question_id,question,course,year,candidate_answers
0,707,How much of an effort would it be to use AWS i...,Data Engineering Zoomcamp,2023,33623233766925830447681767296
1,534450,Can you talk about linear regression and regul...,Machine Learning Zoomcamp,2022,23120828207286769573165138373
2,996163,Can you please explain the Python Black setup ...,Data Engineering Zoomcamp,2023,57189281655947681337669336232
3,860215,How many portfolio projects apart from the cou...,Machine Learning Zoomcamp,2022,643931988549918931235894608866
4,980124,Can you talk more about the final project? Wha...,Data Engineering Zoomcamp,2023,38438133766925830447681747722


In [5]:
answers.head()

Unnamed: 0,answer_id,answer,course,year,attachments_files
0,767296,Alexey\nProbably more than you want to put in....,Data Engineering Zoomcamp,2023,
1,573165,"Yes, I can. There is actually an entire module...",Machine Learning Zoomcamp,2022,
2,571892,Jeff\nI can try. I like Black a lot. If you ju...,Data Engineering Zoomcamp,2023,
3,988549,"Again, you’ll probably hate me soon for saying...",Machine Learning Zoomcamp,2022,
4,384381,Alexey\nThe first thing about the dataset – wh...,Data Engineering Zoomcamp,2023,


In [35]:
df['question_id'] = df['question_row_id'].map(questions['question_id'])
df['question'] = df['question_row_id'].map(questions['question'])
df['question_course'] = df['question_row_id'].map(questions['course'])
df['question_year'] = df['question_row_id'].map(questions['year'])

df['answer_id'] = df['answer_row_id'].map(answers['answer_id'])
df['answer'] = df['answer_row_id'].map(answers['answer'])
df['answer_course'] = df['answer_row_id'].map(answers['course'])
df['answer_year'] = df['answer_row_id'].map(answers['year'])

In [38]:
df = df[df['question_course'] == df['answer_course']]
df = df[df['question_year'] == df['answer_year']]
df.index = df.index.rename('idx')
df.drop(columns=['question_row_id', 'answer_row_id', 'question_id', 'answer_id', 'question_course', 'answer_course', 'question_year', 'answer_year'], inplace=True)

In [39]:
df

Unnamed: 0_level_0,question,answer
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0,How much of an effort would it be to use AWS i...,Alexey\nProbably more than you want to put in....
2,Can you please explain the Python Black setup ...,Alexey\nProbably more than you want to put in....
4,Can you talk more about the final project? Wha...,Alexey\nProbably more than you want to put in....
8,I read a book that said the data engineering l...,Alexey\nProbably more than you want to put in....
9,If we consider professional certification for ...,Alexey\nProbably more than you want to put in....
...,...,...
266245,How to get out of tutorial hell?,I need more information to actually help you h...
266246,Can you explain what exactly model.fit(x_train...,I need more information to actually help you h...
266247,What to do after courses? Where do you see us ...,I need more information to actually help you h...
266248,What would be the main skills/algorithms/tools...,I need more information to actually help you h...


In [20]:
raw_datasets = Dataset.from_pandas(df)
raw_datasets

Dataset({
    features: ['question', 'answer', 'idx'],
    num_rows: 137178
})

In [21]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(record):
    return tokenizer(record["question"], record["answer"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/137178 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 137178
})

In [49]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [53]:
dataset = tokenized_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=100,
)

In [51]:
model = TFAutoModelForSequenceClassification.from_pretrained('model.h5')

Some layers from the model checkpoint at model.h5 were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [54]:
preds = model.predict(dataset)

   2/1372 [..............................] - ETA: 11:16:24

In [12]:
from py_scripts.model_interface import predict_probabilities

In [14]:
df_test = df.head(100)
def get_pred(q, ans) -> int :
    return predict_probabilities(q, ans)[1]

df_test['prob'] = df_test.apply(lambda row: get_pred(row['question'], row['answer']), axis=1)
df_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['prob'] = df_test.apply(lambda row: get_pred(row['question'], row['answer']), axis=1)


Unnamed: 0_level_0,question,answer,prob
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,How much of an effort would it be to use AWS i...,Alexey\nProbably more than you want to put in....,0.972288
2,Can you please explain the Python Black setup ...,Alexey\nProbably more than you want to put in....,0.067522
4,Can you talk more about the final project? Wha...,Alexey\nProbably more than you want to put in....,0.157995
8,I read a book that said the data engineering l...,Alexey\nProbably more than you want to put in....,0.189315
9,If we consider professional certification for ...,Alexey\nProbably more than you want to put in....,0.199016
...,...,...,...
206,Data lakes are like folders. We save data befo...,Alexey\nProbably more than you want to put in....,0.103819
209,What is the relationship between Prefect agent...,Alexey\nProbably more than you want to put in....,0.075144
218,How many finishers do you expect at the end of...,Alexey\nProbably more than you want to put in....,0.140364
223,Can you share with us some real-work data clea...,Alexey\nProbably more than you want to put in....,0.086829


10 -> 2.6s

50 -> 13.2s

100 -> 27s

100,000 -> 27000s

In [22]:
len(df) * 27 / 100 / 3600

10.28835

In [41]:
df

Unnamed: 0_level_0,question,answer
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
0,How much of an effort would it be to use AWS i...,Alexey\nProbably more than you want to put in....
2,Can you please explain the Python Black setup ...,Alexey\nProbably more than you want to put in....
4,Can you talk more about the final project? Wha...,Alexey\nProbably more than you want to put in....
8,I read a book that said the data engineering l...,Alexey\nProbably more than you want to put in....
9,If we consider professional certification for ...,Alexey\nProbably more than you want to put in....
...,...,...
266245,How to get out of tutorial hell?,I need more information to actually help you h...
266246,Can you explain what exactly model.fit(x_train...,I need more information to actually help you h...
266247,What to do after courses? Where do you see us ...,I need more information to actually help you h...
266248,What would be the main skills/algorithms/tools...,I need more information to actually help you h...
