In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training questions and the training answers from the local data directory.
questions, answers = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_questions.csv', 'data/train_answers.csv')
)

In [3]:
# Inspect the loaded questions table (rendered as this cell's output).
questions

Unnamed: 0,question_id,question,course,year,candidate_answers,answer_id
0,79062,"For categorical target set, where the distribu...",Machine Learning Zoomcamp,2021,156400754877105368643810912439,156400
1,468946,Is there anything that we are not allowed to u...,Machine Learning Zoomcamp,2021,641330634887912439425941642829,634887
2,968800,I have been catching up and have been doing ho...,Data Engineering Zoomcamp,2022,9540161678567591936798838013,954016
3,688404,Could you please explain what code we should l...,Data Engineering Zoomcamp,2022,1986616298986865773699141765,3699
4,63921,Is it just me or does the model have really ba...,Machine Learning Zoomcamp,2021,754877604487912439858915425941,858915
...,...,...,...,...,...,...
392,241788,Can the model with the ROC AUC score of around...,Machine Learning Zoomcamp,2021,274012831391912439596854214199,831391
393,595103,When I click tab in the parentheses of the iPy...,Machine Learning Zoomcamp,2021,325935651754478055214199912439,651754
394,450348,Can you please explain the use cases of Splunk...,Data Engineering Zoomcamp,2022,432981908368296080131069733226,733226
395,864660,Why did you use model2bin in the last question...,Machine Learning Zoomcamp,2021,4042042229732377169051623076,422297


In [4]:
# Inspect the loaded answers table (rendered as this cell's output).
answers

Unnamed: 0,answer_id,answer,course,year,attachments_files
0,156400,Alexey\nShould we use something non-standard t...,Machine Learning Zoomcamp,2021,
1,634887,"No, I don't think there is anything you cannot...",Machine Learning Zoomcamp,2021,
2,954016,"Alexey\nYes, you will be. You can submit the p...",Data Engineering Zoomcamp,2022,
3,3699,Alexey\nI think the question refers to the hom...,Data Engineering Zoomcamp,2022,
4,858915,"Dmitry\nIt's fine, because this is the showcas...",Machine Learning Zoomcamp,2021,
...,...,...,...,...,...
392,831391,"Yes, it can. It's really dataset dependent. Fo...",Machine Learning Zoomcamp,2021,
393,651754,"Let's say I do “import numpy as np” and then, ...",Machine Learning Zoomcamp,2021,
394,733226,Alexey\nSplunk – I don’t know. It's not a data...,Data Engineering Zoomcamp,2022,
395,422297,"Yes, it was not mentioned. But what was mentio...",Machine Learning Zoomcamp,2021,


In [5]:
# Number of questions; kept under its original name in case later cells use it.
number_of_qa = len(questions)

# 0-based positional row ids for the questions: 0 .. len(questions) - 1.
range_ids = np.arange(number_of_qa)

# 0-based positional row ids for the answers, sized from `answers` itself.
# Sizing this axis from `questions` would only be correct when both tables have
# the same length; otherwise answer rows are either out of range (and later map
# to NaN) or silently left out of the grid.
answer_range_ids = np.arange(len(answers))

# Build every (question row, answer row) combination. With meshgrid's default
# 'xy' indexing the first input varies fastest after flattening, so the question
# ids cycle within each answer id.
col1, col2 = np.meshgrid(range_ids, answer_range_ids)

# Flatten both grids into one row per (question, answer) pair.
df = pd.DataFrame({'question_row_id': col1.flatten(), 'answer_row_id': col2.flatten()})

# Display the DataFrame
df

Unnamed: 0,question_row_id,answer_row_id
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
157604,392,396
157605,393,396
157606,394,396
157607,395,396


In [6]:
# Lookup series: index label (row position in the source table) -> identifier.
question_id_by_row = questions['question_id']
answer_id_by_row = answers['answer_id']

# Translate the positional row ids of each pair into the real identifiers.
df['question_id'] = df['question_row_id'].map(question_id_by_row)
df['answer_id'] = df['answer_row_id'].map(answer_id_by_row)

In [7]:
# Inspect the pair table after the question_id / answer_id columns were added.
df

Unnamed: 0,question_row_id,answer_row_id,question_id,answer_id
0,0,0,79062,156400
1,1,0,468946,156400
2,2,0,968800,156400
3,3,0,688404,156400
4,4,0,63921,156400
...,...,...,...,...
157604,392,396,241788,558889
157605,393,396,595103,558889
157606,394,396,450348,558889
157607,395,396,864660,558889


In [8]:
def get_label(question_id: int, answer_id: int) -> int:
    """Return 1 if ``answer_id`` is a matching answer for ``question_id``, else 0.

    An answer matches when it equals the question's ``answer_id`` value in the
    module-level ``questions`` table, or when it appears in that question's
    comma-separated ``candidate_answers`` value.

    Args:
        question_id: Value looked up in ``questions['question_id']``.
        answer_id: Answer identifier to test against that question.

    Returns:
        1 for a match, 0 otherwise.

    Raises:
        IndexError: If no row of ``questions`` has the given ``question_id``.
    """
    # Filter the questions table once and reuse the first matching row for both checks.
    question_row = questions.loc[questions['question_id'] == question_id].iloc[0]
    if answer_id == question_row['answer_id']:
        return 1

    candidates = question_row['candidate_answers']
    if pd.isna(candidates):
        # No candidate list recorded for this question (e.g. an empty CSV field).
        return 0

    # Skip empty tokens so a trailing or doubled comma does not break int().
    candidate_ids = {int(token) for token in str(candidates).split(",") if token.strip()}
    return 1 if answer_id in candidate_ids else 0

In [9]:
# Inspect the question_id column of the pair table.
df['question_id']

0          79062
1         468946
2         968800
3         688404
4          63921
           ...  
157604    241788
157605    595103
157606    450348
157607    864660
157608    205640
Name: question_id, Length: 157609, dtype: int64

In [10]:
# Label every (question, answer) pair with get_label. Iterating the two id
# columns directly avoids the per-row Series that `df.apply(..., axis=1)`
# constructs, while passing the same id values in the same row order.
df['label'] = [
    get_label(question_id, answer_id)
    for question_id, answer_id in zip(df['question_id'], df['answer_id'])
]

In [11]:
# Show only the pairs that were labelled 1.
df.loc[df['label'].eq(1)]

Unnamed: 0,question_row_id,answer_row_id,question_id,answer_id,label
0,0,0,79062,156400,1
398,1,1,468946,634887,1
796,2,2,968800,954016,1
1194,3,3,688404,3699,1
1592,4,4,63921,858915,1
...,...,...,...,...,...
156812,394,394,450348,733226,1
156948,133,395,482291,422297,1
157145,330,395,63486,422297,1
157210,395,395,864660,422297,1


In [12]:
# Count how many pairs carry each label value.
label_counts = df['label'].value_counts()
label_counts

label
0    155621
1      1988
Name: count, dtype: int64

In [13]:
# Percentage of pairs labelled 1, computed from the data instead of from counts
# copied by hand out of the previous cell's output (those go stale on new data).
float(df['label'].eq(1).mean() * 100)

1.26134928842896

In [14]:
seed = 22

# Split the labelled pairs by class; 'label' is the 0/1 column assigned earlier.
positive_samples = df[df['label'] == 1]
negative_pool = df[df['label'] == 0]

# Keep 1.5% of the negatives, truncated to a whole number of rows.
num_negative_samples = int(0.015 * len(negative_pool))

# Draw that many negatives at random, seeded so the draw is repeatable.
negative_samples = negative_pool.sample(n=num_negative_samples, random_state=seed)

# Stack all positives with the sampled negatives, then shuffle the rows.
training_set = (
    pd.concat([positive_samples, negative_samples])
    .sample(frac=1, random_state=seed)
)

training_set

Unnamed: 0,question_row_id,answer_row_id,question_id,answer_id,label
35604,271,89,841549,971755,0
90328,209,227,314363,797668,0
42864,385,107,593121,843322,1
90651,135,228,485693,457354,0
105279,74,265,651883,75919,1
...,...,...,...,...,...
94719,233,238,830729,432981,1
155411,184,391,864931,985972,0
140526,385,353,593121,426370,0
34515,373,86,567770,296080,1


In [15]:
# Count how many training rows carry each label value.
training_label_counts = training_set['label'].value_counts()
training_label_counts

label
0    2334
1    1988
Name: count, dtype: int64