#### Setup

In [1]:
import pandas as pd
import csv
import math

#### Load-in data

In [2]:
all_annot_df = pd.read_csv('data/Annotation_Mastersheet.csv')[:1150]
all_annot_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1150 entries, 0 to 1149
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   question     1148 non-null   object
 1   answer       1150 non-null   object
 2   Ahsan        533 non-null    object
 3   Unnamed: 3   550 non-null    object
 4   Unnamed: 4   550 non-null    object
 5   Oksana       516 non-null    object
 6   Unnamed: 6   515 non-null    object
 7   Unnamed: 7   539 non-null    object
 8   Ruoyao       537 non-null    object
 9   Unnamed: 9   504 non-null    object
 10  Unnamed: 10  504 non-null    object
 11  Jeremy       535 non-null    object
 12  Unnamed: 12  535 non-null    object
 13  Unnamed: 13  534 non-null    object
dtypes: object(14)
memory usage: 125.9+ KB


#### Seperate annotations

In [3]:
questions = all_annot_df['question'].values
answers = all_annot_df['answer'].values
annotations = all_annot_df.loc[:, ~all_annot_df.columns.isin(['question', 'answer'])]

In [4]:
print('Question samples:')
questions[:5]

Question samples:


array(['What is the use of the yield keyword in Python? What does it do?',
       'Given the following code, what does the if __name__ == "__main__": do?',
       'Given the following code, what does the if __name__ == "__main__": do?',
       'In Python, what are metaclasses and what do we use them for?',
       'In Python, what are metaclasses and what do we use them for?'],
      dtype=object)

In [5]:
print('Answer samples:')
answers[:5]

Answer samples:


array(['To understand what yield does, you must understand what generators are. And before you can understand generators, you must understand iterables.',
       "It's boilerplate code that protects users from accidentally invoking the script when they didn't intend to. Here are some common problems when the guard is omitted from a script:",
       'if __name__ == "__main__" is the part that runs when the script is run from (say) the command line using a command like python myscript.py.',
       'Before understanding metaclasses, you need to master classes in Python. And Python has a very peculiar idea of what classes are, borrowed from the Smalltalk language.',
       'A metaclass is the class of a class. A class defines how an instance of the class (i.e. an object) behaves while a metaclass defines how a class behaves. A class is an instance of a metaclass.'],
      dtype=object)

In [6]:
print('All annotation samples:')
annotations.head()

All annotation samples:


Unnamed: 0,Ahsan,Unnamed: 3,Unnamed: 4,Oksana,Unnamed: 6,Unnamed: 7,Ruoyao,Unnamed: 9,Unnamed: 10,Jeremy,Unnamed: 12,Unnamed: 13
0,iterator,beginner,direct,iterator,beginner,direct,objects,,,,,
1,classes,beginner,direct,built-in,beginner,direct,variables,,,,,
2,classes,beginner,direct,built-in,beginner,direct,variables,,,,,
3,classes,beginner,direct,classes,beginner,direct,classes,,,,,
4,classes,beginner,direct,classes,beginner,direct,classes,,,,,


In [7]:
classifications = annotations.iloc[:,[i for i in range(12) if i % 3 == 0]].values
complexity = annotations.iloc[:,[i for i in range(12) if i % 3 == 1]].values
directness = annotations.iloc[:,[i for i in range(12) if i % 3 == 2]].values

In [8]:
agg_classifications = []
for annots in classifications:
    agg_classifications.append(', '.join(list(set([annot for annot in annots if type(annot) == str]))))
print('Classification annotation samples:')
agg_classifications[:5]

Classification annotation samples:


['objects, iterator',
 'built-in, classes, variables',
 'built-in, classes, variables',
 'classes',
 'classes']

In [9]:
agg_complexity = []
for annots in complexity:
    if 'expert' in annots:
        agg_complexity.append('expert')
        continue
    elif 'advanced' in annots:
        agg_complexity.append('advanced')
    elif 'intermediate' in annots:
        agg_complexity.append('intermediate')
    else:
        agg_complexity.append('beginner')
print('Complexity annotation samples:')
agg_complexity[:5]

Complexity annotation samples:


['beginner', 'beginner', 'beginner', 'beginner', 'beginner']

In [10]:
agg_directness = []
for annots in directness:
    if 'adding on' in annots:
        agg_directness.append('adding on')
    elif 'reference' in annots:
        agg_directness.append('reference')
    else:
        agg_directness.append('direct')
print('Directness annotation samples:')
agg_directness[:5]

Directness annotation samples:


['direct', 'direct', 'direct', 'direct', 'direct']

#### Aggregate annotations

In [11]:
aggregate_df = pd.DataFrame(columns=['Question', 'Answer', 'Classification', 'Complexity', 'Directness'])
aggregate_df['Question'] = questions
aggregate_df['Answer'] = answers
aggregate_df['Classification'] = agg_classifications
aggregate_df['Complexity'] = agg_complexity
aggregate_df['Directness'] = agg_directness

In [12]:
aggregate_df.head()

Unnamed: 0,Question,Answer,Classification,Complexity,Directness
0,What is the use of the yield keyword in Python...,"To understand what yield does, you must unders...","objects, iterator",beginner,direct
1,"Given the following code, what does the if __n...",It's boilerplate code that protects users from...,"built-in, classes, variables",beginner,direct
2,"Given the following code, what does the if __n...","if __name__ == ""__main__"" is the part that run...","built-in, classes, variables",beginner,direct
3,"In Python, what are metaclasses and what do we...","Before understanding metaclasses, you need to ...",classes,beginner,direct
4,"In Python, what are metaclasses and what do we...",A metaclass is the class of a class. A class d...,classes,beginner,direct


In [13]:
aggregate_df.to_csv('data/final_annotations.csv')