# Data preparation

# Extract variables from raw experiment data

In [23]:
import os
import sqlite3
import textwrap

import pandas as pd
from limepy.wrangle import Survey

from util import getExperimentSequence, getTaskCorrectness, getTaskDurationSeconds, getTaskFromLocation

# Open the anonymised SQLite database and load the complete event log.
cnx = sqlite3.connect('data/data_anon.db')

df = pd.read_sql_query("SELECT * FROM events", cnx)

# Tag every event with a task value computed from its location.
df['task'] = df['location'].map(getTaskFromLocation)

# Column order of the per-experiment results table.
resultColumns = [
    'student_id_anon',
    'seq',
    't1_time',
    't1_correctness',
    't2_time',
    't2_correctness',
    'jv_time',
    'py_time',
    'jv_correctness',
    'py_correctness',
]

# Collect one row per experiment and build the frame once at the end:
# calling pd.concat inside the loop re-copies the whole table on every pass.
resultRows = []
for experimentId in df['experiment'].unique():
    seq = getExperimentSequence(experimentId, df)

    t1time = getTaskDurationSeconds(experimentId, 1, df)
    t2time = getTaskDurationSeconds(experimentId, 2, df)

    t1correctness = getTaskCorrectness(experimentId, 1, df)
    t2correctness = getTaskCorrectness(experimentId, 2, df)

    # For sequence 'AB' the jv_* columns take the task 1 values and the py_*
    # columns the task 2 values; for any other sequence the two are swapped.
    jvFirst = seq == 'AB'

    resultRows.append({
        'student_id_anon': experimentId,
        'seq': seq,
        't1_time': t1time,
        't1_correctness': t1correctness,
        't2_time': t2time,
        't2_correctness': t2correctness,
        'jv_time': t1time if jvFirst else t2time,
        'py_time': t2time if jvFirst else t1time,
        'jv_correctness': t1correctness if jvFirst else t2correctness,
        'py_correctness': t2correctness if jvFirst else t1correctness,
    })

# Passing `columns` keeps the expected schema even when there are no experiments.
results = pd.DataFrame(resultRows, columns=resultColumns)


# Merge results and entry survey data

In [24]:
# Load the entry survey, reading the anonymised student id as a string.
entrySurvey = pd.read_csv('./data/participants.csv', dtype={'student_id_anon': str})

# Index both frames by the anonymised student id; `results` additionally
# keeps the id as a regular column.
entrySurvey = entrySurvey.set_index('student_id_anon', drop=True)
results = results.set_index('student_id_anon', drop=False)

# Left join on the index: every results row is kept and gains the survey columns.
df = pd.merge(results, entrySurvey, how='left', left_index=True, right_index=True)

df.to_csv('./data/generated/results_anon.csv', index=False)

df

Unnamed: 0_level_0,student_id_anon,seq,t1_time,t1_correctness,t2_time,t2_correctness,jv_time,py_time,jv_correctness,py_correctness,ex_prog,ex_class,ex_PY,ex_JV,ex_DE
student_id_anon,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
159,159,BA,532.274,1.0,326.722,0.909091,326.722,532.274,0.909091,1.0,7,3,3,1,3
133,133,BA,402.512,0.916667,147.372,1.0,147.372,402.512,1.0,0.916667,7,3,4,1,3
124,124,AB,351.3654,1.0,299.2236,0.909091,351.3654,299.2236,1.0,0.909091,3,2,2,1,1
83,83,BA,359.546,1.0,259.602,1.0,259.602,359.546,1.0,1.0,7,4,3,1,4
13,13,BA,152.411,0.916667,195.244,1.0,195.244,152.411,1.0,0.916667,7,3,4,1,3
55,55,AB,559.642,0.916667,254.156,0.909091,559.642,254.156,0.916667,0.909091,8,4,4,2,4
163,163,AB,405.224,1.0,207.346,1.0,405.224,207.346,1.0,1.0,6,3,4,1,3
82,82,AB,504.253,0.916667,355.263,0.909091,504.253,355.263,0.916667,0.909091,4,3,3,1,2
43,43,BA,494.025,0.916667,524.672,1.0,524.672,494.025,1.0,0.916667,7,3,4,1,4
155,155,AB,502.051,0.833333,699.271,1.0,502.051,699.271,0.833333,1.0,6,3,3,1,2


## Convert survey data to readable dataframes

In [25]:
# Read the LimeSurvey structure export (.lss) as raw text; it is handed to
# limepy's Survey together with the answers below.
with open('./data/limesurvey_survey_structure.lss') as surveyStructure:
    structure = surveyStructure.read()

# pandas opens the answers file itself, so no separate file handle is needed.
df = pd.read_csv('./data/limesurvey_survey_answers.csv', sep=',')

survey = Survey(df, structure, language='en')

# Replace the full question texts with short column names.
readableSurvey = survey.readable_df.rename(columns={
    'How difficult was it to understand the data pipeline written in <strong>Jayvee</strong>?': 'difficultyJV',
    'How difficult was it to understand the data pipeline written in <strong>Python</strong>?': 'difficultyPY',
    'What makes data pipelines written in <strong>Jayvee</strong> difficult/easy to understand?': 'reasonsJV',
    'What makes data pipelines written in <strong>Python</strong> difficult/easy to understand?': 'reasonsPY',
    'What are the <strong>differences</strong> <strong>between Jayvee and Python</strong> that influence\xa0how easy / hard it is to understand data pipelines?': 'differences',
})

# Keep only the renamed columns, in a fixed order.
readableSurvey = readableSurvey[[
    'difficultyJV',
    'difficultyPY',
    'reasonsJV',
    'reasonsPY',
    'differences',
]]

readableSurvey.to_csv('./data/generated/survey.csv', index=False)

readableSurvey

Unnamed: 0,difficultyJV,difficultyPY,reasonsJV,reasonsPY,differences
0,Easy,Medium,easy:\n- apart from the block definitions you ...,easy:\n- the syntax of Python is very clean fo...,- the functional approach of Jayvee may be new...
1,Easy,Medium,According to me line by line approach make jay...,"In python, we have to understand the code prop...",in jayvee we have different block for the pipe...
2,Medium,Easy,,,
3,Easy,Very easy,new language.\nmust write it many times to get...,just about practicing.,I think Jayvee has more scripts and blocks.\n...
4,Easy,Medium,"I think validating a columns values, would be ...",,"Compared to Jayvee, many variables and reassig..."
5,Medium,Medium,I find the descriptions in the documentation t...,Just the amount of online resources available ...,The documentation and support from the python ...
6,Medium,Medium,,,
7,Easy,Medium,the difference in syntax was so interesting to...,python is the language i have written and work...,the differencies in syntax sometimes make it h...
8,Easy,Medium,The best part in jayvee is block type coding i...,python is little difficult to read and its dif...,"jayvee is kind a block type language, its spec..."
9,Easy,Easy,You can see the overview of pipeline at the be...,Syntax in python is really easy to understand,python documentation is more understandable an...


# Export free text responses

In [26]:
# Reload the exported survey and replace missing answers (NaN) with empty strings.
survey = pd.read_csv('./data/generated/survey.csv').fillna('')

# Make sure the output folder for the per-response text files exists.
os.makedirs('./data/generated/survey_responses', exist_ok=True)

def wrap_text_with_max_width(text, width=70):
    """Wrap each line of *text* separately to at most *width* columns.

    Existing line breaks are kept: every input line is wrapped on its own,
    and continuation lines produced by the wrapping are indented by four
    spaces. Each input line contributes a trailing newline to the result,
    so an empty *text* yields an empty string.

    Args:
        text: The text to wrap; may contain several lines.
        width: Maximum line length, passed on to ``textwrap.fill``. The
            default of 70 is ``textwrap``'s own default.

    Returns:
        The wrapped text, with every wrapped input line terminated by a
        newline.
    """
    return ''.join(
        textwrap.fill(line, width=width, subsequent_indent='    ') + '\n'
        for line in text.splitlines()
    )

# Write one text file per survey response that contains any free-text answer.
# The loop variable is not called `id`, so the builtin of that name stays usable.
for responseId, row in survey.iterrows():
    # Skip responses where all three free-text answers are empty.
    if not (row['reasonsJV'] or row['reasonsPY'] or row['differences']):
        print(f'No content in response {responseId}, skipping.')
        continue

    # Each section: the question, the difficulty rating(s), then the wrapped answer.
    content = f'''_____ What makes data pipelines written in Jayvee difficult/easy to understand? _____
_____ JV ({row['difficultyJV']}) _____

{wrap_text_with_max_width(row['reasonsJV'])}

_____ What makes data pipelines written in Python difficult/easy to understand? _____
_____ PY ({row['difficultyPY']}) _____

{wrap_text_with_max_width(row['reasonsPY'])}

_____ What are the differences between Jayvee and Python that influence how easy / hard it is to understand data pipelines? _____
_____ (JV {row['difficultyJV']}, PY {row['difficultyPY']}) _____

{wrap_text_with_max_width(row['differences'])}
'''

    # An explicit encoding keeps non-ASCII characters in the answers from
    # depending on the platform default. The content is a single string, so it
    # is written with write() rather than writelines().
    with open(f'./data/generated/survey_responses/{responseId}.txt', mode='w', encoding='utf-8') as f:
        f.write(content)

No content in response 2, skipping.
No content in response 6, skipping.
No content in response 47, skipping.
