# Data Preparation

## Convert survey data to readable dataframes

In [4]:
# Folder of the semester whose survey exports are processed by the cells below.
# The combine step later in this notebook reads generated files from both
# ./data/ss23 and ./data/ws2324, so swap the two assignments and re-run the
# preparation cells to produce the output for the other semester.
#dataRootFolder = './data/ss23'

dataRootFolder = './data/ws2324'

In [5]:
import pandas as pd
import os
from limepy.wrangle import Survey

def getSurveyDataframeForExercise(number: int):
    """Load the usable answers of one exercise survey as a readable dataframe.

    Reads ``exercise{number}.lss`` (survey structure) and
    ``answers-ex{number}.csv`` (responses) from ``dataRootFolder``, passes both
    to limepy's ``Survey`` and takes its ``readable_df``. Every row is tagged
    with the exercise number in a ``survey`` column, and only rows whose
    research opt-in question was answered with ``'Yes'`` are kept.

    Args:
        number: Exercise number used to build both file names.

    Returns:
        The filtered dataframe, or an empty ``pd.DataFrame`` when either of
        the two input files does not exist.
    """
    structurePath = f'{dataRootFolder}/exercise{number}.lss'
    answersPath = f'{dataRootFolder}/answers-ex{number}.csv'

    # Not every exercise has a survey export; signal that with an empty frame
    # so the caller can concatenate the results unconditionally.
    if not (os.path.exists(structurePath) and os.path.exists(answersPath)):
        return pd.DataFrame()

    with open(structurePath) as surveyStructure:
        structure = surveyStructure.read()

    # pandas opens and closes the CSV itself when given a path, so no extra
    # file handle is needed here.
    dfEx = pd.read_csv(answersPath, sep=',')

    survey = Survey(dfEx, structure, language='en')
    survey.readable_df['survey'] = number

    optInQuestion = 'Can we use your anonymous response for research, including publications, and future development of Jayvee?'
    surveyUsableAnswers = survey.readable_df[survey.readable_df[optInQuestion] == 'Yes']

    return surveyUsableAnswers


# Short column names replacing the full question texts of the survey.
shortColumnNames = {
    'What programming language did you use to solve the exercise?': 'language',
    'Can we use your anonymous response for research, including publications, and future development of Jayvee?': 'optin',
    'How many hours did you spend to solve the exercise?': 'time',
    'How difficult was it to solve the exercise using your programming language?': 'difficulty',
    'How would you rate the quality of the resulting data pipeline?': 'quality',
    'What problems with the programming language did you encounter during this exercise?': 'problems',
    'What language features or libraries would have made solving the exercise easier?': 'features',
}

# Load the surveys of exercises 1 to 5 and stack them into one dataframe.
perExerciseFrames = list(map(getSurveyDataframeForExercise, range(1, 6)))

allSurveys = pd.concat(perExerciseFrames).rename(columns=shortColumnNames)

# Export free text fields to text files

In [6]:
def exportQuestion(df, question: str, questionCode: str, exerciseNumber: int):
    """Write the free-text answers to one question into a text file.

    The file starts with the question as a heading, followed by a
    ``## Jayvee`` and a ``## Python`` section. Each section lists the answers
    of rows with that ``language`` value, every answer followed by a
    separator line. Rows where the language or the answer is missing are
    skipped.

    The file is written to ``{dataRootFolder}/generated/freetext/``. Its name
    is the first 20 characters of ``question`` (non-alphanumeric characters
    replaced by ``_``) followed by ``exerciseNumber`` and ``.txt``.

    Args:
        df: Dataframe with a ``language`` column and a ``questionCode`` column.
        question: Full question text, used as heading and for the file name.
        questionCode: Name of the column holding the answers.
        exerciseNumber: Exercise number appended to the file name.
    """
    import os  # local import so the function does not rely on another cell

    dfToExport = df[['language', questionCode]].dropna().sort_values('language')

    parts = [f'# {question}\n\n']
    for language in ('Jayvee', 'Python'):
        parts.append(f'## {language}\n\n')
        answers = dfToExport[dfToExport['language'] == language][questionCode]
        parts.extend(answer + '\n-----------\n' for answer in answers)

    fileStem = ''.join(x if x.isalnum() else '_' for x in question[:20])
    outputFolder = f'{dataRootFolder}/generated/freetext'

    # The output folder is not guaranteed to exist on a fresh checkout.
    os.makedirs(outputFolder, exist_ok=True)

    # Explicit UTF-8 so free-text answers with non-ASCII characters are
    # written the same way regardless of the platform's default encoding.
    with open(f'{outputFolder}/{fileStem}{exerciseNumber}.txt', 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

# Free-text questions to export: (full question text, column name).
freeTextQuestions = (
    ('What problems with the programming language did you encounter during this exercise?', 'problems'),
    ('What language features or libraries would have made solving the exercise easier?', 'features'),
)

# One file per question and exercise survey (surveys 1 to 5).
for surveyId in range(1, 6):
    answersOfSurvey = allSurveys[allSurveys['survey'] == surveyId]
    for questionText, answerColumn in freeTextQuestions:
        exportQuestion(answersOfSurvey, questionText, answerColumn, surveyId)

# Drop unneeded data

In [7]:
# Columns left out of the exported CSV, including the opt-in answer and the
# free-text answers ('problems', 'features').
unneededColumns = ['id', 'lastpage', 'submitdate', 'startlanguage', 'seed', 'optin', 'problems', 'features']
data = allSurveys.drop(columns=unneededColumns)

data.to_csv(f'{dataRootFolder}/generated/survey-responses.csv', index=False)

# Combine all semesters

In [8]:
import pandas as pd

dataRootFolder = './data'

# Entry/exit survey CSV of each semester, keyed by semester name.
allEntryExitSurveys = {
    'ss23': f'{dataRootFolder}/ss23/generated/entry-exit-anon.csv',
    'ws2324': f'{dataRootFolder}/ws2324/generated/entry-exit-anon.csv'
}

# Read every semester's CSV and record which semester each row came from.
allEntryExitSurveysDfs = []
for semester, csvPath in allEntryExitSurveys.items():
    df = pd.read_csv(csvPath)
    df['semester'] = semester
    allEntryExitSurveysDfs.append(df)

# Stack the semesters with a fresh 0..n-1 index, then replace missing values.
# NOTE(review): fillna(False) applies to every column, not only boolean
# ones - confirm that this is intended.
stackedSemesters = pd.concat(allEntryExitSurveysDfs, ignore_index=True, verify_integrity=True)
dfCombined = stackedSemesters.fillna(False)

dfCombined.to_csv(f'{dataRootFolder}/generated/entry-exit-anon.csv', index=False)

In [9]:
import pandas as pd

dataRootFolder = './data'

# Exercise survey responses CSV of each semester, keyed by semester name.
allSurveyResponses = {
    'ss23': f'{dataRootFolder}/ss23/generated/survey-responses.csv',
    'ws2324': f'{dataRootFolder}/ws2324/generated/survey-responses.csv'
}

# Read every semester's CSV and record which semester each row came from.
allSurveyResponsesDfs = []
for semester, csvPath in allSurveyResponses.items():
    df = pd.read_csv(csvPath)
    df['semester'] = semester
    allSurveyResponsesDfs.append(df)

# Stack the semesters with a fresh 0..n-1 index, then replace missing values.
# NOTE(review): fillna(False) applies to every column, not only boolean
# ones - confirm that this is intended.
stackedSemesters = pd.concat(allSurveyResponsesDfs, ignore_index=True, verify_integrity=True)
dfCombined = stackedSemesters.fillna(False)

dfCombined.to_csv(f'{dataRootFolder}/generated/survey-responses.csv', index=False)

In [10]:
import shutil
import os

dataRootFolder = './data'

# Free-text export folder of each semester, keyed by semester name.
allFreetextResponses = {
    'ss23': f'{dataRootFolder}/ss23/generated/freetext',
    'ws2324': f'{dataRootFolder}/ws2324/generated/freetext'
}

# Folder that collects the free-text files of all semesters.
combinedFolder = f'{dataRootFolder}/generated/freetext'

for semester in allFreetextResponses:
    folder = allFreetextResponses[semester]
    shutil.copytree(folder, combinedFolder, dirs_exist_ok=True)

    # Entries that do not carry the "semester_" prefix yet are treated as
    # belonging to the semester that was just copied.
    filenames = [filename for filename in os.listdir(combinedFolder) if not filename.startswith("semester_")]

    for filename in filenames:
        # os.replace overwrites an existing target file on every platform;
        # os.rename raises FileExistsError on Windows when this cell is
        # re-run and the prefixed file is already there.
        os.replace(f'{combinedFolder}/{filename}', f'{combinedFolder}/semester_{semester}_{filename}')