In [18]:
import re
import os
import json
import pandas as pd

# Pattern matching a single HTML tag: '<', one or more non-'>' characters, '>'
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return `text` with every substring matching TAG_RE removed."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped


# Do not truncate rows when a DataFrame is displayed
pd.set_option('display.max_rows', None)

In [19]:
def generate_data(data):
    """Collect question/response data for the exercises of the first lab.

    Only ``data['labs'][0]['exercises']`` is read. Exercises whose type is
    'WLC', 'STI', 'DWS' or 'RSLT', or that have no questions, are skipped.

    Parameters
    ----------
    data : dict
        Parsed lesson JSON. Must provide ``data['labs'][0]['exercises']``;
        each exercise is read through its 'type', 'questions', 'id' and
        '_data' keys.

    Returns
    -------
    list of dict
        One record per retained exercise with the keys 'exercise_id',
        'questions', 'correct_responses', 'incorrect_responses', 'type' and
        'is_quiz'. Every key except 'exercise_id' maps to a list holding one
        entry per question, so the lists of a record are equally long.

    Notes
    -----
    The correct response is reset for every question. A question without a
    response flagged ``correct == 1`` yields an empty string instead of
    silently reusing the previous question's answer (or raising
    ``UnboundLocalError`` when it is the first question processed).
    """
    # Exercise types that are left out of the result.
    excluded_types = ('WLC', 'STI', 'DWS', 'RSLT')

    id_with_exercises = []

    for exercise in data['labs'][0]['exercises']:
        exercise_type = exercise['type']
        if exercise_type in excluded_types or len(exercise['questions']) == 0:
            continue

        exercise_id = exercise['id']

        # An exercise counts as a quiz when the word "quiz" appears anywhere
        # in the string form of its '_data' (case-insensitive).
        quiz_flag = 'quiz' in str(exercise['_data']).lower()

        exercise_question_list = []
        exercise_correct_response_list = []
        exercise_incorrect_response_list = []
        exercise_type_list = []
        exercise_is_quiz = []

        for question in exercise['questions']:
            contents = question['_data']
            responses = question['responses']

            # Concatenate the tag-stripped text of all question parts.
            question_text = ''
            for content in contents:
                if 'contents' not in content:
                    continue
                if content['type'] == 'question':
                    question_text += remove_tags(content['contents'][0]['content'])
                elif content['type'] == 'post_question':
                    # NOTE(review): here 'contents' is indexed as a mapping,
                    # while for 'question' it is indexed as a list - confirm
                    # against the lesson JSON schema.
                    question_text += remove_tags(content['contents']['content'])

            # Reset per question so nothing leaks in from the previous one.
            correct_response = ''
            incorrect_responses = ''
            for response in responses:
                answer = response['_data'][0]['content']
                if response['correct'] == 1:
                    # If several responses are flagged correct, the last wins.
                    correct_response = answer
                else:
                    # Comma-terminated concatenation (trailing comma kept).
                    incorrect_responses += answer + ","

            exercise_question_list.append(question_text)
            exercise_correct_response_list.append(correct_response)
            exercise_incorrect_response_list.append(incorrect_responses)
            exercise_type_list.append(exercise_type)
            exercise_is_quiz.append(quiz_flag)

        # JSON structure with the required keys and values
        id_with_exercises.append({
            'exercise_id': exercise_id,
            'questions': exercise_question_list,
            'correct_responses': exercise_correct_response_list,
            'incorrect_responses': exercise_incorrect_response_list,
            'type': exercise_type_list,
            'is_quiz': exercise_is_quiz
        })

    return id_with_exercises

In [20]:
def make_dataframe(id_with_exercises):
    """Build a DataFrame with one row per exercise record.

    Parameters
    ----------
    id_with_exercises : list of dict
        Records as returned by ``generate_data``; each dict key becomes a
        column.

    Returns
    -------
    pandas.DataFrame
        Frame created with ``pd.DataFrame.from_records``. Relies on the
        notebook-level ``import pandas as pd`` rather than re-importing
        pandas on every call.
    """
    return pd.DataFrame.from_records(id_with_exercises)

In [21]:
# Walk the lesson JSON directory and build one DataFrame per file found

directory = 'lesson_jsons'
data_frames = []

for root, dirs, files in os.walk(directory):
    for filename in files:
        json_path = os.path.join(root, filename)
        # Parse the file, then process once the handle has been closed
        with open(json_path, encoding="utf-8") as json_file:
            data = json.load(json_file)
        data_frames.append(make_dataframe(generate_data(data)))

In [22]:
# Stack the per-file frames. ignore_index=True gives `result` a fresh
# 0..n-1 index; the former bare `result.reset_index(drop=True)` returned a
# new frame that was discarded, so `result` kept its duplicated indices.
result = pd.concat(data_frames, ignore_index=True)

# Expand the per-question lists of every column so that each question gets
# its own row, keeping exercise_id alongside as a regular column.
result_exploded = (
    result
    .set_index(['exercise_id'])
    .apply(pd.Series.explode)
    .reset_index()
)
result_exploded

Unnamed: 0,exercise_id,questions,correct_responses,incorrect_responses,type,is_quiz
0,2,What's your name?,My name is Vanessa.,",",MT1,False
1,2,Try my coffee.\t,It's delicious.,",",MT1,False
2,2,Mmm ... you're right!,Try mine!,",",MT1,False
3,2,Take a seat.,It's yours now.,",",MT1,False
4,3,engenn2le03ob07re3aud01some work.,doing,",",SFL,False
5,3,engenn2le03ob07re3aud02.,studying,",",SFL,False
6,3,engenn2le03ob07re3aud03you drinking a cappuccino?,Are,",",SFL,False
7,3,engenn2le03ob07re3aud04drinking a cappuccino. ...,not,",do ,am,drink,study,",SFL,False
8,4,"Use these words: I'm, He, is, What.are you doing?",What,",",TYP,False
9,4,2. Henry: reading a book.,I'm,",",TYP,False


In [23]:
# import session_info
# session_info.show()