# The Impact of Generative AI on Children's Creative Learning Transfer

Analysis conducted by Sachin Allums

### Loading Qualtrics Data Frame

In [68]:
import pandas as pd
import numpy as np

# may need to run pip install pandas first

In [69]:
data = pd.read_csv('honors_data_test.csv')
print(data.shape)
print(data.columns)

(73, 286)
Index(['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
       'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
       'RecipientLastName',
       ...
       'Item3', 'Item4', 'Item5', 'AlternateUsesTask5', 'AssignedCondition',
       'alreadyIntroducedToAI', 'StratifiedGroup', 'AlternateUsesTask2',
       'AlternateUsesTask3', 'AlternateUsesTask4'],
      dtype='object', length=286)


In [70]:
def recode_likert_data(data):
    """
    Transforms all likert questions into numeric scales
    """
    mapping = {
        "Strongly agree": 5,
        "Somewhat agree": 4,
        "Neither agree nor disagree": 3,
        "Somewhat disagree": 2,
        "Strongly disagree": 1
    }
    
    return data.replace(mapping)

In [71]:
data = recode_likert_data(data)

In [72]:
def get_creative_self_efficacy(data):
    pre_efficacy_ids = ["Q24_2", "Q24_3", "Q24_4", "Q25_1", "Q25_3", "Q25_4"]
    post_efficacy_ids = ["Q42_3", "Q42_4", "Q42_5", "Q43_1", "Q43_3", "Q43_4"]

    # Convert columns to numeric (coerce non-numeric values to NaN)
    data[pre_efficacy_ids + post_efficacy_ids] = data[pre_efficacy_ids + post_efficacy_ids].apply(pd.to_numeric, errors="coerce")

    # Count valid (non-NaN) responses for each participant
    pre_valid_counts = data[pre_efficacy_ids].notna().sum(axis=1)
    post_valid_counts = data[post_efficacy_ids].notna().sum(axis=1)

    # Sum scores while ignoring NaN values
    pre_score = data[pre_efficacy_ids].sum(axis=1, skipna=True)
    post_score = data[post_efficacy_ids].sum(axis=1, skipna=True)

    # Normalize by the available number of responses
    pre_score_normalized = (pre_score - pre_valid_counts) / (4 * pre_valid_counts)
    post_score_normalized = (post_score - post_valid_counts) / (4 * post_valid_counts)

    # Compute difference
    data["pre_creative_self_efficacy"] = pre_score_normalized
    data["post_creative_self_efficacy"] = post_score_normalized
    data["difference_creative_self_efficacy"] = post_score_normalized - pre_score_normalized
    return data

In [73]:
def get_creative_personal_identity(data):
    pre_cpi_ids = ["Q234", "Q24_1", "Q25_2", "Q25_5", "Q26_1"]
    post_cpi_ids = ["Q42_1", "Q42_2", "Q43_2", "Q43_5", "Q44_1"]

    # Convert columns to numeric (coerce non-numeric values to NaN)
    data[pre_cpi_ids + post_cpi_ids] = data[pre_cpi_ids + post_cpi_ids].apply(pd.to_numeric, errors="coerce")

    # Count valid (non-NaN) responses for each participant
    pre_valid_counts = data[pre_cpi_ids].notna().sum(axis=1)
    post_valid_counts = data[post_cpi_ids].notna().sum(axis=1)

    # Sum scores while ignoring NaN values
    pre_score = data[pre_cpi_ids].sum(axis=1, skipna=True)
    post_score = data[post_cpi_ids].sum(axis=1, skipna=True)

    # Normalize by the available number of responses
    pre_score_normalized = (pre_score - pre_valid_counts) / (4 * pre_valid_counts)
    post_score_normalized = (post_score - post_valid_counts) / (4 * post_valid_counts)

    # Compute difference
    data["pre_creative_personal_identity"] = pre_score_normalized
    data["post_creative_personal_identity"] = post_score_normalized
    data["difference_creative_personal_identity"] = post_score_normalized - pre_score_normalized
    return data

In [74]:
data = get_creative_self_efficacy(data)
data = get_creative_personal_identity(data)
print(data.shape)
print(data.columns)
print(data["pre_creative_self_efficacy"])
print(data["post_creative_self_efficacy"])
print(data["difference_creative_self_efficacy"])

(73, 292)
Index(['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
       'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
       'RecipientLastName',
       ...
       'StratifiedGroup', 'AlternateUsesTask2', 'AlternateUsesTask3',
       'AlternateUsesTask4', 'pre_creative_self_efficacy',
       'post_creative_self_efficacy', 'difference_creative_self_efficacy',
       'pre_creative_personal_identity', 'post_creative_personal_identity',
       'difference_creative_personal_identity'],
      dtype='object', length=292)
0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
        ...   
68    0.666667
69    0.750000
70    0.916667
71    0.708333
72         NaN
Name: pre_creative_self_efficacy, Length: 73, dtype: float64
0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
        ...   
68    0.750000
69    0.833333
70    0.958333
71    0.750000
72         NaN
Name: post_creative_self_efficacy, Length: 73, dtype: flo

In [75]:
def prepend_first_AUT_to_embedded_data(df):
    """
    Prepends the Q23 string to AlternateUsesTask1 with a semicolon,
    updating the AlternateUsesTask1 column in the given DataFrame.
    
    Parameters:
        df (pd.DataFrame): DataFrame containing 'Q23' and 'AlternateUsesTask1' columns.
    
    Returns:
        pd.DataFrame: Modified DataFrame with updated 'AlternateUsesTask1' column.
    """
    df['AlternateUsesTask1'] = df['Q23'].str.strip() + '; ' + df['AlternateUsesTask1'].str.strip()    
    df['AlternateUsesTask5'] = df['Q72'].str.strip() + '; ' + df['AlternateUsesTask5'].str.strip()
    
    return df

In [76]:
data = prepend_first_AUT_to_embedded_data(data)

In [77]:
def prepend_alternate_uses(row):
    if row['AssignedCondition'] in ['Assisted-First', 'Fully Assisted']:
        row['AlternateUsesTask2'] = str(row['Q80']).strip() + '; ' + str(row['AlternateUsesTask2']).strip()
        row['AlternateUsesTask3'] = str(row['Q202']).strip() + '; ' + str(row['AlternateUsesTask3']).strip()
        row['AlternateUsesTask4'] = str(row['Q210']).strip() + '; ' + str(row['AlternateUsesTask4']).strip()
    else:
        row['AlternateUsesTask2'] = str(row['Q190']).strip() + '; ' + str(row['AlternateUsesTask2']).strip()
        row['AlternateUsesTask3'] = str(row['Q221']).strip() + '; ' + str(row['AlternateUsesTask3']).strip()
        row['AlternateUsesTask4'] = str(row['Q229']).strip() + '; ' + str(row['AlternateUsesTask4']).strip()
    return row

In [78]:
data = data.apply(prepend_alternate_uses, axis=1)

In [79]:
print(data['AlternateUsesTask2'])
print(data['AlternateUsesTask5'])

0     Come up with as many alternate ways as possibl...
1     {"ImportId":"QID190_TEXT"}; {"ImportId":"Alter...
2                                              nan; nan
3                                              nan; nan
4                                              nan; nan
                            ...                        
68    flip it as a game; empty it as use it to grown...
69                                        shirt; fabric
70    swing; chair; planter; decoration; trampoline;...
71    You can attach it to your car; you can make an...
72                                             nan; nan
Name: AlternateUsesTask2, Length: 73, dtype: object
0     Come up with as many alternate ways as possibl...
1     {"ImportId":"QID72_TEXT"}; {"ImportId":"Altern...
2                                                   NaN
3                                                   NaN
4                                                   NaN
                            ...                     

In [80]:
def assign_dat_columns(row):
    """
    Assigns DAT1 through DAT10 columns based on the AssignedCondition.
    
    Parameters:
        row (pd.Series): A row of the DataFrame.
        
    Returns:
        pd.Series: The row with new DAT columns assigned.
    """
    if row['AssignedCondition'] in ['Control', 'Assisted-First']:
        for i in range(1, 11):
            row[f'DAT{i}'] = row.get(f'Q37_{i}', None)
    elif row['AssignedCondition'] in ['Fully Assisted', 'Assisted-Second']:
        for i in range(1, 11):
            row[f'DAT{i}'] = row.get(f'Q193_{i}', None)
    return row

In [81]:
data = data.apply(assign_dat_columns, axis=1)

In [82]:
print(data['DAT1'])

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
       ...  
68    flower
69       dog
70       cat
71      Oboe
72       NaN
Name: DAT1, Length: 73, dtype: object


In [83]:
def score_digit_span(row):
    """
    Scores the digit span task based on the highest sequence correctly recalled.

    Parameters:
        row (pd.Series): A row of the DataFrame containing participant responses.

    Returns:
        int: The highest digit span score achieved by the participant.
    """
    # Define the correct sequences with their corresponding scores
    correct_sequences = {
        'Q141': '1376',
        'Q143': '95408',
        'Q145': '597832',
        'Q153': '4012683',
        'Q163': '83976574'
    }
    
    # Initialize the score
    score = 0
    
    # Iterate over the correct sequences
    for question, correct_answer in correct_sequences.items():
        # Check if the participant's response matches the correct answer
        if str(row.get(question, '')).strip() == correct_answer:
            # Update the score to the length of the correct sequence
            score = len(correct_answer)
    
    return score

In [84]:
digit_span_scores = data.apply(score_digit_span, axis=1)
data = pd.concat([data, digit_span_scores.rename('DigitSpanScore')], axis=1)

In [85]:
print(data['DigitSpanScore'])

0     0
1     0
2     0
3     0
4     0
     ..
68    7
69    6
70    7
71    8
72    0
Name: DigitSpanScore, Length: 73, dtype: int64


In [86]:
def calculate_fluency_scores(df):
    """
    Calculates fluency scores for AlternateUsesTask columns by counting semicolons.

    Parameters:
        df (pd.DataFrame): DataFrame containing AlternateUsesTask1 to AlternateUsesTask5 columns.

    Returns:
        pd.DataFrame: DataFrame with new fluency columns for each AlternateUsesTask column.
    """
    fluency_frames = {}  # Dictionary to hold new fluency columns

    for i in range(1, 6):
        col_name = f'AlternateUsesTask{i}'
        fluency_col_name = f'Fluency{i}'
        fluency_frames[fluency_col_name] = np.where(df[col_name].notna(), df[col_name].str.count(';') + 1, 0)

    # Create a DataFrame from the dictionary and concatenate it with the original DataFrame
    fluency_df = pd.DataFrame(fluency_frames, index=df.index)
    df = pd.concat([df, fluency_df], axis=1)

    return df

In [87]:
data = calculate_fluency_scores(data)

In [88]:
print(data['Fluency1'])
print(data.columns)
print(data.shape)

0     2.0
1     2.0
2     0.0
3     0.0
4     0.0
     ... 
68    6.0
69    8.0
70    8.0
71    5.0
72    0.0
Name: Fluency1, Length: 73, dtype: float64
Index(['AlternateUsesTask1', 'AlternateUsesTask2', 'AlternateUsesTask3',
       'AlternateUsesTask4', 'AlternateUsesTask5', 'AssignedCondition',
       'Create New Field or Choose From Dropdown...', 'CreativityLevel',
       'DAT1', 'DAT10',
       ...
       'post_creative_personal_identity', 'post_creative_self_efficacy',
       'pre_creative_personal_identity', 'pre_creative_self_efficacy',
       'DigitSpanScore', 'Fluency1', 'Fluency2', 'Fluency3', 'Fluency4',
       'Fluency5'],
      dtype='object', length=308)
(73, 308)


In [89]:
# Step 1: Define the cutoff date and time
cutoff_datetime = pd.to_datetime('2025-02-17 17:00:00')

# Step 2: Have a default for any errors
default_date = pd.Timestamp('2025-01-01')
data['StartDate'] = pd.to_datetime(data['StartDate'], errors='coerce').fillna(default_date)

# Step 3: Filter the DataFrame
data = data[data['StartDate'] >= cutoff_datetime]

# Display the filtered DataFrame
print(data['StartDate'])

45   2025-02-17 21:29:26
51   2025-02-19 17:16:03
54   2025-02-23 20:00:03
55   2025-02-24 16:47:02
56   2025-02-19 14:48:31
57   2025-02-20 14:02:32
58   2025-02-23 14:15:03
59   2025-02-25 11:24:13
60   2025-03-05 17:31:07
61   2025-03-05 17:34:35
62   2025-03-09 17:23:59
63   2025-03-05 18:22:27
64   2025-03-13 18:30:54
65   2025-03-14 11:44:15
66   2025-03-16 21:14:19
67   2025-03-30 17:47:09
68   2025-03-30 18:01:44
69   2025-03-30 18:41:45
70   2025-03-30 19:21:16
71   2025-03-31 09:49:38
72   2025-03-30 19:36:54
Name: StartDate, dtype: datetime64[ns]


  data['StartDate'] = pd.to_datetime(data['StartDate'], errors='coerce').fillna(default_date)


In [None]:
import dat

# GloVe model from https://nlp.stanford.edu/projects/glove/
model = dat.Model("glove.840B.300d.txt", "words.txt")

In [None]:

# Word examples (Figure 1 in paper)
test = ["Oboe", "apricot" , "elephant", "pillow case", "basketball", "Vampire", "crochet", "chess", "overalls", "tape"]

test2 = ["dog", "graph", "skateboard", "guitar", "shirt", "black", "window", "soup", "string lights", "pinwheel"]
test3 = ["cat", "vegetable ","swimming ","sewn", "read", "watch", "blanket", "trash", "rug"]
# Compute the DAT score (transformed average cosine distance of first 7 valid words)
print(model.dat(test)) # 87.03
print(model.dat(test2))
print(model.dat(test3))

86.40841
78.68301
79.78776


In [95]:
# Combine DAT columns into lists per row
dat_cols = [f'DAT{i}' for i in range(1, 11)]

# Make sure all words are strings and handle missing values
data[dat_cols] = data[dat_cols].astype(str).fillna('')

# Apply the DAT model to each row
def compute_dat_score(row):
    words = [word for word in row[dat_cols] if word]  # Filter out empty strings
    return model.dat(words)

data['DATScore'] = data.apply(compute_dat_score, axis=1)

In [93]:
print(data['DATScore'])

45    77.898827
51    14.901323
54    79.355782
55    76.684273
56          NaN
57          NaN
58          NaN
59          NaN
60          NaN
61          NaN
62    83.678215
63          NaN
64    77.799004
65    82.316826
66    82.750648
67    80.883568
68    89.440758
69    78.683006
70    79.787758
71    87.030182
72          NaN
Name: DATScore, dtype: float64
