In [405]:
import pandas as pd
import matplotlib.pyplot as plt

In [406]:
# Define the file path for the CSV file
file_path = 'pre-intervention.csv'

In [407]:
# Load the CSV file into a pandas DataFrame
data = pd.read_csv(file_path)

In [408]:
# 1. Transform the "How many hours per week do you spend preparing instructional materials?" column
hours_mapping = {
    'Less than 1 hour': 0.5,
    '1-3 hours': 2,
    '4-6 hours': 5,
    '7-10 hours': 8.5,
    '11-15 hours': 13,
    'More than 15 hours': 16
}
pd.set_option('future.no_silent_downcasting', True)
data['How many hours per week do you spend preparing instructional materials?'] = data['How many hours per week do you spend preparing instructional materials?'].replace(hours_mapping)

In [409]:
# 2. Transform the "How often do you use digital tools or resources in your lesson planning and material preparation?" column
digital_tool_mapping = {
	'Never': 0,
    'Rarely (once a month or less)': 1,
    'Occasionally (2-3 times a month)': 2,
    'Frequently (once a week)': 3,
    'Very Frequently (2-4 times a week)': 4,
    'Always (daily)': 5
}
data['How often do you use digital tools or resources in your lesson planning and material preparation?'] = data['How often do you use digital tools or resources in your lesson planning and material preparation?'].replace(digital_tool_mapping)

In [410]:
# 5. Transform the "How often do you feel overwhelmed by your teaching workload?" column
overwhelm_mapping = {
    'Never': 0,
    'Rarely': 1,
    'Occasionally': 2,
    'Frequently': 3,
    'Very Frequently': 4,
    'Always': 5
}
data['How often do you feel overwhelmed by your teaching workload?'] = data['How often do you feel overwhelmed by your teaching workload?'].replace(overwhelm_mapping)

In [411]:
# # 6. Transform the "Do you find it challenging to create personalised instructional materials that meet the diverse needs of your students?" column
challenge_mapping = {
    'Strongly Agree': 4,
    'Agree': 3,
    'Neutral': 2,
    'Disagree': 1,
    'Strongly Disagree': 0
}
data['Do you find it challenging to create personalised instructional materials that meet the diverse needs of your students?'] = data['Do you find it challenging to create personalised instructional materials that meet the diverse needs of your students?'].replace(challenge_mapping)

In [412]:
# 7. Transform the "How comfortable are you with using technology in your teaching practices?" column
comfort_mapping = {
    'Very Comfortable': 4,
    'Comfortable': 3,
    'Neutral': 2,
    'Uncomfortable': 1,
    'Very Uncomfortable': 0
}
data['How comfortable are you with using technology in your teaching practices?'] = data['How comfortable are you with using technology in your teaching practices?'].replace(comfort_mapping)

In [413]:
# 8. Transform the "Have you previously used any AI tools for teaching or instructional material preparation?" column
ai_tool_mapping = {
    'Yes': 1,
    'No': 2
}
data['Have you previously used any AI tools for teaching or instructional material preparation?'] = data['Have you previously used any AI tools for teaching or instructional material preparation?'].replace(ai_tool_mapping)

In [414]:
# Refine column names from the transformed data
column_names = [
    "Teaching_Experience",
    "Subjects_Taught",
    "Year_Groups_Taught",
    "Frequency_of_Digital_Tool_Usage",
    "Hours_Preparing_Materials",
    "Overwhelmed_by_Workload",
    "Comfort_with_Technology",
    "Challenges_Personalizing_Materials",
    "AI_Tool_Usage"
]
# data = pd.read_csv(file_path, names=column_names, header=0)

# Assign the refined column names to the transformed data
data.columns = column_names

# Display the refined data
data.head()

#Save the refined data to a new CSV file
data.to_csv('transformed_pre_intervention_v2.csv', index=False)

In [415]:
# Read the transformed data
transformed_header_data = pd.read_csv('transformed_pre_intervention_v2.csv')
transformed_header_data.head()

Unnamed: 0,Teaching_Experience,Subjects_Taught,Year_Groups_Taught,Frequency_of_Digital_Tool_Usage,Hours_Preparing_Materials,Overwhelmed_by_Workload,Comfort_with_Technology,Challenges_Personalizing_Materials,AI_Tool_Usage
0,4 - 6 years,"English, Physical Education","Year 10 / Grade 10 / Senior Secondary 1, Year ...",4,8.5,4.0,2,4.0,2
1,7 - 10 years,Science,"Year 10 / Grade 10 / Senior Secondary 1, Year ...",2,2.0,2.0,3,3.0,2
2,4 - 6 years,"Mathematics, Physics","Year 10 / Grade 10 / Senior Secondary 1, Year ...",4,2.0,2.0,1,4.0,1
3,1 - 3 years,Social studies,"Adult education, continuing education",2,2.0,3.0,4,2.0,2
4,7 - 10 years,"History, Geography","Year 11 / Grade 11 / Senior Secondary 2, Year ...",2,0.5,4.0,2,0.0,1


### Descriptive Statistics on the Pre-Intervention Data

In [416]:
# Function to create a pie chart for a given column and labels
def create_pie_chart(column_data, column_name, labels, colors):
    counts = column_data.value_counts().values  # Get counts for the column

    # Custom autopct function to show both count and percentage
    def autopct_format(pct, counts):
        total = sum(counts)
        count = int(round(pct * total / 100.0))
        return f'{count} ({pct:.1f}%)'

    # Create the pie chart
    plt.figure(figsize=(7, 7), facecolor='white')
    plt.pie(counts, labels=labels, autopct=lambda pct: autopct_format(pct, counts), startangle=90, colors=colors[:len(labels)])

    # Equal aspect ratio ensures that pie is drawn as a circle
    plt.axis('equal')

    # Title for the chart
    plt.title(f'{column_name}', pad=20)

    # Show the pie chart
    plt.show()
    
# Function to create a bar chart for a given column and labels
def create_bar_chart(column_data, column_name, labels, colors):
    counts = column_data.value_counts().values  # Get counts for the column

    # Create the bar chart
    plt.figure(figsize=(10, 6),facecolor='white')
    plt.bar(labels[:len(counts)], counts, color=colors[:len(counts)], edgecolor='black')

    # Title and labels for the chart
    plt.title(f'{column_name}', pad=20)
    plt.xlabel('Categories')
    plt.ylabel('Count')

    # Annotate each bar with the count
    for i, count in enumerate(counts):
        plt.text(i, count + 0.5, str(count), ha='center')

    # Show the bar chart
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
    
# Function to create a line graph for a given column and labels
def create_line_graph(column_data, column_name, labels, color='blue'):
    counts = column_data.value_counts().sort_index().values  # Get counts for the column

    # Create the line graph
    plt.figure(figsize=(10, 6), facecolor='white')
    plt.plot(labels[:len(counts)], counts, marker='o', color=color, linestyle='-', linewidth=2)

    # Title and labels for the chart
    plt.title(f'{column_name}', pad=20)
    plt.xlabel('Categories')
    plt.ylabel('Count')

    # Annotate each point with the count
    for i, count in enumerate(counts):
        plt.text(i, count + 0.5, str(count), ha='center')

    # Show the line graph
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


### Quick overview of pre-intervention data

In [417]:
transformed_header_data.describe().round(1)

Unnamed: 0,Frequency_of_Digital_Tool_Usage,Hours_Preparing_Materials,Overwhelmed_by_Workload,Comfort_with_Technology,Challenges_Personalizing_Materials,AI_Tool_Usage
count,703.0,703.0,699.0,703.0,702.0,703.0
mean,3.0,3.9,2.4,2.5,3.5,1.5
std,1.5,3.5,1.2,1.1,0.7,0.5
min,0.0,0.5,0.0,0.0,0.0,1.0
25%,2.0,2.0,2.0,2.0,3.0,1.0
50%,3.0,2.0,2.0,3.0,4.0,2.0
75%,4.0,5.0,3.0,3.0,4.0,2.0
max,5.0,16.0,5.0,4.0,4.0,2.0


### 1. Frequency of Teaching Experience

In [418]:
Teacher_Experience_index = transformed_header_data['Teaching_Experience'].value_counts().index
Teacher_Experience_values = transformed_header_data['Teaching_Experience'].value_counts().values
data = {'Teaching_Experience': Teacher_Experience_index, 'Count': Teacher_Experience_values}
df = pd.DataFrame(data)
# Calculating the total count
total_count = df['Count'].sum()
print(f'Total Count: {total_count}')
# Adding a new column for percentage
df['Percentage'] = ((df['Count'] / total_count) * 100).round(1)
Total_Percentage = df['Percentage'].sum()
print(f'Total Percentage: {Total_Percentage}')
# Display the DataFrame
df

Total Count: 703
Total Percentage: 99.99999999999999


Unnamed: 0,Teaching_Experience,Count,Percentage
0,More than ten years,210,29.9
1,7 - 10 years,184,26.2
2,4 - 6 years,176,25.0
3,1 - 3 years,124,17.6
4,Less than one year,9,1.3


### 2. Frequency of Digital Tool Usage

In [419]:
Digital_Tool_Usage_index = transformed_header_data['Frequency_of_Digital_Tool_Usage'].value_counts().index
Digital_Tool_Usage_value = transformed_header_data['Frequency_of_Digital_Tool_Usage'].value_counts().values
# map the index to the actual values
Digital_Tool_Usage_index = [list(digital_tool_mapping.keys())[list(digital_tool_mapping.values()).index(i)] for i in Digital_Tool_Usage_index]
# Calculating the total count
data = {'Digital_Tool_Usage': Digital_Tool_Usage_index, 'Count': Digital_Tool_Usage_value}
df = pd.DataFrame(data)
# Calculating the total count
total_count = df['Count'].sum()
print(f'Total Count: {total_count}')
# Adding a new column for percentage
df['Percentage'] = ((df['Count'] / total_count) * 100).round(1)
Total_Percentage = df['Percentage'].sum()
print(f'Total Percentage: {Total_Percentage}')
# Display the DataFrame
df

Total Count: 703
Total Percentage: 99.9


Unnamed: 0,Digital_Tool_Usage,Count,Percentage
0,Very Frequently (2-4 times a week),154,21.9
1,Occasionally (2-3 times a month),151,21.5
2,Frequently (once a week),138,19.6
3,Always (daily),135,19.2
4,Rarely (once a month or less),86,12.2
5,Never,39,5.5


### 3. Hours Preparing Materials

In [420]:
Hours_Preparing_Materials_index = transformed_header_data['Hours_Preparing_Materials'].value_counts().index
Hours_Preparing_Materials_value = transformed_header_data['Hours_Preparing_Materials'].value_counts().values
# map the index to the actual values
Hours_Preparing_Materials_index = [list(hours_mapping.keys())[list(hours_mapping.values()).index(i)] for i in Hours_Preparing_Materials_index]
# Calculating the total count
data = {'Hours_Preparing_Materials': Hours_Preparing_Materials_index, 'Count': Hours_Preparing_Materials_value}
df = pd.DataFrame(data)
# Calculating the total count
total_count = df['Count'].sum()
print(f'Total Count: {total_count}')
# Adding a new column for percentage
df['Percentage'] = ((df['Count'] / total_count) * 100).round(1)
Total_Percentage = df['Percentage'].sum()
print(f'Total Percentage: {Total_Percentage}')
# Display the DataFrame
df

Total Count: 703
Total Percentage: 100.0


Unnamed: 0,Hours_Preparing_Materials,Count,Percentage
0,1-3 hours,336,47.8
1,4-6 hours,168,23.9
2,Less than 1 hour,87,12.4
3,7-10 hours,68,9.7
4,11-15 hours,27,3.8
5,More than 15 hours,17,2.4


### 4. Overwhelmed by Workload

In [421]:
Overwhelmed_by_Workload_index = transformed_header_data['Overwhelmed_by_Workload'].value_counts().index
Overwhelmed_by_Workload_value = transformed_header_data['Overwhelmed_by_Workload'].value_counts().values
# map the index to the actual values
Overwhelmed_by_Workload_index = [list(overwhelm_mapping.keys())[list(overwhelm_mapping.values()).index(i)] for i in Overwhelmed_by_Workload_index]
# Calculating the total count
data = {'Overwhelmed_by_Workload': Overwhelmed_by_Workload_index, 'Count': Overwhelmed_by_Workload_value}
df = pd.DataFrame(data)
# Calculating the total count
total_count = df['Count'].sum()
print(f'Total Count: {total_count}')
# Adding a new column for percentage
df['Percentage'] = ((df['Count'] / total_count) * 100).round(1)
Total_Percentage = df['Percentage'].sum()
print(f'Total Percentage: {Total_Percentage}')
# Display the DataFrame
df

Total Count: 699
Total Percentage: 99.9


Unnamed: 0,Overwhelmed_by_Workload,Count,Percentage
0,Occasionally,293,41.9
1,Frequently,142,20.3
2,Rarely,114,16.3
3,Always,65,9.3
4,Very Frequently,61,8.7
5,Never,24,3.4


### 5. Comfort with Technology

In [422]:
Comfort_with_Technology_index = transformed_header_data['Comfort_with_Technology'].value_counts().index
Comfort_with_Technology_value = transformed_header_data['Comfort_with_Technology'].value_counts().values
# map the index to the actual values
Comfort_with_Technology_index = [list(comfort_mapping.keys())[list(comfort_mapping.values()).index(i)] for i in Comfort_with_Technology_index]
# Calculating the total count
data = {'Comfort_with_Technology': Comfort_with_Technology_index, 'Count': Comfort_with_Technology_value}
df = pd.DataFrame(data)
# Calculating the total count
total_count = df['Count'].sum()
print(f'Total Count: {total_count}')
# Adding a new column for percentage
df['Percentage'] = ((df['Count'] / total_count) * 100).round(1)
Total_Percentage = df['Percentage'].sum()
print(f'Total Percentage: {Total_Percentage}')
# Display the DataFrame
df

Total Count: 703
Total Percentage: 100.0


Unnamed: 0,Comfort_with_Technology,Count,Percentage
0,Comfortable,320,45.5
1,Neutral,135,19.2
2,Very Comfortable,111,15.8
3,Uncomfortable,109,15.5
4,Very Uncomfortable,28,4.0


### 6. Challenges Personalizing Materials

In [423]:
Challenges_Personalizing_Materials_index = transformed_header_data['Challenges_Personalizing_Materials'].value_counts().index
Challenges_Personalizing_Materials_value = transformed_header_data['Challenges_Personalizing_Materials'].value_counts().values
# map the index to the actual values
Challenges_Personalizing_Materials_index = [list(challenge_mapping.keys())[list(challenge_mapping.values()).index(i)] for i in Challenges_Personalizing_Materials_index]
# Calculating the total count
data = {'Challenges_Personalizing_Materials': Challenges_Personalizing_Materials_index, 'Count': Challenges_Personalizing_Materials_value}
df = pd.DataFrame(data)
# Calculating the total count
total_count = df['Count'].sum()
print(f'Total Count: {total_count}')
# Adding a new column for percentage
df['Percentage'] = ((df['Count'] / total_count) * 100).round(1)
   
Total_Percentage = df['Percentage'].sum()
print(f'Total Percentage: {Total_Percentage}')
# Display the DataFrame
df

Total Count: 702
Total Percentage: 99.89999999999999


Unnamed: 0,Challenges_Personalizing_Materials,Count,Percentage
0,Strongly Agree,429,61.1
1,Agree,217,30.9
2,Neutral,43,6.1
3,Disagree,8,1.1
4,Strongly Disagree,5,0.7


### 7. AI Tool Usage

In [424]:
AI_Tool_Usage_index = transformed_header_data['AI_Tool_Usage'].value_counts().index
AI_Tool_Usage_value = transformed_header_data['AI_Tool_Usage'].value_counts().values
# map the index to the actual values
AI_Tool_Usage_index = [list(ai_tool_mapping.keys())[list(ai_tool_mapping.values()).index(i)] for i in AI_Tool_Usage_index]
# Calculating the total count
data = {'AI_Tool_Usage': AI_Tool_Usage_index, 'Count': AI_Tool_Usage_value}
df = pd.DataFrame(data)
# Calculating the total count
total_count = df['Count'].sum()
print(f'Total Count: {total_count}')
# Adding a new column for percentage
df['Percentage'] = ((df['Count'] / total_count) * 100).round(1)
Total_Percentage = df['Percentage'].sum()
print(f'Total Percentage: {Total_Percentage}')
# Display the DataFrame
df

Total Count: 703
Total Percentage: 100.0


Unnamed: 0,AI_Tool_Usage,Count,Percentage
0,No,356,50.6
1,Yes,347,49.4


### Testing Correlation between Columns

In [440]:
from scipy.stats import pearsonr

In [443]:
# Check if there are missing values in the relevant columns and handle them
df = transformed_header_data[['Frequency_of_Digital_Tool_Usage', 'Overwhelmed_by_Workload']]
# Calculate the Pearson correlation coefficient
df = df.dropna(subset=['Frequency_of_Digital_Tool_Usage', 'Overwhelmed_by_Workload'])
corr, p_value = pearsonr(df['Frequency_of_Digital_Tool_Usage'], df['Overwhelmed_by_Workload'])
# Output the result
print(f"Pearson Correlation Coefficient: {corr}")
print(f"P-value: {p_value}")

Pearson Correlation Coefficient: 0.0015039097518604536
P-value: 0.9683628861829182


### Correlation Matrix for Selected Columns

In [425]:
# Columns of interest for the calculations
columns_of_interest = [
    "Frequency_of_Digital_Tool_Usage",
    "Hours_Preparing_Materials",
    "Overwhelmed_by_Workload",
    "Comfort_with_Technology",
    "Challenges_Personalizing_Materials",
    "AI_Tool_Usage",
]

### 1. Pearson Correlation Matrix for Selected Columns

In [426]:
correlation_matrix = transformed_header_data[columns_of_interest].corr(method='pearson')
correlation_matrix.style.background_gradient(cmap='coolwarm').set_caption("Correlation Matrix for Selected Columns").set_table_styles(
	    [{'selector': 'caption', 'props': [('color', 'black'), ('font-size', '16px')]}]
)

Unnamed: 0,Frequency_of_Digital_Tool_Usage,Hours_Preparing_Materials,Overwhelmed_by_Workload,Comfort_with_Technology,Challenges_Personalizing_Materials,AI_Tool_Usage
Frequency_of_Digital_Tool_Usage,1.0,0.153681,-0.004552,-0.172538,0.226812,-0.311257
Hours_Preparing_Materials,0.153681,1.0,0.110809,0.048125,0.051014,-0.091507
Overwhelmed_by_Workload,-0.004552,0.110809,1.0,0.210129,0.021425,-0.03258
Comfort_with_Technology,-0.172538,0.048125,0.210129,1.0,-0.063493,0.078424
Challenges_Personalizing_Materials,0.226812,0.051014,0.021425,-0.063493,1.0,-0.147364
AI_Tool_Usage,-0.311257,-0.091507,-0.03258,0.078424,-0.147364,1.0


### 1b. Pearson correlation coefficient metrix and the p-value

In [439]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

th_data = transformed_header_data.dropna(subset=columns_of_interest)
transformed_header_data = th_data
# Create empty DataFrames for storing correlation coefficients and p-values
correlation_matrix = pd.DataFrame(np.zeros((len(columns_of_interest), len(columns_of_interest))), 
                                  columns=columns_of_interest, 
                                  index=columns_of_interest)

p_value_matrix = pd.DataFrame(np.zeros((len(columns_of_interest), len(columns_of_interest))), 
                              columns=columns_of_interest, 
                              index=columns_of_interest)

# Calculate Pearson correlation and p-values
for col1 in columns_of_interest:
    for col2 in columns_of_interest:
        if col1 == col2:
            # Fill diagonal with 1 for correlation and 0 for p-values
            correlation_matrix.loc[col1, col2] = 1.0
            p_value_matrix.loc[col1, col2] = 0.0
        else:
            # Compute Pearson correlation and p-value
            corr, p_val = pearsonr(transformed_header_data[col1], transformed_header_data[col2])
            correlation_matrix.loc[col1, col2] = corr
            p_value_matrix.loc[col1, col2] = p_val

# Function to format both correlation and p-value
def custom_format(corr, p):
    if p < 0.001:
        return f'{corr:.2f}\n(p<{p:.3f})'  # Display up to 3 decimal places for small p-values
    else:
        return f'{corr:.2f}\n(p={p:.3f})'

# Create a DataFrame for formatted correlation and p-values
formatted_matrix = pd.DataFrame('', columns=columns_of_interest, index=columns_of_interest, dtype='object')

# Apply custom formatting to combine correlation and p-values
for col1 in columns_of_interest:
    for col2 in columns_of_interest:
        formatted_matrix.loc[col1, col2] = custom_format(correlation_matrix.loc[col1, col2], p_value_matrix.loc[col1, col2])

# Apply background gradient only on the correlation values

# Apply background gradient only on the correlation values
styled_matrix = correlation_matrix.style.background_gradient(cmap='coolwarm') \
    .set_caption("Pearson Correlation Matrix with P-values for Selected Columns") \
    .set_table_styles([{'selector': 'caption', 'props': [('color', 'black'), ('font-size', '16px')]}])

# Display formatted matrix (correlation coefficients and p-values) without background gradient
styled_matrix_with_text = formatted_matrix.style.set_properties(**{'white-space': 'pre'}) \
    .set_caption("Pearson Correlation Matrix with P-values for Selected Columns") \
    .set_table_styles([{'selector': 'caption', 'props': [('color', 'black'), ('font-size', '16px')]}])

# Display both matrices
styled_matrix_with_text

Unnamed: 0,Frequency_of_Digital_Tool_Usage,Hours_Preparing_Materials,Overwhelmed_by_Workload,Comfort_with_Technology,Challenges_Personalizing_Materials,AI_Tool_Usage
Frequency_of_Digital_Tool_Usage,1.00 (p<0.000),0.15 (p<0.000),0.00 (p=0.968),-0.17 (p<0.000),0.22 (p<0.000),-0.31 (p<0.000)
Hours_Preparing_Materials,0.15 (p<0.000),1.00 (p<0.000),0.11 (p=0.003),0.05 (p=0.180),0.05 (p=0.200),-0.09 (p=0.020)
Overwhelmed_by_Workload,0.00 (p=0.968),0.11 (p=0.003),1.00 (p<0.000),0.21 (p<0.000),0.02 (p=0.572),-0.04 (p=0.346)
Comfort_with_Technology,-0.17 (p<0.000),0.05 (p=0.180),0.21 (p<0.000),1.00 (p<0.000),-0.06 (p=0.092),0.07 (p=0.049)
Challenges_Personalizing_Materials,0.22 (p<0.000),0.05 (p=0.200),0.02 (p=0.572),-0.06 (p=0.092),1.00 (p<0.000),-0.15 (p<0.000)
AI_Tool_Usage,-0.31 (p<0.000),-0.09 (p=0.020),-0.04 (p=0.346),0.07 (p=0.049),-0.15 (p<0.000),1.00 (p<0.000)


### 2. Spearman Correlation Matrix for Selected Columns

In [428]:
spearman_correlation_matrix = transformed_header_data[columns_of_interest].corr(method='spearman')

# Styling the correlation matrix with a background gradient and caption
spearman_correlation_matrix.style.background_gradient(cmap='coolwarm').set_caption(
    "Spearman Correlation Matrix for Selected Columns"
).set_table_styles(
    [{'selector': 'caption', 'props': [('color', 'black'), ('font-size', '16px')]}]
)

Unnamed: 0,Frequency_of_Digital_Tool_Usage,Hours_Preparing_Materials,Overwhelmed_by_Workload,Comfort_with_Technology,Challenges_Personalizing_Materials,AI_Tool_Usage
Frequency_of_Digital_Tool_Usage,1.0,0.156357,-0.000232,-0.155506,0.231652,-0.302691
Hours_Preparing_Materials,0.156357,1.0,0.122344,0.066165,0.071841,-0.073544
Overwhelmed_by_Workload,-0.000232,0.122344,1.0,0.233886,0.015232,-0.035363
Comfort_with_Technology,-0.155506,0.066165,0.233886,1.0,-0.076739,0.067893
Challenges_Personalizing_Materials,0.231652,0.071841,0.015232,-0.076739,1.0,-0.166382
AI_Tool_Usage,-0.302691,-0.073544,-0.035363,0.067893,-0.166382,1.0


### 3. Spearman correlation coefficient metrix and the p-value for selected columns

In [429]:
import numpy as np
from scipy.stats import spearmanr

def custom_format(corr, p):
    # Check if the p-value is very small, display more precision
    if p < 0.001:
        return f'{corr:.2f}\n(p={p:.6f})'  # Display up to 6 decimal places for small p-values
    else:
        return f'{corr:.2f}\n(p={p:.3f})'

# Assuming 'data' is your DataFrame and 'columns_of_interest' contains the columns you want to analyze
data = transformed_header_data
data_subset = data[columns_of_interest]

# Create empty DataFrames for storing correlation coefficients and p-values
spearman_corr = pd.DataFrame(np.zeros((len(columns_of_interest), len(columns_of_interest))), 
                             columns=columns_of_interest, 
                             index=columns_of_interest)

p_values = pd.DataFrame(np.zeros((len(columns_of_interest), len(columns_of_interest))), 
                        columns=columns_of_interest, 
                        index=columns_of_interest)

clean_data = data.dropna(subset=columns_of_interest)

# Calculate Spearman correlation and p-values
for col1 in columns_of_interest:
    for col2 in columns_of_interest:
        if col1 == col2:
            spearman_corr.loc[col1, col2] = 1.0
            p_values.loc[col1, col2] = 0.0
        else:
            corr, p_val = spearmanr(clean_data[col1], clean_data[col2])
            spearman_corr.loc[col1, col2] = corr
            p_values.loc[col1, col2] = p_val

# Create a new DataFrame with dtype 'object' for storing formatted strings
combined = pd.DataFrame('', columns=columns_of_interest, index=columns_of_interest, dtype='object')

# Custom format for displaying both correlation and p-values
# def custom_format(corr, p):
#     return f'{corr:.2f}\n(p={p:.3f})'

# Apply custom formatting to combine correlation and p-values
for col1 in columns_of_interest:
    for col2 in columns_of_interest:
        combined.loc[col1, col2] = custom_format(spearman_corr.loc[col1, col2], p_values.loc[col1, col2])

# Styling the combined matrix
combined = combined.style.set_properties(**{'white-space': 'pre'}) \
    .set_caption("Spearman Correlation Matrix with P-values for Selected Columns") \
    .set_table_styles([{'selector': 'caption', 'props': [('color', 'black'), ('font-size', '16px')]}])

combined

Unnamed: 0,Frequency_of_Digital_Tool_Usage,Hours_Preparing_Materials,Overwhelmed_by_Workload,Comfort_with_Technology,Challenges_Personalizing_Materials,AI_Tool_Usage
Frequency_of_Digital_Tool_Usage,1.00 (p=0.000000),0.16 (p=0.000033),-0.00 (p=0.995),-0.16 (p=0.000037),0.23 (p=0.000000),-0.30 (p=0.000000)
Hours_Preparing_Materials,0.16 (p=0.000033),1.00 (p=0.000000),0.12 (p=0.001),0.07 (p=0.081),0.07 (p=0.058),-0.07 (p=0.052)
Overwhelmed_by_Workload,-0.00 (p=0.995),0.12 (p=0.001),1.00 (p=0.000000),0.23 (p=0.000000),0.02 (p=0.688),-0.04 (p=0.351)
Comfort_with_Technology,-0.16 (p=0.000037),0.07 (p=0.081),0.23 (p=0.000000),1.00 (p=0.000000),-0.08 (p=0.043),0.07 (p=0.073)
Challenges_Personalizing_Materials,0.23 (p=0.000000),0.07 (p=0.058),0.02 (p=0.688),-0.08 (p=0.043),1.00 (p=0.000000),-0.17 (p=0.000010)
AI_Tool_Usage,-0.30 (p=0.000000),-0.07 (p=0.052),-0.04 (p=0.351),0.07 (p=0.073),-0.17 (p=0.000010),1.00 (p=0.000000)


###  Chi-Square Test for AI Tool Usage and Comfort with Technology

In [430]:
# Chi-Square Test for AI Tool Usage and Comfort with Technology
import pandas as pd
from scipy.stats import chi2_contingency
# Create a contingency table for AI Tool Usage and Comfort with Technology
contingency_table = pd.crosstab(data['AI_Tool_Usage'], data['Comfort_with_Technology'])

# Perform Chi-Square Test
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Print the results
print("Chi-Square Test Statistic:", chi2)
print("p-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:")
print(expected)

# Interpretation of the result
if p < 0.05:
    print("There is a significant association between AI Tool Usage and Comfort with Technology.")
else:
    print("There is no significant association between AI Tool Usage and Comfort with Technology.")

Chi-Square Test Statistic: 5.523146280686654
p-value: 0.23770238237531374
Degrees of Freedom: 4
Expected Frequencies Table:
[[ 13.87965616  54.03151862  66.91977077 157.13753582  54.03151862]
 [ 14.12034384  54.96848138  68.08022923 159.86246418  54.96848138]]
There is no significant association between AI Tool Usage and Comfort with Technology.


### Chi-Square Test for AI Tool Usage and Challenges Personalizing Materials

In [431]:
# Chi-Square Test for AI Tool Usage and Challenges Personalizing Materials  
import pandas as pd
from scipy.stats import chi2_contingency
# Create a contingency table for AI Tool Usage and Challenges Personalizing Materials
contingency_table = pd.crosstab(data['AI_Tool_Usage'], data['Challenges_Personalizing_Materials'])
# Perform Chi-Square Test
chi2, p, dof, expected = chi2_contingency(contingency_table)
# Print the results
print("Chi-Square Test Statistic:", chi2)
print("p-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:")
print(expected)
# Interpretation of the result
if p < 0.05:
    print("There is a significant association between AI Tool Usage and Challenges Personalizing Materials.")
else:
	print("There is no significant association between AI Tool Usage and Challenges Personalizing Materials.")
	

Chi-Square Test Statistic: 28.7386146930329
p-value: 8.83371545720294e-06
Degrees of Freedom: 4
Expected Frequencies Table:
[[  2.47851003   3.96561605  21.31518625 106.57593123 211.66475645]
 [  2.52148997   4.03438395  21.68481375 108.42406877 215.33524355]]
There is a significant association between AI Tool Usage and Challenges Personalizing Materials.


### Chi-square  Test for AI Tool Usage and Overwhelmed by Workload

In [432]:
# Chi-Square Test for AI Tool Usage and Overwhelmed by Workload
import pandas as pd
from scipy.stats import chi2_contingency
# Create a contingency table for AI Tool Usage and Overwhelmed by Workload
contingency_table = pd.crosstab(data['AI_Tool_Usage'], data['Overwhelmed_by_Workload'])
# Perform Chi-Square Test
chi2, p, dof, expected = chi2_contingency(contingency_table)
# Print the results
print("Chi-Square Test Statistic:", chi2)
print("p-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies Table:")
print(expected)
# Interpretation of the result
if p < 0.05:
    print("There is a significant association between AI Tool Usage and Overwhelmed by Workload.")
else:
	print("There is no significant association between AI Tool Usage and Overwhelmed by Workload.")

Chi-Square Test Statistic: 4.491238858602434
p-value: 0.4810565039575948
Degrees of Freedom: 5
Expected Frequencies Table:
[[ 11.89684814  56.51002865 145.24068768  70.38968481  30.23782235
   31.72492837]
 [ 12.10315186  57.48997135 147.75931232  71.61031519  30.76217765
   32.27507163]]
There is no significant association between AI Tool Usage and Overwhelmed by Workload.


### Chi-Square Matrix for selected columns

In [433]:
# Chi-Square Test for all pairs of selected columns
from itertools import combinations
from scipy.stats import chi2_contingency
# Create a list of all possible pairs of selected columns
column_pairs = list(combinations(columns_of_interest, 2))
# Perform Chi-Square Test for each pair of columns
for pair in column_pairs:
    contingency_table = pd.crosstab(data[pair[0]], data[pair[1]])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-Square Test for {pair[0]} and {pair[1]}:")
    print("Chi-Square Test Statistic:", chi2)
    print("p-value:", p)
    if p < 0.05:
        print("significant association", pair[0], "and", pair[1])
    else:
        print("no significant association", pair[0], "and", pair[1])
    print()

# Display the results in a matrix form
# Create an empty DataFrame to store the p-values
p_values_matrix = pd.DataFrame(1, columns=columns_of_interest, index=columns_of_interest,  dtype='float64')
# Fill the DataFrame with p-values
for pair in column_pairs:
    contingency_table = pd.crosstab(data[pair[0]], data[pair[1]])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    p_values_matrix.loc[pair[0], pair[1]] = p
    p_values_matrix.loc[pair[1], pair[0]] = p
    
# Display the p-values matrix
p_values_matrix.style.background_gradient(cmap='coolwarm').set_caption(
    "Chi-Square P-Values Matrix for Selected Columns"
).set_table_styles(
    [{'selector': 'caption', 'props': [('color', 'black'), ('font-size', '16px')]},]
)
    

Chi-Square Test for Frequency_of_Digital_Tool_Usage and Hours_Preparing_Materials:
Chi-Square Test Statistic: 44.50936066094838
p-value: 0.009505826153609464
significant association Frequency_of_Digital_Tool_Usage and Hours_Preparing_Materials

Chi-Square Test for Frequency_of_Digital_Tool_Usage and Overwhelmed_by_Workload:
Chi-Square Test Statistic: 57.867009394764324
p-value: 0.00020436081483094073
significant association Frequency_of_Digital_Tool_Usage and Overwhelmed_by_Workload

Chi-Square Test for Frequency_of_Digital_Tool_Usage and Comfort_with_Technology:
Chi-Square Test Statistic: 39.915939394602674
p-value: 0.005119065776368011
significant association Frequency_of_Digital_Tool_Usage and Comfort_with_Technology

Chi-Square Test for Frequency_of_Digital_Tool_Usage and Challenges_Personalizing_Materials:
Chi-Square Test Statistic: 83.52946233117063
p-value: 9.802988291644367e-10
significant association Frequency_of_Digital_Tool_Usage and Challenges_Personalizing_Materials

Chi-S

Unnamed: 0,Frequency_of_Digital_Tool_Usage,Hours_Preparing_Materials,Overwhelmed_by_Workload,Comfort_with_Technology,Challenges_Personalizing_Materials,AI_Tool_Usage
Frequency_of_Digital_Tool_Usage,1.0,0.009506,0.000204,0.005119,0.0,0.0
Hours_Preparing_Materials,0.009506,1.0,4e-06,0.56332,0.709418,0.211891
Overwhelmed_by_Workload,0.000204,4e-06,1.0,0.0,0.38724,0.481057
Comfort_with_Technology,0.005119,0.56332,0.0,1.0,6.1e-05,0.237702
Challenges_Personalizing_Materials,0.0,0.709418,0.38724,6.1e-05,1.0,9e-06
AI_Tool_Usage,0.0,0.211891,0.481057,0.237702,9e-06,1.0


### ANOVA Test for AI Tool Usage and Comfort with Technology

In [434]:
### ANOVA Test for AI Tool Usage and Challenges Personalizing Materials
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols


columns_of_interest = [
    "Frequency_of_Digital_Tool_Usage",
    "Hours_Preparing_Materials",
    "Overwhelmed_by_Workload",
    "Comfort_with_Technology",
    "Challenges_Personalizing_Materials",
    "AI_Tool_Usage",
]

# Ensure we are using the original columns without .value_counts() to avoid length mismatch
data = pd.DataFrame({
    'AI_Tool_Usage': transformed_header_data['AI_Tool_Usage'],
    'Comfort_with_Technology': transformed_header_data['Comfort_with_Technology'],
    'Challenges_Personalizing_Materials': transformed_header_data['Challenges_Personalizing_Materials'],
})

# Perform two-way ANOVA
model = ols('Challenges_Personalizing_Materials ~ C(Comfort_with_Technology) + C(AI_Tool_Usage) + C(Comfort_with_Technology):C(AI_Tool_Usage)', data=data).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# Create a neat table using pandas styling
styled_table = anova_table.style.set_caption("ANOVA Results").set_table_styles([
    {'selector': 'caption', 'props': [('color', 'black'), ('font-size', '16px')]}]
).format(precision=4)  # Adjust the precision if needed

# Display the styled table
styled_table

Unnamed: 0,sum_sq,df,F,PR(>F)
C(Comfort_with_Technology),3.4073,4.0,1.6427,0.1617
C(AI_Tool_Usage),7.232,1.0,13.9462,0.0002
C(Comfort_with_Technology):C(AI_Tool_Usage),0.4455,4.0,0.2148,0.9303
Residual,356.7711,688.0,,
