In [3]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the synthetic survey is identical on every run
np.random.seed(42)

# Number of simulated survey respondents
n_responses = 75

# Degree program per respondent: about 60% undergraduate, 40% postgraduate
degree_program = np.random.choice(
    ['undergraduate', 'postgraduate'],
    size=n_responses,
    p=[0.6, 0.4],
)

# Time lived in Canberra per respondent, weighted toward newer arrivals
time_in_canberra = np.random.choice(
    ['<1 year', '1-3 years', '3+ years'],
    size=n_responses,
    p=[0.4, 0.35, 0.25],
)

# Likert scale points (1 = strongly disagree, 5 = strongly agree)
likert_scale = [1, 2, 3, 4, 5]

# Probability of each Likert point for "enjoys interactive activities",
# keyed by time in Canberra: newer students lean toward agreement,
# longer-term students are slightly less enthusiastic.
interactive_weights = {
    '<1 year': [0.05, 0.1, 0.2, 0.35, 0.3],
    '1-3 years': [0.1, 0.15, 0.3, 0.3, 0.15],
    '3+ years': [0.15, 0.2, 0.35, 0.2, 0.1],
}

# Probability of each Likert point for "attends in person",
# keyed by degree program: postgraduates attend more.
in_person_weights = {
    'postgraduate': [0.1, 0.1, 0.2, 0.35, 0.25],
    'undergraduate': [0.2, 0.25, 0.3, 0.15, 0.1],
}

# Probability of each Likert point for "watches online",
# keyed by degree program: undergraduates watch online more.
online_weights = {
    'undergraduate': [0.1, 0.15, 0.2, 0.3, 0.25],
    'postgraduate': [0.25, 0.3, 0.25, 0.15, 0.05],
}

# Response arrays, one slot per respondent (float here; cast to int later)
interactive_activities = np.zeros(n_responses)
attend_in_person = np.zeros(n_responses)
watch_online = np.zeros(n_responses)

# Draw the three answers respondent by respondent. The order of the draws
# (interactive, in-person, online) is kept fixed because each call consumes
# the seeded random stream; reordering would change the generated data.
respondents = zip(time_in_canberra.tolist(), degree_program.tolist())
for row, (tenure, program) in enumerate(respondents):
    interactive_activities[row] = np.random.choice(
        likert_scale, p=interactive_weights[tenure])
    attend_in_person[row] = np.random.choice(
        likert_scale, p=in_person_weights[program])
    watch_online[row] = np.random.choice(
        likert_scale, p=online_weights[program])

# Assemble the survey table: the three Likert items (cast from float to int)
# followed by the two demographic columns, in that column order.
likert_items = {
    'interactive_activities_likert': interactive_activities,
    'attend_in_person_likert': attend_in_person,
    'watch_online_likert': watch_online,
}
survey_columns = {
    item: answers.astype(int) for item, answers in likert_items.items()
}
survey_columns['degree_program'] = degree_program
survey_columns['time_in_canberra'] = time_in_canberra
survey_data = pd.DataFrame(survey_columns)

# Lookup from column name to the survey question it represents (reference only)
column_descriptions = dict(
    interactive_activities_likert='I enjoy interactive activities in lectures (1-5 scale)',
    attend_in_person_likert='I usually attend lectures in person (1-5 scale)',
    watch_online_likert='I usually watch lectures online (1-5 scale)',
    degree_program='What kind of degree program are you in?',
    time_in_canberra='How long have you lived in Canberra?',
)

# Preview the first ten rows of the generated survey
print("Survey Data Preview:")
print(survey_data.head(10))

# Overall shape and numeric summary of the Likert columns
print(f"\nDataset shape: {survey_data.shape}")
print("\nSummary statistics:")
print(survey_data.describe())

# Frequency of each category in the two demographic columns
print("\nCategorical variable counts:")
category_reports = (
    ("\nDegree Program Distribution:", 'degree_program'),
    ("\nTime in Canberra Distribution:", 'time_in_canberra'),
)
for heading, column in category_reports:
    print(heading)
    print(survey_data[column].value_counts())

# Persist the dataset without the row index
csv_path = 'hci_survey_data.csv'
survey_data.to_csv(csv_path, index=False)
print(f"\nData saved to '{csv_path}'")

# Sanity check: group means should reflect the relationships built into the
# generation step (heading, grouping column, Likert column).
print("\n=== Validation of Data Relationships ===")
relationship_checks = (
    ("\nInteractive Activities by Time in Canberra:",
     'time_in_canberra', 'interactive_activities_likert'),
    ("\nIn-Person Attendance by Degree Program:",
     'degree_program', 'attend_in_person_likert'),
    ("\nOnline Watching by Degree Program:",
     'degree_program', 'watch_online_likert'),
)
for heading, group_column, likert_column in relationship_checks:
    print(heading)
    group_means = survey_data.groupby(group_column)[likert_column].mean()
    print(group_means.round(2))

Survey Data Preview:
   interactive_activities_likert  attend_in_person_likert  \
0                              5                        2   
1                              3                        5   
2                              4                        5   
3                              5                        2   
4                              4                        3   
5                              4                        2   
6                              1                        3   
7                              1                        4   
8                              4                        2   
9                              4                        5   

   watch_online_likert degree_program time_in_canberra  
0                    2  undergraduate        1-3 years  
1                    1   postgraduate         3+ years  
2                    1   postgraduate          <1 year  
3                    4  undergraduate          <1 year  
4                    1