In [2]:
import pandas as pd
import numpy as np

# Map each raw survey question header to a short, analysis-friendly column name.
# Keys must match the CSV headers exactly (including trailing spaces and the
# ".1" suffix pandas appends to a duplicated header).
rename_mapping = {
    'Timestamp': 'Timestamp',
    'I consent and am aware that my responses in this survey will be collected for data analysis': 'Consent',
    'What is your age?': 'Age',
    'What is your gender': 'Gender',
    'When was your last theme park visit? (MM/YYYY)': 'LastVisit',
    'Were there any special occasions that prompted the visit? (etc Christmas, school holidays, weekends)': 'SpecialOccasionFlag',
    'If you answered yes for above, which special occasion was it?': 'SpecialOccasionType',
    'Which period of the year do you normally visit the theme park? ': 'VisitPeriodYear',
    'Which period of the day do you normally visit the theme park? (Opening time, early afternoon, late afternoon, evening)': 'VisitPeriodDay',
    'Which theme parks have you visited?': 'VisitedParks',
    'If you answered "Others", please state the theme park': 'OtherPark',
    'How frequently do you visit theme parks?': 'VisitFrequency',
    'What are your considerations when planning your theme park visits?': 'PlanningConsiderations',
    'Which attractions or rides do you plan to visit when you go visit a theme park? Pick up to 2-3': 'PlannedAttractions',
    'If you stated others, please state the attraction': 'OtherPlannedAttraction',
    'Which attractions or rides do you end up visiting? Pick up to 2-3': 'VisitedAttractions',
    'If you stated others, please state the attraction.1': 'OtherVisitedAttraction',
    'How was the WiFi quality in the theme park?': 'WiFiQuality',
    'What are some pain points you have experienced?': 'PainPoints',
    'If you answered "Others", please state the pain points': 'OtherPainPoints',
    'What do you hope can be implemented to overcome said challenges?': 'SuggestedImprovements'
}

# Load the data, shorten the headers, and discard the two bookkeeping columns
# (submission timestamp and consent tick-box) that are not used in the analysis.
df = (
    pd.read_csv('./data/SurveyData.csv')
    .rename(columns=rename_mapping)
    .drop(columns=['Timestamp', 'Consent'])
)

print("Renamed columns:")
print(list(df.columns))

Renamed columns:
['Age', 'Gender', 'LastVisit', 'SpecialOccasionFlag', 'SpecialOccasionType', 'VisitPeriodYear', 'VisitPeriodDay', 'VisitedParks', 'OtherPark', 'VisitFrequency', 'PlanningConsiderations', 'PlannedAttractions', 'OtherPlannedAttraction', 'VisitedAttractions', 'OtherVisitedAttraction', 'WiFiQuality', 'PainPoints', 'OtherPainPoints', 'SuggestedImprovements']


In [9]:
from datetime import timedelta

# Fix the global NumPy seed so the synthetic rows are reproducible run to run.
np.random.seed(3101)
# --- Set Target and Prepare for Synthetic Generation ---
target_total = 200
# Clamp at zero: if the survey already has at least `target_total` responses,
# nothing is generated and the message below should not show a negative count.
num_synthetic = max(0, target_total - len(df))
print(f"Real responses: {len(df)}; Synthetic responses to generate: {num_synthetic}")

# Accumulates one dict per synthetic respondent; filled by the generation loop.
synthetic_rows = []

# For Age (numeric): non-numeric answers are coerced to NaN and dropped.
age_series = pd.to_numeric(df['Age'], errors='coerce').dropna()
age_mean = age_series.mean() if not age_series.empty else 40
# The sample standard deviation is NaN with fewer than two observations, and a
# NaN spread would make every sampled age NaN (int(NaN) raises). Fall back to
# the default spread unless there are at least two valid ages.
age_std = age_series.std() if len(age_series) > 1 else 10

# For LastVisit (dates in MM/YYYY): unparseable answers become NaT and are dropped.
df['LastVisit_dt'] = pd.to_datetime(df['LastVisit'], format='%m/%Y', errors='coerce')
date_series = df['LastVisit_dt'].dropna()
min_date = date_series.min() if not date_series.empty else pd.to_datetime("01/2000", format='%m/%Y')
max_date = date_series.max() if not date_series.empty else pd.to_datetime("12/2020", format='%m/%Y')
# Width of the observed visit window in days; 0 when every parsed date is the same month.
date_range_days = (max_date - min_date).days

# For categorical columns, we calculate the empirical probability distributions:
def get_category_probs(col, frame=None):
    """Return the empirical probability of each category observed in a column.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    frame : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level ``df``
        is used, so existing ``get_category_probs(col)`` calls keep working.

    Returns
    -------
    pandas.Series or None
        Relative frequency of each non-missing value, indexed by the value.
        ``None`` when the column contains no non-missing values, so callers
        can tell "nothing to sample from" apart from a real distribution.
    """
    source = df if frame is None else frame
    vals = source[col].dropna()
    if vals.empty:
        return None
    return vals.value_counts(normalize=True)

gender_probs = get_category_probs('Gender')
visitfreq_probs = get_category_probs('VisitFrequency')
wifi_probs = get_category_probs('WiFiQuality')
specialOccasion_probs = get_category_probs('SpecialOccasionFlag')


def _weighted_sampler(probs):
    """Turn a probability Series into a ``(values, weights)`` pair; ``None`` passes through."""
    if probs is None:
        return None
    return probs.index, probs.values


def _observed_sampler(col):
    """Return ``(non-missing values of df[col], None)`` for uniform resampling, or ``None`` if there are none."""
    observed = df[col].dropna()
    if observed.empty:
        return None
    return observed.values, None


def _draw(sampler):
    """Draw one value from a ``(values, weights)`` pair; a missing sampler yields an empty string."""
    if sampler is None:
        return ''
    values, weights = sampler
    # weights=None makes np.random.choice sample uniformly over `values`.
    return np.random.choice(values, p=weights)


# Build every sampling pool once, outside the loop, instead of re-filtering the
# real data for each synthetic row.
gender_sampler = _weighted_sampler(gender_probs)

# Columns filled after LastVisit. List order matters: it fixes both the column
# order of the synthetic frame and the sequence of random draws under the seed.
# "Weighted" columns sample categories by their observed frequency; "observed"
# columns resample a random real answer verbatim.
post_visit_samplers = [
    ('SpecialOccasionFlag', _weighted_sampler(specialOccasion_probs)),
    ('SpecialOccasionType', _observed_sampler('SpecialOccasionType')),
    ('VisitPeriodYear', _observed_sampler('VisitPeriodYear')),
    ('VisitPeriodDay', _observed_sampler('VisitPeriodDay')),
    ('VisitedParks', _observed_sampler('VisitedParks')),
    ('OtherPark', _observed_sampler('OtherPark')),
    ('VisitFrequency', _weighted_sampler(visitfreq_probs)),
    ('PlanningConsiderations', _observed_sampler('PlanningConsiderations')),
    ('PlannedAttractions', _observed_sampler('PlannedAttractions')),
    ('OtherPlannedAttraction', _observed_sampler('OtherPlannedAttraction')),
    ('VisitedAttractions', _observed_sampler('VisitedAttractions')),
    ('OtherVisitedAttraction', _observed_sampler('OtherVisitedAttraction')),
    ('WiFiQuality', _weighted_sampler(wifi_probs)),
    ('PainPoints', _observed_sampler('PainPoints')),
    ('OtherPainPoints', _observed_sampler('OtherPainPoints')),
    ('SuggestedImprovements', _observed_sampler('SuggestedImprovements')),
]

#Synthetic Data Generation
for _ in range(num_synthetic):
    synthetic_row = {}

    # Age: normal draw around the observed mean, clipped to 10-100 and truncated to an int.
    synthetic_row['Age'] = int(np.clip(np.random.normal(age_mean, age_std), 10, 100))

    # Gender: sampled by observed frequency.
    synthetic_row['Gender'] = _draw(gender_sampler)

    # LastVisit: uniform random day inside the observed window, reported as MM/YYYY.
    if date_range_days > 0:
        rand_days = np.random.randint(0, date_range_days + 1)
        synthetic_row['LastVisit'] = (min_date + timedelta(days=rand_days)).strftime('%m/%Y')
    else:
        # A zero-width window means every parsed real visit shares one month,
        # so use that month rather than leaving the field blank.
        synthetic_row['LastVisit'] = min_date.strftime('%m/%Y')

    for col, sampler in post_visit_samplers:
        synthetic_row[col] = _draw(sampler)

    synthetic_rows.append(synthetic_row)

df_synthetic = pd.DataFrame(synthetic_rows)
# LastVisit_dt is only a parsing helper on the real data; drop it so both
# frames share the same columns before stacking real rows above synthetic ones.
df_combined = pd.concat([df.drop(columns=['LastVisit_dt']), df_synthetic], ignore_index=True)

# Save the combined dataset
#df_combined.to_csv('./data/Synthesised_Surveydata.csv', index=False)

Real responses: 132; Synthetic responses to generate: 68
