In [15]:
import pandas as pd
import numpy as np

# Translation table from the verbose survey question text (used verbatim as the
# CSV header) to a short, code-friendly column name.
rename_mapping = {
    # Submission metadata (removed again further down)
    'Timestamp': 'Timestamp',
    'I consent and am aware that my responses in this survey will be collected for data analysis': 'Consent',

    # Respondent profile
    'What is your age?': 'Age',
    'What is your gender': 'Gender',

    # Visit timing and occasion
    'When was your last theme park visit? (MM/YYYY)': 'LastVisit',
    'Were there any special occasions that prompted the visit? (etc Christmas, school holidays, weekends)': 'SpecialOccasionFlag',
    'If you answered yes for above, which special occasion was it?': 'SpecialOccasionType',
    # The key below ends with a trailing space; it must stay exactly as written
    # for the lookup to hit (presumably the CSV header carries it too).
    'Which period of the year do you normally visit the theme park? ': 'VisitPeriodYear',
    'Which period of the day do you normally visit the theme park? (Opening time, early afternoon, late afternoon, evening)': 'VisitPeriodDay',

    # Parks and visiting habits
    'Which theme parks have you visited?': 'VisitedParks',
    'If you answered "Others", please state the theme park': 'OtherPark',
    'How frequently do you visit theme parks?': 'VisitFrequency',
    'What are your considerations when planning your theme park visits?': 'PlanningConsiderations',

    # Planned versus actually visited attractions
    'Which attractions or rides do you plan to visit when you go visit a theme park? Pick up to 2-3': 'PlannedAttractions',
    'If you stated others, please state the attraction': 'OtherPlannedAttraction',
    'Which attractions or rides do you end up visiting? Pick up to 2-3': 'VisitedAttractions',
    # The '.1' suffix is presumably added by pandas when the same header text
    # appears a second time in the CSV -- TODO confirm against the raw file.
    'If you stated others, please state the attraction.1': 'OtherVisitedAttraction',

    # Experience and feedback
    'How was the WiFi quality in the theme park?': 'WiFiQuality',
    'What are some pain points you have experienced?': 'PainPoints',
    'If you answered "Others", please state the pain points': 'OtherPainPoints',
    'What do you hope can be implemented to overcome said challenges?': 'SuggestedImprovements'
}

# Read the survey export, switch to the short column names, and discard the
# two metadata columns -- one chained expression, no in-place mutation.
df = (
    pd.read_csv('./data/SurveyData.csv')
    .rename(columns=rename_mapping)
    .drop(columns=['Timestamp', 'Consent'])
)

print("Renamed columns:")
print(df.columns.tolist())

Renamed columns:
['Age', 'Gender', 'LastVisit', 'SpecialOccasionFlag', 'SpecialOccasionType', 'VisitPeriodYear', 'VisitPeriodDay', 'VisitedParks', 'OtherPark', 'VisitFrequency', 'PlanningConsiderations', 'PlannedAttractions', 'OtherPlannedAttraction', 'VisitedAttractions', 'OtherVisitedAttraction', 'WiFiQuality', 'PainPoints', 'OtherPainPoints', 'SuggestedImprovements']


In [20]:
# --- Synthetic Data Generation ---
# Pad the real survey responses up to `target_total` rows and write the
# combined table to disk.
#
# NOTE: every column is resampled independently (with replacement) from its own
# non-null values. A synthetic row therefore mixes answers drawn from different
# respondents, and relationships between columns are not preserved.
target_total = 200
RANDOM_SEED = 42  # fixed seed so re-running the cell writes the same CSV

# Clamp at zero: once the real data reaches the target there is nothing to
# add, and a negative count would make `Series.sample` raise.
num_synthetic = max(0, target_total - len(df))
print(f"\nReal responses: {len(df)}; Synthetic responses to generate: {num_synthetic}")

cols = df.columns.tolist()

# One generator shared by all columns: the draws are reproducible, yet each
# column advances the generator instead of replaying an identical sequence.
rng = np.random.RandomState(RANDOM_SEED)

synthetic_data = {}
for col in cols:
    observed = df[col].dropna()
    if observed.empty:
        # Nothing to sample from, so the synthetic column stays entirely missing.
        synthetic_data[col] = [np.nan] * num_synthetic
    else:
        # Reset the index so the sampled values line up positionally (0..n-1)
        # across columns when the frame is assembled.
        synthetic_data[col] = (
            observed
            .sample(n=num_synthetic, replace=True, random_state=rng)
            .reset_index(drop=True)
        )

df_synthetic = pd.DataFrame(synthetic_data)

# Real responses first, synthetic rows appended after them.
df_combined = pd.concat([df, df_synthetic], ignore_index=True)
df_combined.to_csv('./data/Synthesised_Surveydata.csv', index=False)


Real responses: 132; Synthetic responses to generate: 68
