# In [1]:
import pandas as pd
import numpy as np

# Survey-cleaning script: keeps the survey questions related to the selected
# topic, gives them short variable names, normalises free-text answers and
# writes the result to "CleanedResponses.csv".

# (positional column index in the raw survey export, short variable name).
# Keeping the index and the name side by side prevents the two lists from
# drifting out of step when a question is added or removed.
SELECTED_COLUMNS = [
    (1, "className"),        # What class are you filling out this survey for? Select all that apply.
    (2, "classStanding"),    # What is your current class standing?
    (3, "isTransfer"),       # Are you a transfer student?
    (4, "major"),            # What is your major/minor? (If your major is not listed, please write it in "Other..")
    (7, "gpa"),              # What is your current cumulative GPA?
    (8, "age"),              # What is your age?
    (11, "gender"),          # What gender do you identify as?
    (17, "exerciseHrs"),     # How many hours of strenuous exercise do you do per week on average?
    (18, "gymVisits"),       # How many times do you go to the gym per week on average?
    (19, "mentalHealth"),    # Rate your overall mental health. (1-5, 5 being good)
    (22, "timeToSleep"),     # How long does it take for you to fall asleep?
    (25, "hrsSleep"),        # How many hours of sleep do you get on average during college?
    (29, "energyLevels"),    # On average, how would you describe your energy levels throughout the day? (1-5, 5 being high)
    (31, "caffeineFreq"),    # How often do you consume caffeinated beverages?
    (32, "caffeineType"),    # What types of caffeinated drink do you consume? Choose all that apply.
    (33, "caffeineIntake"),  # What is your daily caffeinated drink intake? (For reference, a "small" Red Bull is 8.4oz and "medium" Starbucks Grande is 16oz)
    (53, "units"),           # How many units are you registered for this quarter?
]

# Answers offered by the survey itself; anything else was typed in by hand.
KNOWN_MAJORS = [
    "Data Science major",
    "CS major",
    "CSBA major",
    "CEN or EE major",
    "Mathematics major",
]
KNOWN_CAFFEINE_FREQS = [
    "Once per day",
    "Once per week",
    "Multiple times per week",
    "I don't consume them at all",
    "Once per month",
]
# Substrings that mark a recognised caffeineType answer. "None" is the
# placeholder written in for respondents who left the question blank.
KNOWN_CAFFEINE_TYPES = ["Tea", "Coffee", "Energy drink", "None"]


def _bucket_units(value):
    """Map one raw "units" answer to "<12", "12-15" or ">15".

    Whole numbers below 12 give "<12" and above 15 give ">15". Everything
    else -- 12 through 15, blank answers, and garbage such as free text,
    fractions or negative numbers -- gives "12-15".

    Accepts both strings and numbers, because pandas yields numbers when the
    column parsed as numeric and a float NaN for a blank answer.
    """
    if isinstance(value, str):
        text = value
    else:
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return "12-15"
        # NaN, infinities and fractional values all fail is_integer().
        if not number.is_integer():
            return "12-15"
        text = str(int(number))

    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscript digits, on which int() raises ValueError.
    if not text.isdecimal():
        return "12-15"

    count = int(text)
    if count < 12:
        return "<12"
    if count > 15:
        return ">15"
    return "12-15"


def clean_responses(raw):
    """Return a cleaned DataFrame built from the raw survey export.

    Args:
        raw: DataFrame holding the raw survey export. Columns are selected
            by position, so it needs at least 54 columns; fewer makes the
            positional selection raise IndexError.

    Returns:
        A new DataFrame with the columns named in SELECTED_COLUMNS. ``raw``
        is not modified.
    """
    positions = [position for position, _ in SELECTED_COLUMNS]
    # Selecting questions related to our selected topic. The explicit copy
    # detaches the selection from `raw`, so the column assignments below do
    # not trigger pandas' SettingWithCopyWarning.
    cleaned = raw.iloc[:, positions].copy()

    # Renaming columns/variables
    cleaned.columns = [name for _, name in SELECTED_COLUMNS]

    # Replacing isTransfer Yes and No with 1 and 0 (any other answer -> NaN)
    cleaned["isTransfer"] = cleaned["isTransfer"].map({"Yes": 1, "No": 0})

    # Replacing instances of custom input (and blanks) with "Other"
    major = cleaned["major"]
    cleaned["major"] = major.where(major.isin(KNOWN_MAJORS), "Other")

    # Fixing typo in exerciseHrs
    cleaned["exerciseHrs"] = cleaned["exerciseHrs"].replace(
        {"0 -1 hours": "0 - 1 hours"}
    )

    # Replacing instances of custom input with NaN
    frequency = cleaned["caffeineFreq"]
    cleaned["caffeineFreq"] = frequency.where(
        frequency.isin(KNOWN_CAFFEINE_FREQS), np.nan
    )

    # Replacing NaN with None, and custom input with "Other"
    drink_types = cleaned["caffeineType"].fillna("None")
    cleaned["caffeineType"] = drink_types.apply(
        lambda answer: answer
        if any(kind in answer for kind in KNOWN_CAFFEINE_TYPES)
        else "Other"
    )

    # Replacing garbage values and replacing numbers with ranges
    cleaned["units"] = [_bucket_units(answer) for answer in cleaned["units"]]

    return cleaned


if __name__ == "__main__":
    # Reading in data, cleaning it and saving the result. The guard keeps the
    # file I/O out of the way when the cleaning functions are imported;
    # `responses` is still bound at top level when run as a script or cell.
    responses = clean_responses(pd.read_csv("SurveyResponses.csv"))
    responses.to_csv("CleanedResponses.csv")