In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the survey responses and print a summary of the columns
surveyCsvPath = 'surveyResponses.csv'
data = pd.read_csv(surveyCsvPath)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376 entries, 0 to 375
Data columns (total 14 columns):
 #   Column                                                                                                   Non-Null Count  Dtype  
---  ------                                                                                                   --------------  -----  
 0   Timestamp                                                                                                376 non-null    object 
 1   University Program Faculty?                                                                              371 non-null    object 
 2   Which residence did you stay at?                                                                         373 non-null    object 
 3   What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10).   372 non-null    float64
 4   I see myself as extraverted and enthusiastic                                          

In [3]:
# Count how many responses are missing in each column
data.isnull().sum()

Timestamp                                                                                                  0
University Program Faculty?                                                                                5
Which residence did you stay at?                                                                           3
What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10).     4
I see myself as extraverted and enthusiastic                                                               2
I see myself as critical and quarrelsome.                                                                  2
I see myself as dependable and self-disciplined.                                                           4
I see myself as anxious and easily upset                                                                   2
I see myself as open to new experiences.                                                                   5
I see myself as res

In [4]:
# Some responses are incomplete and have to be removed from the data set, so first
# pull up every row that has at least one missing value
flt = data.isnull().any(axis='columns')
data.loc[flt]

Unnamed: 0,Timestamp,University Program Faculty?,Which residence did you stay at?,"What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10).",I see myself as extraverted and enthusiastic,I see myself as critical and quarrelsome.,I see myself as dependable and self-disciplined.,I see myself as anxious and easily upset,I see myself as open to new experiences.,I see myself as reserved and quiet.,I see myself as sympathetic and warm.,I see myself as disorganized and careless.,I see myself as calm and emotionally stable.,I see myself as conventional and uncreative.
25,12/02/2024 15:42:42,Engineering,im a rare offcampus first year who commuted in...,,3.0,3.0,6.0,6.0,4.0,5.0,6.0,5.0,5.0,4.0
46,12/02/2024 17:23:51,Engineering,UW Place (UWP),6.0,,,,,,,,,,
48,12/02/2024 17:40:07,,,,,,,,,,,,,
78,12/02/2024 21:15:35,Mathematics,UW Place (UWP),8.0,2.0,4.0,5.0,2.0,7.0,,5.0,4.0,4.0,2.0
98,13/02/2024 02:45:27,Mathematics,UW Place (UWP),7.0,4.0,5.0,6.0,2.0,6.0,5.0,7.0,,5.0,3.0
139,14/03/2024 13:33:41,Environment,Village 1,4.0,3.0,5.0,4.0,4.0,2.0,3.0,5.0,3.0,,5.0
144,14/03/2024 13:35:23,Science,,4.0,3.0,6.0,4.0,4.0,2.0,5.0,3.0,4.0,4.0,6.0
150,14/03/2024 13:36:47,Environment,Village 1,2.0,2.0,5.0,4.0,4.0,,3.0,6.0,3.0,4.0,4.0
165,14/03/2024 13:42:40,Mathematics,,4.0,2.0,5.0,4.0,4.0,3.0,6.0,3.0,4.0,4.0,6.0
207,14/03/2024 14:00:42,Mathematics,Ron Eydt Village (REV),,7.0,1.0,,3.0,4.0,6.0,2.0,6.0,5.0,4.0


In [5]:
# Discard every row listed above (any row with at least one missing value)
data = data.dropna(how='any')

In [6]:
# Double-check that no rows with missing values remain (the result should be empty)
flt = data.isnull().any(axis='columns')
data.loc[flt]

Unnamed: 0,Timestamp,University Program Faculty?,Which residence did you stay at?,"What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10).",I see myself as extraverted and enthusiastic,I see myself as critical and quarrelsome.,I see myself as dependable and self-disciplined.,I see myself as anxious and easily upset,I see myself as open to new experiences.,I see myself as reserved and quiet.,I see myself as sympathetic and warm.,I see myself as disorganized and careless.,I see myself as calm and emotionally stable.,I see myself as conventional and uncreative.


In [7]:
# Remove the Timestamp column
data = data.drop(columns=['Timestamp'])

In [8]:
# Swap the long survey questions for short column names that are easier to read.
# NOTE(review): the rating question key ends with a trailing space; presumably it
# matches the raw CSV header exactly, so it is left untouched.
shortColumnNames = {
    'University Program Faculty?': 'program',
    'Which residence did you stay at?': 'residence',
    'What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10). ': 'rating',
    'I see myself as extraverted and enthusiastic': 'extravertedScore',
    'I see myself as critical and quarrelsome.': 'criticalScore',
    'I see myself as dependable and self-disciplined.': 'dependableScore',
    'I see myself as anxious and easily upset': 'anxiousScore',
    'I see myself as open to new experiences.': 'openScore',
    'I see myself as reserved and quiet.': 'reservedScore',
    'I see myself as sympathetic and warm.': 'sympathyScore',
    'I see myself as disorganized and careless.': 'carelessScore',
    'I see myself as calm and emotionally stable.': 'calmScore',
    'I see myself as conventional and uncreative.': 'creativeScore',
}
data = data.rename(columns=shortColumnNames)

In [9]:
# Keep only the responses whose residence is in our predefined set of residences; anything
# else (for example a student who put "Off campus for the semester") is removed
residenceList = [
    'Village 1',
    'Ron Eydt Village (REV)',
    'Claudette Millar Hall (CMH)',
    'Mackenzie King Village (MKV)',
    'UW Place (UWP)',
    'Columbia Lake Village South (CLV-South)',
    'Columbia Lake Village North (CLV-North)',
    'Minota Hagey (MH)',
]
isListedResidence = data['residence'].isin(residenceList)
data = data.loc[isListedResidence].copy()

In [10]:
# Convert the categorical variable residence into indicator columns using nominal one-hot encoding.
# The dtype is pinned to uint8 so the indicators are always 0/1: newer pandas versions default to
# bool, which would otherwise be written to the CSV as True/False.
dfOneHot = pd.get_dummies(data, columns=["residence"], dtype='uint8')
# get_dummies replaces the residence column with the indicators, so the original label column
# is put back in front of the encoded frame.
data = pd.concat([data['residence'], dfOneHot], axis=1)
data.head()

Unnamed: 0,residence,program,rating,extravertedScore,criticalScore,dependableScore,anxiousScore,openScore,reservedScore,sympathyScore,carelessScore,calmScore,creativeScore,residence_Claudette Millar Hall (CMH),residence_Columbia Lake Village South (CLV-South),residence_Mackenzie King Village (MKV),residence_Ron Eydt Village (REV),residence_UW Place (UWP),residence_Village 1
0,UW Place (UWP),Engineering,7.0,6.0,3.0,6.0,4.0,5.0,3.0,3.0,2.0,4.0,2.0,0,0,0,0,1,0
1,Ron Eydt Village (REV),Engineering,10.0,7.0,7.0,6.0,5.0,5.0,1.0,7.0,2.0,4.0,2.0,0,0,0,1,0,0
2,Ron Eydt Village (REV),Engineering,10.0,5.0,5.0,6.0,3.0,7.0,4.0,7.0,2.0,7.0,5.0,0,0,0,1,0,0
3,Ron Eydt Village (REV),Engineering,6.0,4.0,3.0,5.0,2.0,4.0,4.0,6.0,6.0,5.0,5.0,0,0,0,1,0,0
4,Village 1,Engineering,6.0,6.0,4.0,2.0,5.0,7.0,4.0,6.0,7.0,5.0,2.0,0,0,0,0,0,1


In [11]:
# Now we need to aggregate the data for the personality test attribution. According to the
# personality test we selected, we must:
#   1. Recode the reverse-scored items (i.e., recode a 7 with a 1, a 6 with a 2, a 5 with a 3, etc.).
#      The reverse-scored items are survey items 2, 4, 6, 8 and 10.
#   2. Take the AVERAGE of the two items (the standard item and the recoded reverse-scored item)
#      that make up each scale.

# 1. Reverse the specified columns
reverseColumnList = ['criticalScore', 'anxiousScore', 'reservedScore', 'carelessScore', 'creativeScore']

def reverseColumns(x):
    """Recode reverse-scored responses: a 7 becomes 1, a 6 becomes 2, and so on.

    The subtraction is element-wise, so x may be a single value or a pandas
    Series/DataFrame of responses; the result has the same shape as x.
    """
    return 8 - x

data[reverseColumnList] = data[reverseColumnList].apply(reverseColumns)

# 2. Take the average of the matching columns to get one score for each corresponding personality trait.
# Keys are the new trait columns; values are the two item columns averaged into that trait.
personalityTraitSet = {
    'extraversion' : ['extravertedScore', 'reservedScore'],
    'agreeableness' : ['criticalScore', 'sympathyScore'],
    'conscientiousness': ['dependableScore', 'carelessScore'],
    'emotionalStability' : ['anxiousScore', 'calmScore'],
    'Openness' : ['openScore', 'creativeScore']
}

# Drive the averaging from the mapping so the trait definitions live in exactly one place.
for traitName, (firstItem, secondItem) in personalityTraitSet.items():
    data[traitName] = (data[firstItem] + data[secondItem]) / 2

# The individual item columns are no longer needed once the trait scores exist.
itemColumns = [column for pair in personalityTraitSet.values() for column in pair]
data = data.drop(columns=itemColumns)

In [14]:
# Number of remaining responses for each program
programCounts = data['program'].value_counts()
print(programCounts)

Engineering    94
Mathematics    66
Science        57
Arts           53
Health         41
Environment    29
AFM             4
Name: program, dtype: int64


In [12]:
# The dataset is ready, so write it out as a CSV to use for other models
data.to_csv(
    'parsedSurveyData.csv',
    sep=',',
    encoding='utf-8',
    index=False,
)
