In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('surveyResponses.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 376 entries, 0 to 375
Data columns (total 14 columns):
 #   Column                                                                                                   Non-Null Count  Dtype  
---  ------                                                                                                   --------------  -----  
 0   Timestamp                                                                                                376 non-null    object 
 1   University Program Faculty?                                                                              371 non-null    object 
 2   Which residence did you stay at?                                                                         373 non-null    object 
 3   What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10).   372 non-null    float64
 4   I see myself as extraverted and enthusiastic                                          

In [3]:
#get num empty results in columns
data.isna().sum()

Timestamp                                                                                                  0
University Program Faculty?                                                                                5
Which residence did you stay at?                                                                           3
What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10).     4
I see myself as extraverted and enthusiastic                                                               2
I see myself as critical and quarrelsome.                                                                  2
I see myself as dependable and self-disciplined.                                                           4
I see myself as anxious and easily upset                                                                   2
I see myself as open to new experiences.                                                                   5
I see myself as res

In [4]:
#With some empty results we need to remove them from our data set so we need to retrieve the rows that are empty
flt = data.isna().any(axis=1)
data[flt]

Unnamed: 0,Timestamp,University Program Faculty?,Which residence did you stay at?,"What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10).",I see myself as extraverted and enthusiastic,I see myself as critical and quarrelsome.,I see myself as dependable and self-disciplined.,I see myself as anxious and easily upset,I see myself as open to new experiences.,I see myself as reserved and quiet.,I see myself as sympathetic and warm.,I see myself as disorganized and careless.,I see myself as calm and emotionally stable.,I see myself as conventional and uncreative.
25,12/02/2024 15:42:42,Engineering,im a rare offcampus first year who commuted in...,,3.0,3.0,6.0,6.0,4.0,5.0,6.0,5.0,5.0,4.0
46,12/02/2024 17:23:51,Engineering,UW Place (UWP),6.0,,,,,,,,,,
48,12/02/2024 17:40:07,,,,,,,,,,,,,
78,12/02/2024 21:15:35,Mathematics,UW Place (UWP),8.0,2.0,4.0,5.0,2.0,7.0,,5.0,4.0,4.0,2.0
98,13/02/2024 02:45:27,Mathematics,UW Place (UWP),7.0,4.0,5.0,6.0,2.0,6.0,5.0,7.0,,5.0,3.0
139,14/03/2024 13:33:41,Environment,Village 1,4.0,3.0,5.0,4.0,4.0,2.0,3.0,5.0,3.0,,5.0
144,14/03/2024 13:35:23,Science,,4.0,3.0,6.0,4.0,4.0,2.0,5.0,3.0,4.0,4.0,6.0
150,14/03/2024 13:36:47,Environment,Village 1,2.0,2.0,5.0,4.0,4.0,,3.0,6.0,3.0,4.0,4.0
165,14/03/2024 13:42:40,Mathematics,,4.0,2.0,5.0,4.0,4.0,3.0,6.0,3.0,4.0,4.0,6.0
207,14/03/2024 14:00:42,Mathematics,Ron Eydt Village (REV),,7.0,1.0,,3.0,4.0,6.0,2.0,6.0,5.0,4.0


In [5]:
#Drop the above rows
data.dropna(how='any', inplace=True)

In [6]:
#just double check that there are now no emptty rows
flt = data.isna().any(axis=1)
data[flt]

Unnamed: 0,Timestamp,University Program Faculty?,Which residence did you stay at?,"What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10).",I see myself as extraverted and enthusiastic,I see myself as critical and quarrelsome.,I see myself as dependable and self-disciplined.,I see myself as anxious and easily upset,I see myself as open to new experiences.,I see myself as reserved and quiet.,I see myself as sympathetic and warm.,I see myself as disorganized and careless.,I see myself as calm and emotionally stable.,I see myself as conventional and uncreative.


In [7]:
#drop timeStamp column
data = data.drop('Timestamp', axis=1)

In [8]:
#rename our columns to be easier to read
data = data.rename(columns=
               {'University Program Faculty?': 'program', 
                'Which residence did you stay at?': 'residence',
                'What rating would you give the residence you lived at, based on your experience? (Worst: 1, Best: 10). ': 'rating',
                'I see myself as extraverted and enthusiastic': 'extravertedScore',
                'I see myself as critical and quarrelsome.': 'criticalScore',
                'I see myself as dependable and self-disciplined.': 'dependableScore',
                'I see myself as anxious and easily upset': 'anxiousScore',
                'I see myself as open to new experiences.': 'openScore',
                'I see myself as reserved and quiet.': 'reservedScore',
                'I see myself as sympathetic and warm.': 'sympathyScore',
                'I see myself as disorganized and careless.': 'carelessScore',
                'I see myself as calm and emotionally stable.': 'calmScore',
                'I see myself as conventional and uncreative.': 'creativeScore'
               })

In [9]:
#we want to remove all values that are not from our predefined set of residences, for example if a student put "Off campus for the semester"
residenceList = ['Village 1', 'Ron Eydt Village (REV)', 'Claudette Millar Hall (CMH)', 'Mackenzie King Village (MKV)',
                 'UW Place (UWP)', 'Columbia Lake Village South (CLV-South)', 'Columbia Lake Village North (CLV-North)', 'Minota Hagey (MH)']
indexNames = data[~data['residence'].isin(residenceList)].index
data = data.drop(indexNames)

In [10]:
#convert categorical variable residence into values using nominal one hot encoding
dfOneHot = pd.get_dummies(data, columns=["residence"])
data = pd.concat([data['residence'], dfOneHot], axis=1)
data.head()

Unnamed: 0,residence,program,rating,extravertedScore,criticalScore,dependableScore,anxiousScore,openScore,reservedScore,sympathyScore,carelessScore,calmScore,creativeScore,residence_Claudette Millar Hall (CMH),residence_Columbia Lake Village South (CLV-South),residence_Mackenzie King Village (MKV),residence_Ron Eydt Village (REV),residence_UW Place (UWP),residence_Village 1
0,UW Place (UWP),Engineering,7.0,6.0,3.0,6.0,4.0,5.0,3.0,3.0,2.0,4.0,2.0,0,0,0,0,1,0
1,Ron Eydt Village (REV),Engineering,10.0,7.0,7.0,6.0,5.0,5.0,1.0,7.0,2.0,4.0,2.0,0,0,0,1,0,0
2,Ron Eydt Village (REV),Engineering,10.0,5.0,5.0,6.0,3.0,7.0,4.0,7.0,2.0,7.0,5.0,0,0,0,1,0,0
3,Ron Eydt Village (REV),Engineering,6.0,4.0,3.0,5.0,2.0,4.0,4.0,6.0,6.0,5.0,5.0,0,0,0,1,0,0
4,Village 1,Engineering,6.0,6.0,4.0,2.0,5.0,7.0,4.0,6.0,7.0,5.0,2.0,0,0,0,0,0,1


In [11]:
#convert categorical variable programs into values using nominal one hot encoding
dfOneHot = pd.get_dummies(data, columns=["program"])
data = pd.concat([data['program'], dfOneHot], axis=1)
data.head()

Unnamed: 0,program,residence,rating,extravertedScore,criticalScore,dependableScore,anxiousScore,openScore,reservedScore,sympathyScore,...,residence_Ron Eydt Village (REV),residence_UW Place (UWP),residence_Village 1,program_AFM,program_Arts,program_Engineering,program_Environment,program_Health,program_Mathematics,program_Science
0,Engineering,UW Place (UWP),7.0,6.0,3.0,6.0,4.0,5.0,3.0,3.0,...,0,1,0,0,0,1,0,0,0,0
1,Engineering,Ron Eydt Village (REV),10.0,7.0,7.0,6.0,5.0,5.0,1.0,7.0,...,1,0,0,0,0,1,0,0,0,0
2,Engineering,Ron Eydt Village (REV),10.0,5.0,5.0,6.0,3.0,7.0,4.0,7.0,...,1,0,0,0,0,1,0,0,0,0
3,Engineering,Ron Eydt Village (REV),6.0,4.0,3.0,5.0,2.0,4.0,4.0,6.0,...,1,0,0,0,0,1,0,0,0,0
4,Engineering,Village 1,6.0,6.0,4.0,2.0,5.0,7.0,4.0,6.0,...,0,0,1,0,0,1,0,0,0,0


In [12]:
#Now we need to aggregate the data to process for the personality test attribution. According to the personality test we selected
#We must 1. Recode the reverse-scored items (i.e., recode a 7 with a 1, a 6 with a 2, a 5 with a 3, etc.). The reverse scored columns are 2, 4, 6, 8, & 10.
#2. Take the AVERAGE of the two items (the standard item and the recoded reverse-scored item) that make up each scale.

# 1. Reverse the specified columns
reverseColumnList = ['criticalScore', 'anxiousScore', 'reservedScore', 'carelessScore', 'creativeScore']

def reverseColumns(x):
    
    #converts a 7 into 1, 6 into 2, etc
    return 8 - x

reversedColumns = data[reverseColumnList].apply(reverseColumns)
data[reverseColumnList] = reversedColumns

#2. Take average for matching columns to get our five scores for each coresponding personality

personalityTraitSet = {
    'extraversion' : ['extravertedScore', 'reservedScore'],
    'agreeableness' : ['criticalScore', 'sympathyScore'],
    'conscientiousness': ['dependableScore', 'carelessScore'],
    'emotionalStability' : ['anxiousScore', 'calmScore'],
    'Openness' : ['openScore', 'creativeScore']
}



data['extraversion'] = (data['extravertedScore'] + data['reservedScore']) / 2
data['agreeableness'] = (data['criticalScore'] + data['sympathyScore']) / 2
data['conscientiousness'] = (data['dependableScore'] + data['carelessScore']) / 2
data['emotionalStability'] = (data['anxiousScore'] + data['calmScore']) / 2
data['Openness'] = (data['openScore'] + data['creativeScore']) / 2
data.drop(columns=['extravertedScore', 'reservedScore', 'criticalScore', 'sympathyScore', 'dependableScore', 'carelessScore', 'anxiousScore', 'calmScore', 'openScore', 'creativeScore'], inplace=True)

In [13]:
Otherdata = data[data['residence_Ron Eydt Village (REV)'] != 1]
REVdata = data[data['residence_Ron Eydt Village (REV)'] == 1]


higherScore = REVdata[REVdata['rating'] > 4]
lowerScore = REVdata[REVdata['rating'] <= 4]


lowerScore['extraversion'] = 8 - lowerScore['extraversion']
lowerScore['agreeableness'] = 8 - lowerScore['agreeableness']
lowerScore['conscientiousness'] = 8 - lowerScore['conscientiousness']
lowerScore['emotionalStability'] = 8 - lowerScore['emotionalStability']
lowerScore['Openness'] = 8 - lowerScore['Openness']






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lowerScore['extraversion'] = 8 - lowerScore['extraversion']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lowerScore['agreeableness'] = 8 - lowerScore['agreeableness']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lowerScore['conscientiousness'] = 8 - lowerScore['conscientiousness']
A value is tr

In [14]:
REVdata = higherScore.append(lowerScore, ignore_index=True)


  REVdata = higherScore.append(lowerScore, ignore_index=True)


In [15]:
data = Otherdata.append(REVdata, ignore_index=True)

  data = Otherdata.append(REVdata, ignore_index=True)


In [26]:
data.iloc[:,[0,1,2,16,17,18,19,20]]

Unnamed: 0,program,residence,rating,extraversion,agreeableness,conscientiousness,emotionalStability,Openness
0,Engineering,UW Place (UWP),7.0,5.5,4.0,6.0,4.0,5.5
1,Engineering,Village 1,6.0,5.0,5.0,1.5,4.0,6.5
2,Mathematics,Village 1,9.0,5.0,5.5,6.0,6.0,4.0
3,Mathematics,UW Place (UWP),7.0,3.5,6.5,6.5,6.5,5.5
4,Engineering,Village 1,10.0,3.5,6.0,6.5,6.0,6.0
...,...,...,...,...,...,...,...,...
339,Engineering,Ron Eydt Village (REV),3.0,2.0,1.5,5.0,4.0,2.0
340,Mathematics,Ron Eydt Village (REV),3.0,1.5,1.5,6.0,4.0,2.0
341,Health,Ron Eydt Village (REV),4.0,2.0,2.0,5.5,4.0,2.0
342,Mathematics,Ron Eydt Village (REV),4.0,2.0,2.0,6.5,4.0,2.5


In [17]:
data.to_csv('ParsedSurveyData.csv', sep=',', index=False, encoding='utf-8')

## Augment Data 

In [18]:
residences = ['residence_Claudette Millar Hall (CMH)', 'residence_Columbia Lake Village South (CLV-South)',
             'residence_Mackenzie King Village (MKV)', 'residence_Ron Eydt Village (REV)', 
             'residence_UW Place (UWP)', 'residence_Village 1']

programs = ['program_AFM', 'program_Arts', 'program_Engineering', 'program_Environment', 
           'program_Health', 'program_Mathematics', 'program_Science']

##create a neural network model for every residence

copyDf = data.copy()
copyDf.drop(copyDf.index, inplace=True)

def augmentData(x_data, y_data):
    


    def add_noise(data, noise_level=0.15):
        noise = np.random.normal(0, noise_level, data.shape)
        return data + noise

    # Function to scale the data
    def scale_data(data, scale_factor=0.15):
        scale = np.random.uniform(1 - scale_factor, 1 + scale_factor, data.shape)
        return data * scale

    # Number of Augmentations we want to do (3 is 3x per sample)
    num_augmentations = 4  

    # Augment the data multiple times
    augmented_X_data = []
    augmented_y_data = []
    for _ in range(num_augmentations):
        augmented_X_data = np.concatenate([add_noise(x_data), scale_data(x_data)])
        augmented_y_data = np.concatenate([add_noise(y_data), scale_data(y_data)])
            
    return augmented_X_data, augmented_y_data

for residence in residences:
    residence_data = data[data[residence] == 1]
    for program in programs:
        program_data = residence_data[residence_data[program] == 1]
        X = program_data[program_data.columns[16:]]    
        y = program_data['rating']
        
        augmented_X_data, augmented_y_data = augmentData(X, y)
        
        for i in range(0, len(augmented_X_data)):
            
            all_columns = copyDf.columns
            new_row = {col: 0 for col in all_columns}
            
            new_row.update({'rating': augmented_y_data[i], 
                       residence: 1,
                       program : 1,
                       'extraversion': augmented_X_data[i][0],
                       'agreeableness': augmented_X_data[i][1],
                       'conscientiousness': augmented_X_data[i][2],
                       'emotionalStability': augmented_X_data[i][3],
                       'Openness': augmented_X_data[i][4]
                      })
            copyDf = copyDf.append(new_row, ignore_index=True)
            
print(copyDf)
  
        

  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index

  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)
  copyDf = copyDf.append(new_row, ignore_index=True)


     program  residence     rating  residence_Claudette Millar Hall (CMH)  \
0        0.0        0.0   4.960284                                    1.0   
1        0.0        0.0   1.102062                                    1.0   
2        0.0        0.0   2.044173                                    1.0   
3        0.0        0.0   1.975293                                    1.0   
4        0.0        0.0   2.961116                                    1.0   
..       ...        ...        ...                                    ...   
683      0.0        0.0   3.339991                                    0.0   
684      0.0        0.0   7.873126                                    0.0   
685      0.0        0.0   2.737839                                    0.0   
686      0.0        0.0   3.136056                                    0.0   
687      0.0        0.0  10.227518                                    0.0   

     residence_Columbia Lake Village South (CLV-South)  \
0                

In [19]:
copyDf.to_csv('AugmentedSurveyData.csv', sep=',', index=False, encoding='utf-8')