In [None]:
# Set up environment

from IPython.core.interactiveshell import InteractiveShell
# "all" makes a cell display the value of every bare expression statement,
# not only the last one; the `a.shape; b.shape; ...` check cells in this
# notebook rely on that to show several shapes from a single cell.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the four pre-training survey exports (student/staff x in-person/virtual).
# The files are read in the order df1..df4, relative to the working directory.

df1, df2, df3, df4 = (
    pd.read_csv(csv_name)
    for csv_name in (
        'student-inperson-pretrain.csv',
        'staff-inperson-pretrain.csv',
        'student-virtual-pretrain.csv',
        'staff-virtual-pretrain.csv',
    )
)

In [None]:
# Check to make sure data was loaded correctly: show the (rows, columns)
# shape of each of the four frames.

df1.shape; df2.shape; df3.shape; df4.shape

# Total number of rows across the four files, computed from the loaded data
# rather than typed in by hand, so it stays correct if a file changes.
# (The hand-typed figure for the original files was 386+77+148+158 = 769.)
# The concatenated dataset built later should have this many rows.
sum(len(frame) for frame in (df1, df2, df3, df4))

In [None]:
# Tag every row of the in-person datasets with a 'Status' code taken from the
# dataset (folder) it came from: 1 for the student file, 2 for the staff file.
# The new column is placed at position 7 in both frames.
# NOTE: DataFrame.insert mutates in place and raises if 'Status' already
# exists, so this cell cannot be re-run without reloading the data.

for frame, status_code in ((df1, 1), (df2, 2)):
    frame.insert(7, 'Status', status_code)


In [None]:
# Drop columns in staff-virtual that aren't in another dataset (i.e. Q46 and Q1).
# The columns are identified by their position in df4 as loaded.

cols = df4.columns.values
drop_col_lst = [7,8,9,10,11,12]

# One drop call for all six labels instead of one drop (and one full copy of
# the frame) per column.
# NOTE: the selection is positional, so this cell is not safe to re-run:
# a second run would remove whichever six columns now sit at positions 7-12.
df4 = df4.drop(columns=cols[drop_col_lst])


In [None]:
# Shape of df4 after the six staff-virtual-only columns were dropped.
df4.shape

In [None]:
# Tag every row of the virtual datasets with a 'Status' code taken from the
# dataset (folder) it came from: 1 for the student file, 2 for the staff file.
# As with the in-person frames, the column goes at position 7.
# NOTE: insert mutates in place and raises if 'Status' already exists.

status_by_frame = [(df3, 1), (df4, 2)]
for frame, status_code in status_by_frame:
    frame.insert(7, 'Status', status_code)

In [None]:
# Shapes of the two virtual datasets after 'Status' was inserted.
df3.shape;df4.shape

In [None]:
# Work out which variables the virtual datasets still lack. According to the
# original note, the in-person datasets contain every variable because they
# include both the pre-modified and the modified survey types.
#
# np.setdiff1d returns the sorted, unique labels that are in df1 but not df3.
# Only df3 is compared here; this assumes df4 has the same columns as df3
# at this point -- TODO confirm.

all_cols = df1.columns.values
virtual_cols = df3.columns.values
need_to_add = np.setdiff1d(all_cols, virtual_cols)
print(need_to_add)

In [None]:
# Locations in df1 of the variables/columns missing from the virtual data,
# i.e. where they should be added so the layout matches df1 & df2.
# Iterates over need_to_add itself instead of a fixed count of 12, so it
# works for however many columns are actually missing.
for col in need_to_add:
    print(df1.columns.get_loc(col))

In [None]:
# Add the missing variables/columns to df3 and df4 as all-NaN columns.
#
# The target positions are derived from df1 instead of being typed in by
# hand, and each column name is looked up from its position, so a name can
# never be paired with the wrong index. Inserting in ascending position
# order places each column at the same index it has in df1, provided the
# columns df3/df4 already share with df1 are in df1's order -- TODO confirm.
# NOTE: insert mutates in place and raises if the column already exists, so
# this cell cannot be re-run without rebuilding df3/df4.

index_to_add = sorted(df1.columns.get_loc(col) for col in need_to_add)
for loc in index_to_add:
    col = df1.columns[loc]
    df3.insert(loc, col, np.nan)
    df4.insert(loc, col, np.nan)


In [None]:
# Check that the dimensions make sense: after the inserts, all four frames
# are expected to report the same number of columns before concatenation.
df1.shape; df2.shape; df3.shape; df4.shape

In [None]:
# With the four datasets now sharing the same column dimensions, stack them
# vertically into a single pre-training dataset. ignore_index=True discards
# the per-file row labels and renumbers the combined rows.

frames = [df1, df2, df3, df4]
pretraining = pd.concat(frames, ignore_index=True)
print(pretraining.shape)

In [None]:
# Move the identifying columns to the front and re-label the "role @ CU"
# items. Each step pops a column out of the frame (returning it and removing
# it in place) and re-inserts it at its new position, optionally renamed.

# Columns 0-1: identifiers, names unchanged.
extref = pretraining.pop('ExternalReference')
pretraining.insert(0, 'ExternalReference', extref)

status = pretraining.pop('Status')
pretraining.insert(1, 'Status', status)

# Columns 2-4: role items Q2_1 / Q2_2 / Q2_3 become Student / Staff / Faculty.
Q2_1 = pretraining.pop('Q2_1')
pretraining.insert(2, 'Student', Q2_1)

Q2_2 = pretraining.pop('Q2_2')
pretraining.insert(3, 'Staff', Q2_2)

Q2_3 = pretraining.pop('Q2_3')
pretraining.insert(4, 'Faculty', Q2_3)

print(pretraining.columns.values)


In [None]:
# Shape of the combined dataset after the reordering/renaming; moving and
# renaming columns should leave both counts unchanged.
pretraining.shape

In [None]:
# Write the combined dataset to CSV so it can be cleaned in a later step.
# index=False leaves the row index out of the file.
export_path = 'pre-training.csv'
pretraining.to_csv(path_or_buf=export_path, index=False)

In [None]:
# Look up the ExternalReference of the 16 year old participant, who violates
# the study's inclusion criteria and will need to be dropped. This cell only
# displays the ID; it does not remove the row.
# Q11 is presumably the age item -- verify against the survey codebook.

is_sixteen = pretraining['Q11'] == 16
pretraining.loc[is_sixteen, 'ExternalReference']