# BFI Form Data Cleaner
For use with Big Five Inventory (BFI) responses collected with the Google Form at
https://docs.google.com/forms/d/1CTh1XxUhZqoBa-sCbEISlh8Jb3bUomD9_fmsCscCXJ4/edit and downloaded as a CSV file.

In [70]:
import pandas as pd
from numpy import array

In [103]:
# Load the raw Google Forms export; the output below shows one row per subject.
raw_data = pd.read_csv(filepath_or_buffer="data/Personality (BFI).csv")
raw_data

Unnamed: 0,Timestamp,Subject #:,Is talkative,Tends to find fault with others,Does a thorough job,"Is depressed, blue","Is original, comes up with new ideas",Is reserved,Is helpful and unselfish with others,Can be somewhat careless,...,Prefers work that is routine,"Is outgoing, sociable",Is sometimes rude to others,Makes plans and follows through with them,Gets nervous easily,"Likes to reflect, play with ideas",Has few artistic interests,Likes to cooperate with others,Is easily distracted,"Is sophisticated in art, music, or literature"
0,2020/04/04 3:21:39 PM MDT,pilot01,Neither agree nor disagree,Disagree a little,Agree Strongly,Disagree a little,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,...,Agree Strongly,Agree a little,Disagree Strongly,Agree a little,Agree a little,Agree Strongly,Disagree Strongly,Neither agree nor disagree,Agree a little,Agree Strongly
1,2020/04/04 3:24:31 PM MDT,pilot02,Disagree a little,Neither agree nor disagree,Agree a little,Disagree Strongly,Agree a little,Neither agree nor disagree,Agree a little,Agree a little,...,Disagree Strongly,Agree a little,Agree a little,Agree Strongly,Disagree a little,Neither agree nor disagree,Agree a little,Agree a little,Agree Strongly,Disagree a little


In [104]:
# Replace the text answer labels with their 1-5 numeric scores
likert_scores = {
    'Disagree Strongly': 1,
    'Disagree a little': 2,
    'Neither agree nor disagree': 3,
    'Agree a little': 4,
    'Agree Strongly': 5,
}
data = raw_data.replace(likert_scores, regex=True)

# Unanswered items are deliberately left as NaN rather than filled with 0:
# 0 is not a valid score on the 1-5 scale, a reverse-keyed item would turn it
# into 6 (6 - 0), and it would drag the trait averages down. pandas' row-wise
# mean() skips NaN by default, so a missing answer is simply left out of the
# average for that subject.
data

Unnamed: 0,Timestamp,Subject #:,Is talkative,Tends to find fault with others,Does a thorough job,"Is depressed, blue","Is original, comes up with new ideas",Is reserved,Is helpful and unselfish with others,Can be somewhat careless,...,Prefers work that is routine,"Is outgoing, sociable",Is sometimes rude to others,Makes plans and follows through with them,Gets nervous easily,"Likes to reflect, play with ideas",Has few artistic interests,Likes to cooperate with others,Is easily distracted,"Is sophisticated in art, music, or literature"
0,2020/04/04 3:21:39 PM MDT,pilot01,3,2,5,2,4,3,4,4,...,5,4,1,4,4,5,1,3,4,5
1,2020/04/04 3:24:31 PM MDT,pilot02,2,3,4,1,4,3,4,4,...,1,4,4,5,2,3,4,4,5,2


In [73]:
# Reverse scoring
#
# To score the BFI, all negatively-keyed items are reverse-scored first:
#
#   Extraversion:      6, 21, 31
#   Agreeableness:     2, 12, 27, 37
#   Conscientiousness: 8, 18, 23, 43
#   Neuroticism:       9, 24, 34
#   Openness:          35, 41
#
# Timestamp and subject occupy column positions 0 and 1, so item n sits at
# 0-based column position n + 1. The positions below are the item numbers
# above, each shifted by one.
#
# ASSUMES THE QUESTIONS ARE IN THE CORRECT ORDER.
# NOTE: running this cell a second time reverses the same columns again,
# which restores the un-reversed scores.

cols_to_rev = [3, 7, 9, 10, 13, 19, 22, 24, 25, 28, 32, 35, 36, 38, 42, 44]

# Look the labels up once; assigning to an existing column keeps the order
column_labels = list(data.columns)

for position in cols_to_rev:
    label = column_labels[position]
    # On a 1-5 scale, 6 - score swaps 1<->5 and 2<->4 and leaves 3 unchanged
    data[label] = data[label].apply(lambda score: 6 - score)

data

Unnamed: 0,Timestamp,Subject #:,Is talkative,Tends to find fault with others,Does a thorough job,"Is depressed, blue","Is original, comes up with new ideas",Is reserved,Is helpful and unselfish with others,Can be somewhat careless,...,Prefers work that is routine,"Is outgoing, sociable",Is sometimes rude to others,Makes plans and follows through with them,Gets nervous easily,"Likes to reflect, play with ideas",Has few artistic interests,Likes to cooperate with others,Is easily distracted,"Is sophisticated in art, music, or literature"
0,2020/04/04 3:21:39 PM MDT,pilot01,3,4,5,2,4,3,4,2,...,1,4,5,4,4,5,5,3,2,5
1,2020/04/04 3:24:31 PM MDT,pilot02,2,3,4,1,4,3,4,2,...,5,4,2,5,2,3,2,4,1,2


In [74]:
# Scratch cell: preview the first five columns while working out how to
# average selected groups of columns efficiently.
test = data.iloc[:, :5]
test

Unnamed: 0,Timestamp,Subject #:,Is talkative,Tends to find fault with others,Does a thorough job
0,2020/04/04 3:21:39 PM MDT,pilot01,3,4,5
1,2020/04/04 3:24:31 PM MDT,pilot02,2,3,4


In [45]:
# Prepare column positions for each B5 domain
#
# Scale scores are created by averaging the following items for each domain
# (R marks an item that is used in its reverse-scored form):
#
#   Extraversion:      1, 6R, 11, 16, 21R, 26, 31R, 36
#   Agreeableness:     2R, 7, 12R, 17, 22, 27R, 32, 37R, 42
#   Conscientiousness: 3, 8R, 13, 18R, 23R, 28, 33, 38, 43R
#   Neuroticism:       4, 9R, 14, 19, 24R, 29, 34R, 39
#   Openness:          5, 10, 15, 20, 25, 30, 35R, 40, 41R, 44
#
# Timestamp and subject occupy column positions 0 and 1, so item n sits at
# 0-based column position n + 1; every item number below is shifted by one.
#
# ASSUMES THE QUESTIONS ARE IN THE CORRECT ORDER.

extra_idx = [item + 1 for item in (1, 6, 11, 16, 21, 26, 31, 36)]
agree_idx = [item + 1 for item in (2, 7, 12, 17, 22, 27, 32, 37, 42)]
conscious_idx = [item + 1 for item in (3, 8, 13, 18, 23, 28, 33, 38, 43)]
neuro_idx = [item + 1 for item in (4, 9, 14, 19, 24, 29, 34, 39)]
open_idx = [item + 1 for item in (5, 10, 15, 20, 25, 30, 35, 40, 41, 44)]

# Sanity check: this should display exactly the columns that feed the
# Extraversion score (selected by position).
data.take(extra_idx, axis=1)

Unnamed: 0,Is talkative,Is reserved,Is full of energy,Generates a lot of enthusiasm,Tends to be quiet,Has an assertive personality,"Is sometimes shy, inhibited","Is outgoing, sociable"
0,3,3,4,5,2,2,2,4
1,2,3,3,2,2,4,2,4


In [97]:
# Select the Extraversion item columns once and reuse them for both checks
extraversion_items = data.iloc[:, extra_idx]

# Row-wise totals: should add up the rows shown above (for Extraversion)
print(extraversion_items.sum(axis=1))

print('\n')

# Row-wise AVERAGES of the same columns
print(extraversion_items.mean(axis=1))

0    25
1    22
dtype: int64


0    3.125
1    2.750
dtype: float64


In [98]:
# Create columns: the subject IDs (column position 1) plus, for every domain,
# a list holding the row-wise mean of that domain's item columns.
names = list(data.iloc[:, 1])
extra, agree, conscious, neuro, openn = (
    list(data.iloc[:, positions].mean(axis=1))
    for positions in (extra_idx, agree_idx, conscious_idx, neuro_idx, open_idx)
)

In [99]:
# Compile the per-domain columns into a new dataframe.
# NOTE(review): the openness column is labelled 'Open' while the other
# domains use their full names; kept as-is because the saved CSV header
# comes from these keys.
d = dict(
    Subject=names,
    Extraversion=extra,
    Agreeableness=agree,
    Conscientiousness=conscious,
    Neuroticism=neuro,
    Open=openn,
)

df = pd.DataFrame(d)
df

Unnamed: 0,Subject,Extraversion,Agreeableness,Conscientiousness,Neuroticism,Open
0,pilot01,3.125,4.0,3.0,3.5,4.2
1,pilot02,2.75,4.0,3.0,2.125,3.5


In [102]:
# Save as csv
from pathlib import Path

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (this is a no-op when it is already there).
output_path = Path('cleaned_data') / 'cleaned_BFI.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the dataframe's row index out of the saved file
df.to_csv(output_path, index=False)