# 0.0 Data preparation

# Preparation

Imports libraries

Loads data

In [None]:
%matplotlib inline

print("0.0 data preparation")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.transforms as tr
import seaborn as sns
import re
import json
import datetime
import random
import os

from random import randint
from ipywidgets import FloatProgress,IntProgress,IntText,Text,interact,interactive,IntSlider,FloatSlider
from IPython.display import display
from itertools import chain
from scipy.stats import ttest_ind

In [None]:
# Display the version of pandas this notebook is running with.
pd.__version__

In [None]:
# Display the version of numpy this notebook is running with.
np.__version__

### common variables

In [None]:
def tryCreateFolder(path, displayMessage = False):
    """Create the directory `path` if needed; never propagates OSError.

    Missing parent directories are created as well. A directory that already
    exists is left untouched and is not reported as a failure.

    Parameters
    ----------
    path : str
        Directory to create.
    displayMessage : bool, optional
        When True, print what happened (already exists, created, or failed).
    """
    if os.path.isdir(path):
        if displayMessage:
            print ("Directory %s already exists" % path)
        return

    try:
        # makedirs (unlike mkdir) also creates the intermediate directories.
        os.makedirs(path)
    except OSError:
        # Best effort by design: the failure is reported on request only.
        if displayMessage:
            print ("Creation of the directory %s failed" % path)
    else:
        if displayMessage:
            print ("Successfully created the directory %s " % path)

In [None]:
# Name of the directory to be created (presumably the root under which
# graphs are saved - confirm against the plotting code).
graphsSavePathStem = "../Graphs"
# Created silently: displayMessage is left at its default (False).
tryCreateFolder(graphsSavePathStem)

In [None]:
# Suffix appended to a dataset name to build the key of its SUS-normalized
# counterpart in `datasets` (see selectDataset).
suffixSUSNormalized = "SUSNormalized"

In [None]:
# Folder holding the survey exports and the extension shared by all data files.
dataFolderPath = "../Data/"
csvSuffix = '.csv'

# File name stems (no folder, no extension): for each event, the anonymized
# file and the unanonymized export it is generated from.
# Miranda House data
dataFilesNamesStemMirandaHouse = "2019-06-03_event_raw"
unanonymizedDataFilesNamesStemMirandaHouse = "2019-06-03_event_raw_unanonymized"
# Cité des Sciences data
dataFilesNamesStemCiteDesSciences = "2019-07-03_event_raw"
unanonymizedDataFilesNamesStemCiteDesSciences = "2019-07-03_event_raw_unanonymized"

# Full paths: folder + stem + extension.
raw20190603PathMirandaHouse = "".join(
    [dataFolderPath, dataFilesNamesStemMirandaHouse, csvSuffix])
unanonymizedRaw20190603PathMirandaHouse = "".join(
    [dataFolderPath, unanonymizedDataFilesNamesStemMirandaHouse, csvSuffix])
raw20190703PathCiteDesSciences = "".join(
    [dataFolderPath, dataFilesNamesStemCiteDesSciences, csvSuffix])
unanonymizedRaw20190703PathCiteDesSciences = "".join(
    [dataFolderPath, unanonymizedDataFilesNamesStemCiteDesSciences, csvSuffix])

In [None]:
# Total number of questions in the questionnaire.
questionCount = 12
# Bounds used further down as slice start/stop to select the Likert questions
# and the SUS questions among the indexed questions.
firstLikertQuestionIndex, lastLikertQuestionIndex = 0, 11
firstSUSQuestionIndex, lastSUSQuestionIndex = 0, 10
# 1-based question numbers and their zero-padded labels ("Q01", "Q02", ...).
questionArrayInt = list(range(1, questionCount + 1))
questionArrayStr = sorted("Q" + str(number).zfill(2) for number in questionArrayInt)
#questionArrayStr

# data anonymization

In [None]:
# Returns anonymized data: replaces each Name by an integer identifier and
# removes the Email column. Reads the unanonymized file at unanonymizedPath
# and writes the anonymized result to anonymizedPath.
def preprocessAnonymize(unanonymizedPath, anonymizedPath):
    """Anonymize a raw survey export and save the result as CSV.

    Parameters
    ----------
    unanonymizedPath : str
        CSV file to read; every column is read as string. It must contain
        the 'Name' and 'Email' columns.
    anonymizedPath : str
        CSV file to write (UTF-8, row index included as first column).

    Returns
    -------
    pandas.DataFrame
        The data without the 'Email' column and with 'Name' replaced by
        integer identifiers. Identifiers are numbered from 0 in order of
        first appearance, so identical names share one identifier and the
        numbering is the same on every run. Missing names get -1.
    """
    rawData = pd.read_csv(unanonymizedPath, dtype=str)

    anonymizedData = rawData.drop(columns=['Email'])

    # One vectorized pass instead of a lookup per row; factorize marks
    # missing values with the sentinel -1.
    identifiers, _ = pd.factorize(anonymizedData['Name'])
    anonymizedData['Name'] = identifiers

    print("writing to " + anonymizedPath)
    anonymizedData.to_csv(anonymizedPath, encoding='utf-8')

    return anonymizedData

In [None]:
# Manual re-anonymization switches: both calls are disabled (`if False`).
# Change a condition to True to regenerate the corresponding anonymized file
# from its unanonymized export.
if False:
    raw20190603 = preprocessAnonymize(unanonymizedRaw20190603PathMirandaHouse, raw20190603PathMirandaHouse)
if False:
    raw20190703 = preprocessAnonymize(unanonymizedRaw20190703PathCiteDesSciences, raw20190703PathCiteDesSciences)

# Functions

## Loading

In [None]:
# Miranda House
# Load the anonymized file; if it cannot be read, regenerate it from the
# unanonymized export and load it again.
try:
    raw20190603 = pd.read_csv(raw20190603PathMirandaHouse, dtype=str)
except (OSError, ValueError):
    # OSError: file missing or unreadable; ValueError: covers the pandas
    # parsing errors (empty or malformed file).
    print("Miranda House data read failed: processing raw data...")
    preprocessAnonymize(unanonymizedRaw20190603PathMirandaHouse, raw20190603PathMirandaHouse)
    raw20190603 = pd.read_csv(raw20190603PathMirandaHouse, dtype=str)

# The anonymized CSV is written with its row index (see preprocessAnonymize),
# which is read back as the first column: remove it. This is done after the
# try block (not in a `finally`) so that a failure of the fallback surfaces
# as itself instead of a NameError on an unassigned variable.
raw20190603 = raw20190603.drop(columns=raw20190603.columns[0])

In [None]:
# Cite des Sciences
# Load the anonymized file; if it cannot be read, regenerate it from the
# unanonymized export and load it again.
try:
    raw20190703 = pd.read_csv(raw20190703PathCiteDesSciences, dtype=str)
except (OSError, ValueError):
    # OSError: file missing or unreadable; ValueError: covers the pandas
    # parsing errors (empty or malformed file).
    print("Cite des Sciences data read failed: processing raw data...")
    preprocessAnonymize(unanonymizedRaw20190703PathCiteDesSciences, raw20190703PathCiteDesSciences)
    raw20190703 = pd.read_csv(raw20190703PathCiteDesSciences, dtype=str)

# The anonymized CSV is written with its row index (see preprocessAnonymize),
# which is read back as the first column: remove it. This is done after the
# try block (not in a `finally`) so that a failure of the fallback surfaces
# as itself instead of a NameError on an unassigned variable.
raw20190703 = raw20190703.drop(columns=raw20190703.columns[0])

In [None]:
# Both experiments must expose the same columns in the same order.
# Index.equals also handles a different number of columns, where an
# element-wise `==` would raise instead of reporting the message below.
assert raw20190603.columns.equals(raw20190703.columns), \
("column mismatch: data from different experiments must have the same columns")

## additional treatment: column renaming
The "Name" column becomes the "ID" column due to anonymization.

In [None]:
# Name of the first column, taken from the data (presumably the form's
# timestamp column - confirm against the CSV header).
timestampQuestion = raw20190603.columns[0]
# New name given to the second column when the columns are renamed below.
idQuestion = "ID"

In [None]:
# Rename the second column to idQuestion in both datasets.
# Work on a copy: the array returned by `.values` is the buffer of the
# (immutable) column Index and must not be modified in place.
columns = raw20190603.columns.values.copy()
columns[1] = idQuestion

raw20190603.columns = columns
raw20190703.columns = columns

## additional treatment: column name extraction
Game question
short, indexed, Likert, SUS questions

In [None]:
# Name of the third column; its values are used below to list the games.
gameQuestion = raw20190603.columns[2]
gameQuestion

In [None]:
# Question texts (every column from the fourth one on) keyed by their
# short labels from questionArrayStr.
shortQuestions = pd.Series(raw20190603.columns[3:], index=questionArrayStr)
shortQuestions

In [None]:
# Question texts (every column from the fourth one on) keyed by their
# 1-based question number; the numbering follows questionCount instead of a
# hard-coded range.
indexedQuestions = pd.Series(index=range(1, questionCount + 1), data=raw20190603.columns[3:])
indexedQuestions

In [None]:
# Likert questions: positional slice of the indexed questions.
indexedLikertQuestions = indexedQuestions.iloc[firstLikertQuestionIndex:lastLikertQuestionIndex]
indexedLikertQuestions

In [None]:
# Split the Likert questions by the parity of their question number:
# odd numbers go to the "positive" group, even numbers to the "negative" one.
positiveLikertQuestions = indexedLikertQuestions.loc[indexedLikertQuestions.index % 2 == 1].copy()
negativeLikertQuestions = indexedLikertQuestions.loc[indexedLikertQuestions.index % 2 == 0].copy()

In [None]:
# Likert questions keyed by short label: positional slice of shortQuestions.
shortLikertQuestions = shortQuestions.iloc[firstLikertQuestionIndex:lastLikertQuestionIndex]
shortLikertQuestions

In [None]:
# SUS questions: positional slice of the indexed questions.
indexedSUSQuestions = indexedQuestions.iloc[firstSUSQuestionIndex:lastSUSQuestionIndex]
indexedSUSQuestions

In [None]:
# Split the SUS questions by the parity of their question number:
# odd numbers go to the "positive" group, even numbers to the "negative" one.
positiveSUSQuestions = indexedSUSQuestions.loc[indexedSUSQuestions.index % 2 == 1].copy()
negativeSUSQuestions = indexedSUSQuestions.loc[indexedSUSQuestions.index % 2 == 0].copy()

In [None]:
# SUS questions keyed by short label: positional slice of shortQuestions.
shortSUSQuestions = shortQuestions.iloc[firstSUSQuestionIndex:lastSUSQuestionIndex]
shortSUSQuestions

## 1-word question descriptions

In [None]:
# Display the question columns (every column from the fourth one on).
raw20190603.columns[3:]

In [None]:
# First set of short descriptions, one per question column.
# NOTE: this value is overwritten by the reassignment of shortDescQuestions
# in the following cell.
shortDescQuestions = pd.Series(
    [
        "frequency",          # Q01: I think that I would like to play this game frequently.
        "complexity",         # Q02: I found the game unnecessarily complex.
        "ease",               # Q03: I thought the game was easy to play.
        "need for support",   # Q04: I think that I would need the support of a technical person to be able to play this game.
        "integration",        # Q05: I found the various functions in this game were well integrated.
        "consistency",        # Q06: I thought there was too much inconsistency in this game.
        "others' learning",   # Q07: I would imagine that most people would learn to play this game very quickly.
        "convenience",        # Q08: I found the game very cumbersome to play.
        "confidence",         # Q09: I felt very confident using the game.
        "my learning",        # Q10: I needed to learn a lot of things before I could get going with this game.
        "recommendation",     # Q11: I would recommend this game to a friend.
        "remarks",            # Q12: Write here your remarks about the game: which feature was missing, what failed or glitched, what was great:
    ],
    index=raw20190603.columns[3:],
)
shortDescQuestions

In [None]:
# Short descriptions, one per question column, keyed by the full question text.
shortDescQuestions = pd.Series(
    [
        "replay appeal",        # Q01: I think that I would like to play this game frequently.
        "simplicity",           # Q02: I found the game unnecessarily complex.
        "ease",                 # Q03: I thought the game was easy to play.
        "autonomy",             # Q04: I think that I would need the support of a technical person to be able to play this game.
        "integration",          # Q05: I found the various functions in this game were well integrated.
        "consistency",          # Q06: I thought there was too much inconsistency in this game.
        "learnable by others",  # Q07: I would imagine that most people would learn to play this game very quickly.
        "convenience",          # Q08: I found the game very cumbersome to play. (alternative label: maniability)
        "confidence",           # Q09: I felt very confident using the game.
        "learnable by self",    # Q10: I needed to learn a lot of things before I could get going with this game.
        "recommendation",       # Q11: I would recommend this game to a friend.
        "remarks",              # Q12: Write here your remarks about the game: which feature was missing, what failed or glitched, what was great:
    ],
    index=raw20190603.columns[3:],
)
shortDescQuestions

## game names

In [None]:
# Distinct values found in the game column of the Miranda House data.
games = raw20190603[gameQuestion].unique()
games

In [None]:
# Full titles of the three games, as expected in the game column.
gameDrBugTitle, gameSuperbugsTitle, gameFungalTitle = (
    'Dr Bug: Microbe Mayhem',
    'Superbugs: the Game',
    'Fungal Invaders',
)

# The data must contain exactly these three games.
assert set(games) == {gameDrBugTitle, gameSuperbugsTitle, gameFungalTitle}, "Wrong list of games"

In [None]:
# Identity mapping: full title -> full title.
identityGameNames = pd.Series(
    [gameDrBugTitle, gameSuperbugsTitle, gameFungalTitle],
    index=[gameDrBugTitle, gameSuperbugsTitle, gameFungalTitle],
)
# Full title -> short display name, sharing the index of the identity mapping.
shortGameNames = pd.Series(
    ['Dr Bug', 'Superbugs', 'Fungal Invaders'],
    index=identityGameNames.index,
)
shortGameNames

In [None]:
def getShortGameTitle(longGameName):
    """Return the short name registered in shortGameNames for a full game
    title, or "unknown" when the title has no entry."""
    shortTitle = shortGameNames.get(longGameName, default="unknown")
    return shortTitle

## advanced treatment: data refinement
Scores are stored as strings and must be converted to integers; missing or non-numeric answers become NaN.

In [None]:
def _toIntOrNaN(answer):
    """Return int(answer), or NaN when the answer cannot be converted."""
    try:
        return int(answer)
    except (ValueError, TypeError, OverflowError):
        # e.g. NaN for a missing answer, or free text
        return np.nan


def getNumericalData(anonymizedData, questions=None):
    """Return a copy of the data with the question answers converted to int.

    Parameters
    ----------
    anonymizedData : pandas.DataFrame
        Data whose answers are stored as strings; it is not modified.
    questions : iterable of column names, optional
        Columns to convert. Defaults to the values of indexedLikertQuestions.

    Returns
    -------
    pandas.DataFrame
        Copy of the input in which every answer of the selected columns is
        an int, or NaN when the answer is missing or is not an integer.
        Converted columns have object dtype; other columns are untouched.
    """
    if questions is None:
        questions = indexedLikertQuestions

    numericalData = anonymizedData.copy()

    for question in questions:
        answers = numericalData[question]
        # Rebuild the whole column at once instead of one .loc write per cell;
        # object dtype keeps ints as ints next to the NaNs.
        numericalData[question] = pd.Series(
            [_toIntOrNaN(answer) for answer in answers],
            index=answers.index,
            dtype=object,
        )
    return numericalData

In [None]:
def getNormalizedNumericalData(numericalData, likertQuestions=None, negativeQuestions=None):
    """Transform agreement scores (1-5 Likert scale) into a 0-4 mark.

    After the transformation 0 = bad and 4 = great: a negative question
    scores `5 - answer`, any other question scores `answer - 1`. Missing
    answers (NaN) are left as they are.

    Parameters
    ----------
    numericalData : pandas.DataFrame
        Data with numerical answers (see getNumericalData); not modified.
    likertQuestions : iterable of column names, optional
        Columns to transform. Defaults to the values of indexedLikertQuestions.
    negativeQuestions : iterable of column names, optional
        Columns whose scale is reversed. Defaults to the values of
        negativeLikertQuestions.

    Returns
    -------
    pandas.DataFrame
        Transformed copy of the input.
    """
    if likertQuestions is None:
        likertQuestions = indexedLikertQuestions
    if negativeQuestions is None:
        negativeQuestions = negativeLikertQuestions.values
    # Built once: the membership test does not depend on the respondent.
    reversedQuestions = set(negativeQuestions)

    normalizedNumericalData = numericalData.copy()
    for question in likertQuestions:
        isReversed = question in reversedQuestions
        answers = normalizedNumericalData[question]
        marks = [
            answer if pd.isna(answer)
            else (5 - answer if isReversed else answer - 1)
            for answer in answers
        ]
        # Same dtype as the source column, so ints stay ints in object columns.
        normalizedNumericalData[question] = pd.Series(
            marks, index=answers.index, dtype=answers.dtype)

    return normalizedNumericalData

In [None]:
# For each event: convert the raw string answers to numbers, then derive
# the normalized version of those numbers.

# Miranda House data
data20190603 = getNumericalData(raw20190603)
data20190603SUSNormalized = getNormalizedNumericalData(data20190603)

# Cite des Sciences data
data20190703 = getNumericalData(raw20190703)
data20190703SUSNormalized = getNormalizedNumericalData(data20190703)

In [None]:
# Registry of the available datasets, keyed by the name of their variable.
datasets = {
    "data20190603": data20190603,
    "data20190603SUSNormalized": data20190603SUSNormalized,
    "data20190703": data20190703,
    "data20190703SUSNormalized": data20190703SUSNormalized,
}

In [None]:
def selectDataset(datasetName):
    """Look up a dataset and its SUS-normalized counterpart in `datasets`.

    Parameters
    ----------
    datasetName : str
        Key of the dataset in `datasets`, e.g. "data20190603".

    Returns
    -------
    tuple
        (datasetName, data, normalizedName, normalizedData), where
        normalizedName is datasetName + suffixSUSNormalized.

    Raises
    ------
    AssertionError
        If datasetName or its normalized counterpart is not in `datasets`.
    """
    _dataName = datasetName
    assert (_dataName in datasets), ("Not found in datasets: '" + _dataName + "'")
    inputData = datasets[_dataName]

    _dataNameSUSNormalized = _dataName + suffixSUSNormalized
    # Check the normalized key itself, so that a missing counterpart is
    # reported by this message rather than by a KeyError on the next line.
    assert (_dataNameSUSNormalized in datasets), ("Not found in datasets: '" + _dataNameSUSNormalized + "'")
    inputDataSUSNormalized = datasets[_dataNameSUSNormalized]

    return _dataName, inputData, _dataNameSUSNormalized, inputDataSUSNormalized

In [None]:
# Alternative label sets, kept for reference (disabled):
# digital5StepDescriptions = range(minLikertValue, maxLikertValue+1)
# mixed5StepDecriptions = ['Strongly agree', '2', '3', '4', 'Strongly disagree']
# likert5StepDescriptions = ['Strongly agree', 'Slightly agree', 'Neutral', 'Slightly disagree', 'Strongly disagree']
# Labels of the five Likert steps, listed from 'Strongly agree' to 'Strongly disagree'.
likert5StepDescriptions = ['Strongly agree', 'Agree', 'Neutral', 'Disagree', 'Strongly disagree']