In [128]:
import pandas as pd
import numpy as np
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
import entrofy.core

## Read the data

In [129]:
participants = pd.read_csv("Waterhackweek-2020-Participants-Workshop-On-Water-Data-Science.csv")

In [130]:
instructors = pd.read_csv("Waterhackweek-2020-Instructors-Workshop-On-Water-Data-Science.csv")

In [131]:
motivation = pd.read_csv("motivation_a60_20200116.csv")

## Clean the data

### Country of Origin

Clean 'Country of Origin' 
to account for various was USA is reported by applicants in the form (no controlled vocabulary or dropdown list). 

In [89]:
participants.replace(to_replace = {'Country of origin':{'US':'USA',
                                                 'United States':'USA',
                                                  'U.S.A':'USA',
                                                  'U.S.':'USA',
                                         'United States of America':'USA'}}, inplace = True)

In [90]:
instructors.replace(to_replace = {'Country of Origin and/or Tribal or First Nation Affiliation':{'US':'USA',
                                                 'United States':'USA',
                                                  'U.S.A':'USA',
                                                  'U.S.':'USA',
                                         'United States of America':'USA'}}, inplace = True)

In [91]:
participants['USA'] = participants.apply(lambda row: 1 if row['Country of origin'] == 'USA' else 0, axis=1)

instructors['USA'] = instructors.apply(lambda row: 1 if row['Country of Origin and/or Tribal or First Nation Affiliation'] == 'USA' else 0, axis=1)

### Affiliation

'Are you currently affiliated with the University of Washington?'  For research network building, benefit to the broader academic community, and direct benefit to UW researchers, our program aims for 50% of participants whom are affiliated with UW. 

In [92]:
participants['atUW'] = participants.apply(lambda row: 1 if row['Are you currently affiliated with the University of Washington?'] == 'Yes' 
                                          else 0, axis=1)

In [93]:
instructors['atUW'] = instructors.apply(lambda row: 1 if row['Are you associated with the University of Washington?'] == 'Yes' 
                                        else 0, axis=1)

### Position

'What is your current position?' For research network building, benefit to the broader academic community, and direct benefit to UW researchers, our program aims for 50% of participants whom are doctoral (PhD) students and a uniform distribution of all other educational and career stages.

"Position" is categorical. If any of the participants fills in the "Other" column, check if that can be mapped to any of the existing categories, or create a new category for it

In [96]:
participants['position'] = participants['What is your current position?']
participants.replace(to_replace = {'position':{'Undergraduate Student and Research Technician':'Undergraduate student',
                                               'Postdoctoral fellow' : 'Postdoc',
                                               'Independent researcher' : 'Academic researcher (e.g. research scientist, data scientist)',
                                               'Indendently consulting data scientist' : 'Independent Consulting Data Scientist'
                                  }}, inplace = True)
participants['position'].unique()

array(["Master's student",
       'Software professional (e.g. developer, infrastructure research, industry)',
       'Professional scientist (e.g. government, non-profit, industry scientist)',
       'Academic researcher (e.g. research scientist, data scientist)',
       'PhD student', 'Academic faculty', 'Postdoc',
       'Undergraduate student', 'Independent Consulting Data Scientist'],
      dtype=object)

In [97]:
instructors['position'] = instructors['What is your current position?']

instructors.replace(to_replace = {'position':{ 'Evaluation Scientist' : 'Waterhackweek (e.g. Researcher, Staff, Consultant)',
                                               'Freshwater Initiative Communications Specialist' : 'Waterhackweek (e.g. Researcher, Staff, Consultant)',

                                  }}, inplace = True)

instructors['position'].unique()

array(['Academic researcher (e.g. research scientist, data scientist)',
       'Professional scientist (e.g. government, non-profit, industry scientist)',
       'Waterhackweek (e.g. Researcher, Staff, Consultant)',
       'PhD student', 'Academic faculty', "Master's student",
       'Undergraduate student', 'eScience Staff '], dtype=object)

### Gender

Check if the non-required field gender has been filled out, if yes, then use that, otherwise use the pronouns field, otherwise put in the "Prefer Not to say" category

In [98]:
participants['gender']=participants['What is your gender (optional)?']
participants.replace(to_replace = {'gender':{'Non-binary/third gender':'Non-binary/Third gender or Prefer not to say',
                                             np.nan : 'Non-binary/Third gender or Prefer not to say'
                                  }}, inplace = True)

participants['gender'].unique()

array(['Male', 'Female', 'Non-binary/Third gender or Prefer not to say'],
      dtype=object)

In [99]:
instructors['gender']=instructors['What is your gender?']
instructors.replace(to_replace = {'gender':{'Non-binary/third gender':'Neither or Prefer not to say',
                                             np.nan : 'Non-binary/Third gender or Prefer not to say'
                                  }}, inplace = True)

instructors['gender'].unique()

array(['Female', 'Male'], dtype=object)

### Experience

Experience is used by the instructors to prepare materials accordingly

Programming experience

In [100]:
participants['programming_experience'] = participants['Programming experience.  We strive to invite a broad mix of people whose expertise ranges from beginner to expert level.']
participants.replace(to_replace = {'programming_experience':{'I have written the scripts and edited both according to my work':'I have structured and written code to be reusable, including functions and classes'}}, inplace = True)
participants['programming_experience'].unique()

array(['I have read and edited scripts written by other people',
       'I have written and released a software package for others to use, including unit tests and documentation',
       'I have structured and written code to be reusable, including functions and classes',
       'I have written scripts for basic analysis tasks, but have not included formal structures like functions or classes',
       'I have no coding experience'], dtype=object)

Water related research experience

In [101]:
participants['water_experience'] = participants['Experience with working with water-related datasets']
participants.replace(to_replace = {'water_experience':{'Experienced with both spatial and temporal data':'Very experienced with both spatial and temporal data'}}, inplace = True)
participants['water_experience'].unique()

array(['Little experience, just starting!',
       'More experience with geospatial than timeseries',
       'Very experienced with both spatial and temporal data',
       'More experience with point timeseries than geospatial datasets',
       'Very experienced in geospatial data in a GIS setting, and more experience with point timeseries than geospatial through coding'],
      dtype=object)

Machine learning experience

In [102]:
participants['ml_experience'] = participants['Machine learning knowledge']
participants['ml_experience'].unique()

array(['Little to no experience',
       "I have had some exposure to machine learning e.g. through course work or tutorials, but haven't applied any methods in a real-world setting",
       'I have used some machine learning on real data sets (e.g. in my own research, as part of a side project)',
       'I have implemented or developed algorithms myself',
       "I've worked with a data scientist to use machine learning on real data sets but have not implemented it myself."],
      dtype=object)

Statistics experience

In [103]:
participants['stats_experience'] = participants['Statistics knowledge']
participants.replace(to_replace = {'stats_experience':{'I\'m a Bayesian and a big fan of Stan and PyMC3':'I have used statistics on real data sets (e.g. in my own research, as part of a side project)'}}, inplace = True)
participants['stats_experience'].unique()

array(['I have used statistics on real data sets (e.g. in my own research, as part of a side project)',
       "I have had some exposure to statistics, e.g. through course work or tutorials, but haven't applied any methods to real data sets",
       'I have implemented or developed algorithms myself'], dtype=object)

Cloud Computing Experience

In [104]:
participants['cloud_experience'] = participants['Cloud computing experience']
participants.replace(to_replace = {'cloud_experience':{'I\'m a student ambassador for my university\'s HPC system, but still need to learn more':'I routinely use HPC and cloud computing'}}, inplace = True)
participants.replace(to_replace = {'cloud_experience':{'I have used HPC to get precise point position using real time IRNSS navigation data in a pilot project at Indian Institute of Remote Sensing, Dehradun':'I have had some exposure to cloud computing'}}, inplace = True)
participants['cloud_experience'].unique()

array(['Little to no experience',
       'I have had some exposure to cloud computing',
       'I routinely use HPC and cloud computing'], dtype=object)

Command line interface experience

In [105]:
participants['command_line_experience'] = participants['Familiarity with command line interfaces. This refers to your experience interacting with a computing environment through typed commands as opposed to graphical user interfaces, for example to install open source libraries.']
participants.replace(to_replace = {'command_line_experience':{'I used the command line a lot, but it\'s usually the same commands over and over':'I commonly use a command line in my work'}}, inplace = True)
participants.replace(to_replace = {'command_line_experience':{'Little to no experience; Used UNIX command line extensively for MS work at U Arizona':'Little to no experience'}}, inplace = True)
participants.replace(to_replace = {'command_line_experience':{'On through bash to manage python packages (Miniconda) and using JupyterNotebook to code':'I occasionally use a command line in my work'}}, inplace = True)
participants['command_line_experience'].unique()

array(['I commonly use a command line in my work',
       'I occasionally use a command line in my work',
       'Little to no experience'], dtype=object)

Version control experience

In [106]:
participants['Familiarity with version control and code sharing tools. This refers to your experience working with version control and code sharing in an open source environment, using tools such as Git and GitHub.']
participants['version_control_experience'] = participants['Familiarity with version control and code sharing tools. This refers to your experience working with version control and code sharing in an open source environment, using tools such as Git and GitHub.']
participants.replace(to_replace = {'version_control_experience' : {'Little to no experience; Only used Git for a course to pull, have not pushed but understand how Git works':'Little to no experience'}}, inplace = True)
participants['version_control_experience'].unique()

array(['Little to no experience',
       'I commonly use version control and code sharing tools in my work',
       'I occasionally use version control and code sharing tools in my work'],
      dtype=object)

### LGBTQ

In [107]:
participants['lgbtq']=participants['Do you consider yourself a member of the Lesbian, Gay, Bisexual, Transgender, and/or Queer (LGBTQ) community?'].fillna(0)
participants.replace(to_replace = {'lgbtq':{'No, but I identify as an Ally*':'No or Prefer not to say',
                                   0:'No or Prefer not to say',
                                   'No':'No or Prefer not to say',
                                  }}, inplace = True)

participants['lgbtq'].unique()

array(['No or Prefer not to say', 'Yes'], dtype=object)

In [108]:
instructors['lgbtq']=instructors['Do you consider yourself a member of the Lesbian, Gay, Bisexual, Transgender, and/or Queer (LGBTQ) community?'].fillna(0)
instructors.replace(to_replace = {'lgbtq':{'No, but I identify as an Ally*':'No or Prefer not to say',
                                   0:'No or Prefer not to say',
                                   'No':'No or Prefer not to say',
                                  }}, inplace = True)

instructors['lgbtq'].unique()

array(['No or Prefer not to say', 'Yes'], dtype=object)

### Non-Binary Trans

In [109]:
participants['nonbin_trans']=participants['What is your gender (optional)?'].fillna(0)
participants.replace(to_replace = {'nonbin_trans':{'Male':'Binary or Prefer not to say',
                                       'Female':'Binary or Prefer not to say',
                                          0:'Binary or Prefer not to say',
                                  }}, inplace = True)

participants['nonbin_trans'].unique()

array(['Binary or Prefer not to say', 'Non-binary/third gender'],
      dtype=object)

In [110]:
instructors['nonbin_trans']=instructors['What is your gender?'].fillna(0)
instructors.replace(to_replace = {'nonbin_trans':{'Male':'Binary or Prefer not to say',
                                       'Female':'Binary or Prefer not to say',
                                          0:'Binary or Prefer not to say',
                                  }}, inplace = True)

instructors['nonbin_trans'].unique()

array(['Binary or Prefer not to say'], dtype=object)

### Race Ethnic
This field is created for visualization purposes

Known issue: multiple race identities reported by minority race; Future Work = code to fractions to total identity proportions

In [111]:
participants['What best describes you? (select all that apply):']
participants['race_ethnic']=participants['What best describes you? (select all that apply):'].fillna(0)

participants.replace(to_replace = {'race_ethnic':{'Asian':'North Asian',
                                   0:'No answer or Prefer not to say',
                                   'American Indian, Alaska Native, or Native American': 'Indigenous',
                                   'Asian; White, European American, or Caucasian':'North Asian',
                                   'Hispanic, Latinx, or Spanish; White, European American, or Caucasian':'Hispanic, Latinx, or Spanish',
                                    'Indian' : 'South Asian',
                                    'Pakistani' : 'South Asian',
                                    'Asian; Indian' : 'South Asian',
                                   'Middle Eastern or North African; White, European American, or Caucasian':'Middle Eastern or North African',
                                   'Native Hawaiian or Other Pacific Islander' : 'Indigenous'
                                  }}, inplace = True)

participants['race_ethnic'].unique()

array(['White, European American, or Caucasian', 'North Asian',
       'No answer or Prefer not to say', 'Indigenous', 'South Asian',
       'Hispanic, Latinx, or Spanish', 'Middle Eastern or North African'],
      dtype=object)

In [112]:
instructors['What categories best describe you? (select all that apply)']
instructors['race_ethnic']=instructors['What categories best describe you? (select all that apply)'].fillna(0)

instructors.replace(to_replace = {'race_ethnic':{'Asian':'North Asian',
                                   0:'No answer or Prefer not to say',
                                   'American Indian, Alaska Native, or Native American': 'Indigenous',
                                   #'Hispanic, Latinx, or Spanish':'People of Color',
                                   #'African, African American, or Black':'People of Color',
                                    'Indian' : 'South Asian',
                                    'Pakistani' : 'South Asian',
                                    'Asian; Indian' : 'South Asian',
                                   'Middle Eastern or North African; White, European American, or Caucasian':'Middle Eastern or North African',
                                   'Native Hawaiian or Other Pacific Islander' : 'Indigenous'
                                  }}, inplace = True)
instructors['race_ethnic'].unique()

array(['White, European American, or Caucasian',
       'African, African American, or Black',
       'Middle Eastern or North African', 'Hispanic, Latinx, or Spanish',
       'North Asian'], dtype=object)

### Color
Thsi field is created for the purpose of providing input to entrofy

In [113]:
participants['What best describes you? (select all that apply):']
participants['color']=participants['What best describes you? (select all that apply):'].fillna(0)

participants.replace(to_replace = {'color':{'Asian':'People of Color',
                                   0:'No answer or Prefer not to say',
                                   'American Indian, Alaska Native, or Native American': 'People of Color',
                                   'Asian; White, European American, or Caucasian':'People of Color',
                                   'Hispanic, Latinx, or Spanish; White, European American, or Caucasian':'People of Color',
                                    'Hispanic, Latinx, or Spanish' : 'People of Color',
                                    'Indian' : 'People of Color',
                                    'Middle Eastern or North African' :'People of Color',
                                    'Pakistani' : 'People of Color',
                                    'Asian; Indian' : 'People of Color',
                                   'Middle Eastern or North African; White, European American, or Caucasian':'People of Color',
                                   'Native Hawaiian or Other Pacific Islander' : 'People of Color'
                                  }}, inplace = True)

participants['color'].unique()

array(['White, European American, or Caucasian', 'People of Color',
       'No answer or Prefer not to say'], dtype=object)

In [114]:
instructors['color']=instructors['What categories best describe you? (select all that apply)'].fillna(0)

instructors.replace(to_replace = {'color':{'Asian':'People of Color',
                                   0:'No answer or Prefer not to say',
                                   'American Indian, Alaska Native, or Native American': 'People of Color',
                                   'Hispanic, Latinx, or Spanish':'People of Color',
                                   'African, African American, or Black':'People of Color',
                                    'Indian' : 'People of Color',
                                    'Pakistani' : 'People of Color',
                                    'Asian; Indian' : 'People of Color',
                                   'Middle Eastern or North African; White, European American, or Caucasian':'People of Color',
                                   'Native Hawaiian or Other Pacific Islander' : 'People of Color'
                                  }}, inplace = True)
instructors['color'].unique()

array(['White, European American, or Caucasian', 'People of Color'],
      dtype=object)

### Language

In [115]:
participants['language'] = participants['What languages do you speak?']
participants.replace(to_replace = {'language':{np.nan:'Prefer not to answer',
                                  }}, inplace = True)
participants.replace(to_replace = {'What languages do you speak?':{np.nan:'Prefer not to answer',
                                  }}, inplace = True)

participants['language'] = participants.apply(lambda row: 1 if row['language'] == 'English' else 0, axis=1)
participants['language'].unique()

array([1, 0])

In [116]:
instructors['language'] = instructors['What languages do you speak?']
instructors.replace(to_replace = {'language':{np.nan:'English',
                                           'English; Bi-lingual; Multi-lingual':'Multi-lingual',
                                  }}, inplace = True)
instructors.replace(to_replace = {'What languages do you speak?':{np.nan:'Prefer not to answer',
                                           'English; Bi-lingual; Multi-lingual':'Multi-lingual',
                                  }}, inplace = True)

instructors['language'] = instructors.apply(lambda row: 1 if row['language'] == 'English' else 0, axis=1)
instructors['language'].unique()

array([0, 1])

Set motivation for all instructors to be 3.0 and populate motivation for participants from the reviewers

In [117]:
participants.shape

(61, 59)

In [118]:
motivation.shape

(58, 2)

In [52]:
participants['seed'] = 0
participants['motivation']=motivation['Motivation']
participants['ID']=motivation['ID']

instructors['seed'] = 1 
instructors['motivation'] = 3

#New participant without motivation score in table (post reading essays)
#scored post review process
participants.loc[58,'motivation']=3
participants.loc[59,'motivation']=3
participants.loc[60,'motivation']=3

participants.loc[58,'ID']=58
participants.loc[59,'ID']=59
participants.loc[60,'ID']=60


#participants who filled team organizer/instructor form
instructors.loc[0,'seed']=0
instructors.loc[4:5,'seed']=0
instructors.loc[11,'seed']=0
instructors['ID'] = instructors.index + 100


### Process "What Data have you analyzed (select all that apply)?"

In [55]:
instructors['met_climate'] = 0
instructors['snow_ice'] = 0
instructors['alpine_forest'] = 0
instructors['riv_streams'] = 0
instructors['lake_wetlands'] = 0
instructors['geo_sediment'] = 0
instructors['quality_waste'] = 0
instructors['extreme_event_disaster'] = 0
instructors['urban'] = 0
instructors['datasci_model_software'] = 0
instructors['fish_aquatic'] = 0
instructors['health'] = 0
instructors['hydraulics'] = 0
instructors['ocean'] = 0
instructors['no_exp_yet'] = 0

In [56]:
#create a mapping between column names and actual form responses
data_analyzed_map = dict()
data_analyzed_map["Meteorology & Climate"] = "met_climate"
data_analyzed_map["Snow & Ice"] = "snow_ice"
data_analyzed_map["Rivers & Streams"] = "riv_streams"
data_analyzed_map["Lakes & Wetlands"] = "lake_wetlands"
data_analyzed_map["Geomorphology & Sediment"] = "geo_sediment"
data_analyzed_map["Fish & Aquatic Species"] = "fish_aquatic"
data_analyzed_map["Alpine & Forested Environments"] = "alpine_forest"
data_analyzed_map["Urban Environments"] = "urban"
data_analyzed_map["Water Quality & Waste"] = "quality_waste"
data_analyzed_map["Extreme Events & Disaster"] = "extreme_event_disaster"
data_analyzed_map["Data Science & Modeling Software"] = "datasci_model_software"
data_analyzed_map["Population & Public Health Impacts"] = "health"
data_analyzed_map["Channel & Delta Hydraulics"] = "hydraulics"
data_analyzed_map["No Expertise"] = "no_exp_yet"

In [57]:
participants['met_climate']=0
participants['snow_ice']=0
participants['alpine_forest']=0
participants['riv_streams']=0
participants['lake_wetlands']=0
participants['geo_sediment']=0
participants['quality_waste']=0
participants['extreme_event_disaster']=0
participants['urban']=0
participants['datasci_model_software']=0
participants['fish_aquatic']=0
participants['health']=0
participants['hydraulics']=0
participants['no_exp_yet']=0

In [58]:
def populate_data_analyzed_cols():
    other_topics = set()
    for i in range(len(participants)):
        data_analyzed = participants.loc[i, "What kind of data have you analyzed (check all relevant)?"]
        if pd.isnull(data_analyzed):
            continue
        split_data = data_analyzed.split(";")
        for data in split_data:
            data = data.strip()
            if data in data_analyzed_map.keys():
                participants.loc[i, data_analyzed_map[data]] = 1
            else:
                other_topics.add(data)
    return other_topics

In [59]:
other_topics = populate_data_analyzed_cols()
other_topics

{'16S sequences and microbiome',
 'Agriculture',
 'Data from water utilities companies',
 'Groundwater',
 'Intertidal mangrove forests',
 'No experience with freshwater data yet! Have experience with data from other fields.',
 'No expertise',
 'Ocean',
 'Streamflow Prediction in Ungauged Basins',
 'Water Use',
 'hydrogeologic data'}

Concatenate participants and instructors in a single dataframe

In [60]:
df1=participants[["ID","USA", "atUW", "position", "gender", "lgbtq", "nonbin_trans", "race_ethnic","color", "language", "met_climate",'snow_ice','alpine_forest','riv_streams','lake_wetlands','geo_sediment','quality_waste', 'extreme_event_disaster','urban','datasci_model_software','fish_aquatic','health','hydraulics','no_exp_yet',"seed","motivation"]]
df2=instructors[["ID","USA", "atUW", "position", "gender", "lgbtq", "nonbin_trans", "race_ethnic", "color","language", "met_climate",'snow_ice','alpine_forest','riv_streams','lake_wetlands','geo_sediment','quality_waste','extreme_event_disaster','urban', 'datasci_model_software','fish_aquatic','health','hydraulics','no_exp_yet',"seed","motivation"]]

In [61]:
cohort_all=pd.concat([df1,df2], ignore_index=True)

In [74]:
cohort_all.to_csv("cohort_all.csv")