In [1]:
import pandas as pd
from functools import reduce
import string
import numpy as np

# Load the pre-event survey export of each event (e16 .. e20) and stack them
# into one frame. The per-event frames are collected in a list and
# concatenated once: calling pd.concat inside the loop re-copies all earlier
# rows on every pass, and seeding the result with an empty DataFrame relies on
# pandas' deprecated special-casing of empty entries in concat.
event_frames = []
for i in range(16, 21):
    event_df = pd.read_csv('responses/e{}_pre.csv'.format(i))
    event_df['event'] = 'e{}'.format(i)  # tag every row with its source event
    event_frames.append(event_df)
responses = pd.concat(event_frames, ignore_index=True)
pd.set_option("display.max_columns", None)
responses = responses.dropna(how='all', subset=['Network ID']) # drop rows that are not responses
responses.columns = responses.columns.str.replace('{', '', regex=False).str.replace('}', '', regex=False) # remove curly braces from typeform responses

# Empty frame that the following cells fill in column by column.
responses_cleaned = pd.DataFrame()

For e15 to e20, there were no timestamps in the CSVs, so the timestamp was dropped as an exclusion criterion for qualifying responses. e15 also did not have any pre-event responses, so only data from e16 onwards is included.

In [3]:
# Copy the per-row source-event tag into the cleaned frame; note that the
# column is renamed from 'event' to 'events'.
# NOTE(review): when cells are run in order this is the first column assigned
# to the empty responses_cleaned, so it presumably also fixes that frame's
# row index to the index of `responses` -- keep this assignment first.
responses_cleaned['events'] = responses['event']

Some columns ask for the same information (for example, the same question with slightly different wording), so we need to combine them into a single column.

In [4]:
def _combine_columns(frame, columns):
    """Merge survey columns that ask the same question into a single Series.

    For every row, the non-missing values of ``columns`` are converted to
    ``str`` and concatenated (without a separator) in the order the columns
    are listed. A row with no answer in any of the columns yields ``''``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw survey responses.
    columns : list of str
        Names of the columns to merge; each name must be a column of ``frame``.

    Returns
    -------
    pandas.Series
        One combined string per row of ``frame``.
    """
    return frame[columns].apply(lambda row: ''.join(row.dropna().astype(str)), axis=1)


# Each *_cols list names the raw column(s) that feed one cleaned column.
email_cols = ["And what's your email address?"]
responses_cleaned['email'] = _combine_columns(responses, email_cols)

first_name_cols = ['Great, can we get your *first/given name* (e.g. Jane)?']
last_name_cols = ['And your last *name/surname*?']
responses_cleaned['first_name'] = _combine_columns(responses, first_name_cols)
responses_cleaned['last_name'] = _combine_columns(responses, last_name_cols)

# The school question appears under two slightly different headers.
school_cols = ['Thanks  field:ef34b985c51e4131! Which *school/Institution* are you from?','Thanks field:ef34b985c51e4131! Which *school/institution* are you from?']
responses_cleaned['school'] = _combine_columns(responses, school_cols)

join_mailing_cols = ['Would you like to join our mailing list to be informed of future events?']
responses_cleaned['join_mailing'] = _combine_columns(responses, join_mailing_cols)

qs_cols = ['Any *questions* about the event/ for the speakers?']
responses_cleaned['questions'] = _combine_columns(responses, qs_cols)

# Knowledge / interest questions are matched by prefix, so every column whose
# header starts with the given text is merged.
knowledge_cols = [colname for colname in responses.columns if colname.startswith('Select the option that best describes your knowledge')]
interest_cols = [colname for colname in responses.columns if colname.startswith('Select the option that best describes your interest')]
responses_cleaned['knowledge'] = _combine_columns(responses, knowledge_cols)
responses_cleaned['interest'] = _combine_columns(responses, interest_cols)

internship_cols = ['How difficult has it been for you to find an internship or a job?']
responses_cleaned['internship'] = _combine_columns(responses, internship_cols)

# The "next sectors" question also appears under two header variants.
next_cols = ['Which *sectors or careers* do you want to hear from next?','Which *sectors* or *careers* do you want to hear from next?']
responses_cleaned['next'] = _combine_columns(responses, next_cols)

Some columns are specific to particular events:

In [5]:
# Event-specific free-text questions. Missing answers are stored as '' (the
# same encoding the other text columns of responses_cleaned use); a bare
# .astype(str) would turn NaN into the literal string 'nan'.
responses_cleaned['e16_interest_why_STEM'] = responses['What are some reasons as to why you are field:9aaaeeebe70858c4?'].fillna('').astype(str)
responses_cleaned['e20_interest_why_SW'] = responses['What are some reasons as to why field:9aaaeeebe70858c4?'].fillna('').astype(str)
# Lowercase every string cell and leave non-string cells untouched. Done
# column by column with Series.map because DataFrame.applymap is deprecated in
# recent pandas releases, while Series.map is available in old and new ones.
responses_cleaned = responses_cleaned.apply(
    lambda column: column.map(lambda s: s.lower() if isinstance(s, str) else s)
)

We can look through some of the columns that we aren't sure about and drop them accordingly.

We can drop those columns (i.e. simply not copy them over to responses_cleaned) since they aren't informative for our purposes. I'm also going to drop the column asking for age, since it was only asked in one survey.


Also, for future reference: response validation is important (i.e. the survey should be configured so that only a number can be given as a response).

For e10, 11, 12, 14, students were also asked to pick 3 skills that they wanted to learn. We save each skill as its own boolean column, and we'll do the same for the fields about how students found out about CC.

In [6]:
# One boolean column per skill option: True where the raw cell is non-null
# (presumably meaning the respondent picked that skill), False otherwise.
# Keys are the cleaned column names, values the raw survey headers.
skill_flag_sources = {
    'skills_marketing': 'Digital Marketing',
    'skills_fullstack': 'Full Stack Development (Computer Programming)',
    'skills_ux': 'UX Design',
    'skills_data': 'Data Analytics',
    'skills_pm': 'Product Management',
    'skills_acc': 'Accounting',
    'skills_proc': 'Procurement/ Logistics/ Supply Chain',
    'skills_pr': 'Public Relations',
    'skills_mr': 'Market/ User Research',
}
for flag_name, raw_column in skill_flag_sources.items():
    responses_cleaned[flag_name] = responses[raw_column].notnull()

In [7]:
# One boolean column per "how did you hear about the event" option: True where
# the raw cell is non-null, False otherwise.
# Keys are the cleaned column names, values the raw survey headers.
channel_flag_sources = {
    'how_event_teacher': 'My teacher',
    'how_event_LI': 'Linkedin',
    'how_event_insta': 'Instagram (@careercontact)',
    'how_event_school': "My schools' Education and Career Guidance Unit",
    'how_event_friends': 'Friends/ Peers',
    'how_event_mailing': 'CareerContact Mailing List',
    'how_event_twitter': 'Twitter (@careercontactcc)',
    'how_event_fb': 'Facebook (facebook.com/careercontact.org)',
    'how_event_AYO': 'AYO Social Media Channels',
    'how_event_SIC': 'SIC Social Media Channels',
}
for flag_name, raw_column in channel_flag_sources.items():
    responses_cleaned[flag_name] = responses[raw_column].notnull()

For several of the surveys, we also asked students how knowledgeable about and how interested in the careers they were signing up for they felt. We convert these answers into integer codes below.

In [8]:
def knowledge_categories(df_string):
    """Map a lowercased knowledge-question answer to an integer level.

    Levels:
        1 -- answer starts with 'i know little or nothing about'
        2 -- answer contains 'certainty'
        3 -- answer starts with 'i am able to articulate' and mentions neither
             '& more' nor '& beyond'
        4 -- answer starts with 'i am able to articulate' and mentions
             '& more' or '& beyond'

    The checks run in that order, so the first matching level wins.

    Parameters
    ----------
    df_string : str
        The answer text. Matching is case-sensitive, so the text is expected
        to be lowercased already.

    Returns
    -------
    int or None
        The level, or None when the answer matches no level (for example an
        empty string) or is not a string at all (for example NaN).
    """
    # Missing values may arrive as NaN (a float); treat them as "no level"
    # instead of failing on .startswith.
    if not isinstance(df_string, str):
        return None
    if df_string.startswith('i know little or nothing about'):
        return 1
    if 'certainty' in df_string:
        return 2
    if df_string.startswith('i am able to articulate'):
        # Levels 3 and 4 are exact complements: any '& more' / '& beyond'
        # wording is level 4, so an answer whose text after '& more' differs
        # from ', e.g. challenges and trends' no longer falls through to None.
        goes_further = '& more' in df_string or '& beyond' in df_string
        return 4 if goes_further else 3
    return None

def interest_categories(df_string):
    """Map an interest-question answer to an integer code.

    The answer is scanned for the key phrases below, in this order, and the
    code of the first phrase found is returned:

        'quite set on'      -> 1
        'considering other' -> 2
        'undecided'         -> 3

    Matching is case-sensitive. Returns None when no phrase occurs in
    ``df_string``.
    """
    phrase_codes = (
        ('quite set on', 1),
        ('considering other', 2),
        ('undecided', 3),
    )
    for phrase, code in phrase_codes:
        if phrase in df_string:
            return code
    return None

# Turn the free-text knowledge / interest answers into integer codes, one
# '<name>_coded' column per question, using the matching categoriser function.
for coded_column, answer_column, categorise in (
    ('knowledge_coded', 'knowledge', knowledge_categories),
    ('interest_coded', 'interest', interest_categories),
):
    responses_cleaned[coded_column] = responses_cleaned[answer_column].apply(categorise)

That covers all the columns in the original dataframe, so we can export the cleaned dataframe and start our analysis now!

In [10]:
# Persist the cleaned responses as a pickle file for the analysis step.
responses_cleaned.to_pickle('data/preevent_responses_2.pandas')