In [1]:
import numpy as np 
import pandas as pd

# Loading the multiple choices dataset, we will not look to the free form data on this study
mc = pd.read_csv('data/multipleChoiceResponses.csv', low_memory=False)

# Separating questions from answers
# This Series stores all questions
mcQ = mc.iloc[0,:]
# This DataFrame stores all answers
mcA = mc.iloc[1:,:]

In [2]:
mc.shape, mcA.shape

((23860, 395), (23859, 395))

In [3]:
# removing everyone that took less than 4 minutes or more than 600 minutes to answer the survey
less3 = mcA[round(mcA.iloc[:,0].astype(int) / 60) <= 4].index
mcA = mcA.drop(less3, axis=0)
more300 = mcA[round(mcA.iloc[:,0].astype(int) / 60) >= 600].index
mcA = mcA.drop(more300, axis=0)

# removing gender trolls, because we noticed from other kernels thata there are some ouliers here
gender_trolls = mcA[(mcA.Q1 == 'Prefer to self-describe') | (mcA.Q1 == 'Prefer not to say')].index
mcA = mcA.drop(list(gender_trolls), axis=0)

# removing student trolls, because a student won't make more than 250k a year.
student_trolls = mcA[((mcA.Q6 == 'Student') & (mcA.Q9 > '500,000+')) | \
                     ((mcA.Q6 == 'Student') & (mcA.Q9 > '400-500,000')) | \
                     ((mcA.Q6 == 'Student') & (mcA.Q9 > '300-400,000')) | \
                     ((mcA.Q6 == 'Student') & (mcA.Q9 > '250-300,000'))].index
mcA = mcA.drop(list(student_trolls), axis=0)

# dropping all NaN and I do not wish to disclose my approximate yearly compensation, because we are only interested in respondents that revealed their earnings
mcA = mcA[~mcA.Q9.isnull()].copy()
not_disclosed = mcA[mcA.Q9 == 'I do not wish to disclose my approximate yearly compensation'].index
mcA = mcA.drop(list(not_disclosed), axis=0)

In [4]:
# Remove all "OTHER_TEXT columns"
mcA = mcA[[c for c in mcA.columns if not c.endswith("_OTHER_TEXT") and c != 'Q32_OTHER']]

In [5]:
mcA.shape

(12956, 365)

In [6]:
q12_software_dummies = pd.get_dummies(mcA[['Q12_MULTIPLE_CHOICE']])  # Encode Q12

In [7]:
mcA = mcA[[c for c in mcA.columns if not c.endswith("TEXT")]]  # Remove all Q12_Part_*
mcA = mcA[[c for c in mcA.columns if c != "Q12_MULTIPLE_CHOICE"]]  # Remove Q12 itself
mcA = pd.concat([mcA, q12_software_dummies], axis=1)  # Append encoded Q12

In [8]:
minusone_cols = (mcA == '-1').sum(axis=0)
minusone_cols = minusone_cols[minusone_cols > 0]

  result = method(y)


In [9]:
for name, v in zip(minusone_cols.index, minusone_cols):
    print(name, v)

In [10]:
mcA.iloc[:5, 235:240]

Unnamed: 0,Q31_Part_11,Q31_Part_12,Q32,Q33_Part_1,Q33_Part_2
2,,,,,
3,,,Time Series Data,Government websites,
5,,,,,
7,,,Numerical Data,,
8,,,Image Data,,


In [11]:
mcA.head(1)

Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,Q50_Part_5,Q50_Part_6,Q50_Part_7,Q50_Part_8,"Q12_MULTIPLE_CHOICE_Advanced statistical software (SPSS, SAS, etc.)","Q12_MULTIPLE_CHOICE_Basic statistical software (Microsoft Excel, Google Sheets, etc.)","Q12_MULTIPLE_CHOICE_Business intelligence software (Salesforce, Tableau, Spotfire, etc.)","Q12_MULTIPLE_CHOICE_Cloud-based data software & APIs (AWS, GCP, Azure, etc.)","Q12_MULTIPLE_CHOICE_Local or hosted development environments (RStudio, JupyterLab, etc.)",Q12_MULTIPLE_CHOICE_Other
2,434,Male,30-34,Indonesia,Bachelor’s degree,Engineering (non-computer focused),Other,Manufacturing/Fabrication,5-10,"10-20,000",...,,,,,0,1,0,0,0,0


In [12]:
descriptive_names = ['duration', 'sex', 'age', 'country', 'education',  'undergrad_major', 'role', 'industry', 'experience', 'salary'] 
mcA.columns = descriptive_names + list(mcA.columns[len(descriptive_names):])

In [None]:
cols = [column for column in mcA.columns if "Part" in column ]
def replace_vals(row):
    for col in cols:
        if pd.isna(row[col]):
            row[col] = 0
        elif row[col]:
            row[col] = 1
    return row
test = mcA.apply(replace_vals, axis=1)

In [None]:
test.head()

In [14]:
from pandas.api.types import CategoricalDtype

# transforming compensation into category type and ordening the values
categ = ['0-10,000', '10-20,000', '20-30,000', '30-40,000', '40-50,000',
         '50-60,000', '60-70,000', '70-80,000', '80-90,000', '90-100,000',
         '100-125,000', '125-150,000', '150-200,000', '200-250,000', '250-300,000',
         '300-400,000', '400-500,000', '500,000+']
cat_type = CategoricalDtype(categories=categ, ordered=True)
mcA['Q9'] = mcA['Q9'].astype(cat_type)
# Doing this we are transforming the category "I do not wish to disclose my approximate yearly compensation" into NaN

# transforming age into category type and sorting the values
categ = ['18-21', '22-24', '25-29', '30-34', '35-39', '40-44', 
         '45-49', '50-54', '55-59', '60-69', '70-79', '80+']
cat_type = CategoricalDtype(categories=categ, ordered=True)
mcA['Q2'] = mcA['Q2'].astype(cat_type)

# transforming years of experience into category type and sorting the values
categ = ['0-1', '1-2', '2-3', '3-4', '4-5', '5-10',
         '10-15', '15-20', '20-25', '25-30', '30+']
cat_type = CategoricalDtype(categories=categ, ordered=True)
personal_data.years_experience = personal_data.years_experience.astype(cat_type)

# transforming education level into category type and sorting the values
categ = ['No formal education past high school', 'Some college/university study without earning a bachelor’s degree',
         'Professional degree', 'Bachelor’s degree', 'Master’s degree', 'Doctoral degree', 'I prefer not to answer']
cat_type = CategoricalDtype(categories=categ, ordered=True)
personal_data.education_level = personal_data.education_level.astype(cat_type)

KeyError: 'Q9'

In [None]:
personal_data.shape

In [None]:
personal_data.to_csv('transformed.csv')