In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython import display
%matplotlib inline
from collections import defaultdict
# Load the survey responses from the CSV in the current working directory,
# then preview the first rows as the cell output.
df = pd.read_csv('./survey-results-public.csv')
df.head()

Unnamed: 0,Respondent,Professional,ProgramHobby,Country,University,EmploymentStatus,FormalEducation,MajorUndergrad,HomeRemote,CompanySize,...,StackOverflowMakeMoney,Gender,HighestEducationParents,Race,SurveyLong,QuestionsInteresting,QuestionsConfusing,InterestedAnswers,Salary,ExpectedSalary
0,1,Student,"Yes, both",United States,No,"Not employed, and not looking for work",Secondary school,,,,...,Strongly disagree,Male,High school,White or of European descent,Strongly disagree,Strongly agree,Disagree,Strongly agree,,
1,2,Student,"Yes, both",United Kingdom,"Yes, full-time",Employed part-time,Some college/university study without earning ...,Computer science or software engineering,"More than half, but not all, the time",20 to 99 employees,...,Strongly disagree,Male,A master's degree,White or of European descent,Somewhat agree,Somewhat agree,Disagree,Strongly agree,,37500.0
2,3,Professional developer,"Yes, both",United Kingdom,No,Employed full-time,Bachelor's degree,Computer science or software engineering,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A professional degree,White or of European descent,Somewhat agree,Agree,Disagree,Agree,113750.0,
3,4,Professional non-developer who sometimes write...,"Yes, both",United States,No,Employed full-time,Doctoral degree,A non-computer-focused engineering discipline,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A doctoral degree,White or of European descent,Agree,Agree,Somewhat agree,Strongly agree,,
4,5,Professional developer,"Yes, I program as a hobby",Switzerland,No,Employed full-time,Master's degree,Computer science or software engineering,Never,10 to 19 employees,...,,,,,,,,,,


In [7]:
def mean_amt(df, col_name, col_mean, look_for):
    '''
    Summarise a numeric column for each search string found in a text column.

    A row contributes to a search string when df[col_name] is not null,
    the string occurs inside that value, and df[col_mean] is greater than 0
    (missing and non-positive amounts are ignored).  One row can contribute
    to several search strings.

    INPUT:
    df - the pandas dataframe you want to search (any index is accepted)
    col_name - the column name you want to look through
    col_mean - the column you want the mean amount for
    look_for - a list of strings you want to search for in each row of df[col_name]

    OUTPUT:
    df_all - one row per search string that had at least one qualifying row
             (strings with no qualifying rows are omitted), in look_for
             order, with the columns:
               col_name    - the search string
               col_sum     - sum of col_mean over the qualifying rows
               col_total   - number of qualifying rows
               col_squares - sum of squared col_mean over the qualifying rows
               mean_col    - col_sum / col_total
               var_col     - population variance (col_squares / col_total - mean_col**2)
               std_col     - square root of var_col
               lower_95, upper_95 - mean_col -/+ 1.96 * std_col / sqrt(col_total)
    '''
    z_95 = 1.96  # normal-approximation multiplier for a 95% interval

    # Filter once, up front, and work on positional arrays so the result does
    # not depend on the dataframe having a 0..n-1 index.
    has_label = df[col_name].notnull().values
    is_positive = (df[col_mean] > 0).values  # NaN > 0 is False, so NaN is dropped
    usable = has_label & is_positive
    labels = df[col_name].values[usable]
    amounts = df[col_mean].values[usable]

    rows = []
    # dict.fromkeys de-duplicates look_for while keeping the caller's order,
    # so a repeated search string cannot be counted twice.
    for val in dict.fromkeys(look_for):
        hits = np.fromiter((val in label for label in labels),
                           dtype=bool, count=len(labels))
        total = int(hits.sum())
        if total == 0:
            # Nothing to average; leave the value out instead of dividing by 0.
            continue
        matched = amounts[hits]
        rows.append((val, matched.sum(), total, (matched ** 2).sum()))

    df_all = pd.DataFrame(rows, columns=[col_name, 'col_sum', 'col_total', 'col_squares'])
    if df_all.empty:
        # An empty frame would otherwise carry object columns that break the maths below.
        df_all = df_all.astype({'col_sum': float, 'col_total': int, 'col_squares': float})

    # Additional columns needed for analysis
    df_all['mean_col'] = df_all['col_sum'] / df_all['col_total']
    # Round-off can push E[x^2] - E[x]^2 slightly below zero; clip so that
    # the square root never yields NaN.
    df_all['var_col'] = (df_all['col_squares'] / df_all['col_total']
                         - df_all['mean_col'] ** 2).clip(lower=0)
    df_all['std_col'] = np.sqrt(df_all['var_col'])
    half_width = z_95 * df_all['std_col'] / np.sqrt(df_all['col_total'])
    df_all['lower_95'] = df_all['mean_col'] - half_width
    df_all['upper_95'] = df_all['mean_col'] + half_width
    return df_all

In [8]:
# Developer roles to compare.  mean_amt matches each one as a substring of
# the respondent's DeveloperType answer.
possible_vals = ['Web developer','Desktop applications developer','Mobile developer',
                 'Other','DevOps specialist','Embedded applications/devices developer',
                 'Systems administrator','Data scientist','Database administrator',
                 'Graphic designer','Developer with a statistics or mathematics background',
                 'Machine learning specialist','Quality assurance engineer','Graphics programming']

# Professional developers only, split by reported gender.  reset_index()
# gives each subset the 0..n-1 index that mean_amt's row lookups rely on.
df_prof_male = df[(df['Gender']== 'Male') & (df['Professional']=='Professional developer')].reset_index()
df_prof_female = df[(df['Gender']== 'Female') & (df['Professional']=='Professional developer')].reset_index()

# Per-role salary statistics for each group.
dev_male_perc = mean_amt(df_prof_male, 'DeveloperType', 'Salary', possible_vals)
dev_female_perc = mean_amt(df_prof_female, 'DeveloperType', 'Salary', possible_vals)

# Join on the role name only.  Passing left_index/right_index together with
# `on` is rejected by current pandas, and a positional join would pair the
# wrong roles whenever one group is missing a role the other has.
comp_df = pd.merge(dev_male_perc, dev_female_perc, on='DeveloperType', suffixes=('_male', '_female'))

# Relative gap: (male mean - female mean) divided by the average of the two
# means.  Positive values mean the male mean salary is higher.
comp_df['Diff_male_female_Vals'] = (comp_df['mean_col_male'] - comp_df['mean_col_female'])/(0.5*(comp_df['mean_col_male'] + comp_df['mean_col_female']))
comp_newdf = comp_df[['Diff_male_female_Vals','DeveloperType']]
comp_newdf.style.bar(subset=['Diff_male_female_Vals'], align='mid', color=['#d65f5f', '#5fba7d'])



Unnamed: 0,Diff_male_female_Vals,DeveloperType
0,-0.050643,Web developer
1,0.059627,Desktop applications developer
2,-0.109416,Mobile developer
3,0.188453,Other
4,-0.098411,DevOps specialist
5,-0.013246,Embedded applications/devices developer
6,0.137084,Systems administrator
7,-0.035932,Data scientist
8,0.171435,Database administrator
9,0.009989,Graphic designer
