In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Load the raw survey responses (public results file); the path is relative to
# this notebook's directory, so the notebook must be run from its own folder.
df = pd.read_csv('../Data/survey-results-public.csv')
# Preview the first rows to check that the file parsed as expected.
df.head()


Unnamed: 0,Respondent,Professional,ProgramHobby,Country,University,EmploymentStatus,FormalEducation,MajorUndergrad,HomeRemote,CompanySize,...,StackOverflowMakeMoney,Gender,HighestEducationParents,Race,SurveyLong,QuestionsInteresting,QuestionsConfusing,InterestedAnswers,Salary,ExpectedSalary
0,1,Student,"Yes, both",United States,No,"Not employed, and not looking for work",Secondary school,,,,...,Strongly disagree,Male,High school,White or of European descent,Strongly disagree,Strongly agree,Disagree,Strongly agree,,
1,2,Student,"Yes, both",United Kingdom,"Yes, full-time",Employed part-time,Some college/university study without earning ...,Computer science or software engineering,"More than half, but not all, the time",20 to 99 employees,...,Strongly disagree,Male,A master's degree,White or of European descent,Somewhat agree,Somewhat agree,Disagree,Strongly agree,,37500.0
2,3,Professional developer,"Yes, both",United Kingdom,No,Employed full-time,Bachelor's degree,Computer science or software engineering,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A professional degree,White or of European descent,Somewhat agree,Agree,Disagree,Agree,113750.0,
3,4,Professional non-developer who sometimes write...,"Yes, both",United States,No,Employed full-time,Doctoral degree,A non-computer-focused engineering discipline,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A doctoral degree,White or of European descent,Agree,Agree,Somewhat agree,Strongly agree,,
4,5,Professional developer,"Yes, I program as a hobby",Switzerland,No,Employed full-time,Master's degree,Computer science or software engineering,Never,10 to 19 employees,...,,,,,,,,,,


In [4]:
# Load the survey schema file; it has a 'Column' field and a 'Question' field.
df2 = pd.read_csv('../Data/survey-results-schema.csv')

# Show the question text recorded for the CousinEducation column.
df2.loc[df2['Column'] == 'CousinEducation', 'Question'].tolist()

["Let's pretend you have a distant cousin. They are 24 years old, have a college degree in a field not related to computer programming, and have been working a non-coding job for the last two years. They want your advice on how to switch to a career as a software developer. Which of the following options would you most strongly recommend to your cousin?\nLet's pretend you have a distant cousin named Robert. He is 24 years old, has a college degree in a field not related to computer programming, and has been working a non-coding job for the last two years. He wants your advice on how to switch to a career as a software developer. Which of the following options would you most strongly recommend to Robert?\nLet's pretend you have a distant cousin named Alice. She is 24 years old, has a college degree in a field not related to computer programming, and has been working a non-coding job for the last two years. She wants your advice on how to switch to a career as a software developer. Which

In [5]:
# Count how often each distinct CousinEducation answer string occurs.
#
# The labels produced by value_counts() depend on the pandas version: before
# 2.0 reset_index() yields columns ['index', 'CousinEducation'], from 2.0 on it
# yields ['CousinEducation', 'count']. Naming the index and the value column
# explicitly gives ['index', 'CousinEducation'] on every version, which is the
# layout the rest of this notebook expects.
study = (
    df['CousinEducation']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='CousinEducation')
)
study

Unnamed: 0,index,CousinEducation
0,Take online courses; Buy books and work throug...,711
1,Take online courses,551
2,None of these,523
3,Take online courses; Part-time/evening courses...,479
4,Take online courses; Bootcamp; Part-time/eveni...,465
...,...,...
730,Part-time/evening courses; Participate in hack...,1
731,Participate in hackathons; Participate in onli...,1
732,Master's degree; Part-time/evening courses; Co...,1
733,Master's degree; Get a job as a QA tester; Par...,1


一条数据记录会包含回答者选择的所有选项，因此需要对数据进行清洗和处理。

In [6]:
# Clean-up step: give the 'index' column (and the count column) more
# descriptive names before further processing.
study = study.rename(columns={'index': 'method', 'CousinEducation': 'count'})
study.head()

Unnamed: 0,method,count
0,Take online courses; Buy books and work throug...,711
1,Take online courses,551
2,None of these,523
3,Take online courses; Part-time/evening courses...,479
4,Take online courses; Bootcamp; Part-time/eveni...,465


创建一个包含所有方法选项的列表：
可以逐行读取数据，然后对method中的数据通过分号来进行分割。
**注意：数据中有些method的开头多了一个空格，因此在比较之前需要先用strip()去掉首尾空格。**

In [7]:
# Every distinct option seen so far, in order of first appearance.
possible_vals = []


def possible_list(row_list):
    """Append each option from row_list that is not yet in possible_vals.

    Every entry is stripped of surrounding whitespace before it is compared
    and stored, so ' Bootcamp' and 'Bootcamp' count as the same option.
    The module-level list possible_vals is modified in place; nothing is
    returned.
    """
    for raw_option in row_list:
        option = raw_option.strip()
        if option in possible_vals:
            continue
        possible_vals.append(option)


# Each 'method' cell holds all options one respondent group picked, separated
# by semicolons; split it and register the individual options.
for combined_methods in study['method']:
    possible_list(combined_methods.split(';'))

print(possible_vals)

['Take online courses', 'Buy books and work through the exercises', 'None of these', 'Part-time/evening courses', 'Bootcamp', 'Other', 'Return to college', 'Contribute to open source', 'Conferences/meet-ups', 'Get a job as a QA tester', 'Participate in online coding competitions', 'Participate in hackathons', "Master's degree"]


In [8]:
# Total up how often each individual method was selected.
def total_count(df, col1, col2, look_for):
    '''
    Sum the counts of all rows whose text contains each value in look_for.

    INPUT:
    df: DataFrame with one row per distinct answer string and its frequency
        (here: the value counts of CousinEducation).
    col1: name of the column holding the answer strings (here: 'method').
    col2: name of the column holding the per-row counts (here: 'count').
    look_for: iterable of option strings to total up (here: possible_vals).

    OUTPUT:
    new_df: DataFrame with the columns [col1, col2]. It has one row for every
        value of look_for that matched at least one row of df, sorted by col2
        in descending order.

    NOTE: matching uses substring containment (val in text), so a value that
    is contained in the text of a different option is counted for that option
    as well.
    '''

    from collections import defaultdict

    # Read both columns once as plain lists. Access is then positional, so the
    # function no longer requires df to carry a default 0..n-1 index.
    texts = df[col1].tolist()
    counts = df[col2].tolist()

    totals = defaultdict(int)
    # Outer loop over the options keeps the result rows in look_for order
    # (before sorting), exactly as before.
    for val in look_for:
        for text, n in zip(texts, counts):
            # Missing answers (NaN) are not strings and cannot match anything.
            if isinstance(text, str) and val in text:
                totals[val] += int(n)

    new_df = pd.DataFrame(pd.Series(totals)).reset_index()
    new_df.columns = [col1, col2]
    # Sort by the count column actually requested; this used to be hard-coded
    # to 'count' and failed for any other col2.
    new_df.sort_values(col2, ascending=False, inplace=True)
    return new_df

In [9]:
# Aggregate the per-answer-string counts into one total per individual method.
study_df = total_count(df=study, col1='method', col2='count', look_for=possible_vals)
study_df

Unnamed: 0,method,count
0,Take online courses,15246
1,Buy books and work through the exercises,11750
3,Part-time/evening courses,7517
7,Contribute to open source,7423
4,Bootcamp,5276
8,Conferences/meet-ups,5244
6,Return to college,5017
10,Participate in online coding competitions,3610
9,Get a job as a QA tester,3376
11,Participate in hackathons,2747


In [10]:
# Each method's count as a fraction of the sum of all method counts.
study_df['prec'] = study_df['count'] / study_df['count'].sum()
study_df

Unnamed: 0,method,count,prec
0,Take online courses,15246,0.209432
1,Buy books and work through the exercises,11750,0.161408
3,Part-time/evening courses,7517,0.10326
7,Contribute to open source,7423,0.101968
4,Bootcamp,5276,0.072476
8,Conferences/meet-ups,5244,0.072036
6,Return to college,5017,0.068918
10,Participate in online coding competitions,3610,0.04959
9,Get a job as a QA tester,3376,0.046376
11,Participate in hackathons,2747,0.037735


接下来考察这些method与工资收入之间的关系，因此按method重新统计薪资数据。

In [11]:
# Summarise a numeric column (e.g. Salary) separately for every method.
def total_count_salary(df, col_name, col_mean, look_for):
    '''
    Compute summary statistics of col_mean for each value in look_for.

    A row contributes to a value when its col_name entry is not null, contains
    the value as a substring, and its col_mean entry is strictly positive
    (rows with a missing or non-positive amount are ignored).

    INPUT:
    df: the raw survey DataFrame.
    col_name: column holding the semicolon-separated answer strings
        (here: 'CousinEducation').
    col_mean: numeric column to summarise (here: 'Salary').
    look_for: iterable of option strings to evaluate (here: possible_vals).

    OUTPUT:
    df_all: DataFrame with one row per value that matched at least one usable
        row, in look_for order, with the columns
        col_name  - the option string
        col_sum   - sum of col_mean
        col_total - number of contributing rows
        col_square- sum of squared col_mean
        mean_col  - mean of col_mean
        val_col   - population variance, E[x^2] - E[x]^2 (clipped at 0)
        std_col   - square root of val_col
        lower_95 / upper_95 - mean_col -/+ 1.96 * std_col / sqrt(col_total)
    '''

    out_cols = [col_name, 'col_sum', 'col_total', 'col_square',
                'mean_col', 'val_col', 'std_col', 'lower_95', 'upper_95']

    # Decide once which rows can contribute at all. The previous version
    # re-evaluated df[col_name].isnull() for every row and every option, which
    # made the run time quadratic in the number of rows.
    usable = df[col_name].notnull() & (df[col_mean] > 0)
    answers = df.loc[usable, col_name].astype(str)
    amounts = df.loc[usable, col_mean]

    records = []
    for val in look_for:
        # Plain substring test (regex=False), same semantics as `val in text`.
        hit = answers.str.contains(val, regex=False)
        n_hit = int(hit.sum())
        if n_hit == 0:
            # Options without any usable row are left out of the result.
            continue
        picked = amounts[hit]
        records.append({
            col_name: val,
            'col_sum': picked.sum(),
            'col_total': n_hit,
            'col_square': (picked ** 2).sum(),
        })

    if not records:
        # Nothing matched: return an empty frame with the usual columns.
        return pd.DataFrame(columns=out_cols)

    df_all = pd.DataFrame(
        records, columns=[col_name, 'col_sum', 'col_total', 'col_square'])

    # Mean of the amounts per option.
    df_all['mean_col'] = df_all['col_sum'] / df_all['col_total']
    # Variance via E[x^2] - E[x]^2. Rounding can push this marginally below
    # zero, which would turn the standard deviation into NaN, so clip at 0.
    df_all['val_col'] = (
        df_all['col_square'] / df_all['col_total'] - df_all['mean_col'] ** 2
    ).clip(lower=0)
    # Standard deviation.
    df_all['std_col'] = np.sqrt(df_all['val_col'])
    # 95% confidence interval of the mean (normal approximation).
    half_width = 1.96 * df_all['std_col'] / np.sqrt(df_all['col_total'])
    df_all['lower_95'] = df_all['mean_col'] - half_width
    df_all['upper_95'] = df_all['mean_col'] + half_width

    return df_all

In [None]:
# Salary statistics for every recommended method.
salary_df = total_count_salary(
    df=df,
    col_name='CousinEducation',
    col_mean='Salary',
    look_for=possible_vals,
)
salary_df

**如果要进行复杂的分析统计，上述方法的效率会极其低下。**
在实际中不建议这样使用。

In [1]:
# Sort the result so that the method with the highest mean salary comes first.
salary_df.sort_values(by='mean_col', ascending=False)

NameError: name 'salary_df' is not defined

ImportError: DLL load failed: %1 不是有效的 Win32 应用程序。

In [None]:
# Hard-coded 5 x 10 grid of values, two non-zero entries per row.
# NOTE(review): the notebook does not say what these numbers represent or
# where they come from - TODO document the source.
data = [
    [0, 0.004, 0, 0.0181, 0,      0,      0,      0,   0, 0],
    [0, 0,     0, 0.0819, 0.0370, 0,      0,      0,   0, 0],
    [0, 0,     0, 0,      0.063,  0.0569, 0,      0,   0, 0],
    [0, 0,     0, 0,      0,      0.0431, 0.0779, 0,   0, 0],
    [0, 0,     0, 0,      0,      0,      0.0221, 0.1, 0, 0],
]