In [1]:
# Business Understanding

# Question I am trying to answer: "What experiences and background should aspiring developers build to increase salary potential?"

# Questions blog post will answer:
# 1) What are the most important features of the data set, what do they mean, and how do they drive the predicted outcome? 
# 2) What unusual, or creative, insights are you able to gather from the data set?
# 3) How accurate is the model that you have trained to predict the data in the data set?
# 4) What will happen in a creative, predictive, scenario using the model that you have trained?

In [2]:
#Data Understanding

In [3]:
import pandas as pd
#pyplot is the plotting interface conventionally aliased as plt (the top-level matplotlib package has no plotting functions)
import matplotlib.pyplot as plt

#Location of the zipped survey results. This is an absolute local path, so edit this single constant to point at your own copy of the data
SURVEY_DATA_PATH = r'C:\Users\rhoer\OneDrive\Desktop\Udacity\Data-Science-Blog-Post\Developer Survey\survey_results_public.zip'

#Reading the survey CSV straight out of the zip archive
raw_data = pd.read_csv(SURVEY_DATA_PATH, compression = "zip")


In [4]:
#Getting general information: row count, column count, dtype summary and memory usage
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65437 entries, 0 to 65436
Columns: 114 entries, ResponseId to JobSat
dtypes: float64(13), int64(1), object(100)
memory usage: 56.9+ MB


In [5]:
#Checking for null values: number of missing entries in each column
raw_data.isna().sum()

ResponseId                 0
MainBranch                 0
Age                        0
Employment                 0
RemoteWork             10631
                       ...  
JobSatPoints_11        35992
SurveyLength            9255
SurveyEase              9199
ConvertedCompYearly    42002
JobSat                 36311
Length: 114, dtype: int64

In [6]:
#Previewing the first rows to see what the raw survey responses look like
raw_data.head()

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,JobSatPoints_6,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat
0,1,I am a developer by profession,Under 18 years old,"Employed, full-time",Remote,Apples,Hobby,Primary/elementary school,Books / Physical media,,...,,,,,,,,,,
1,2,I am a developer by profession,35-44 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,3,I am a developer by profession,45-54 years old,"Employed, full-time",Remote,Apples,Hobby;Contribute to open-source projects;Other...,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",Books / Physical media;Colleague;On the job tr...,Technical documentation;Blogs;Books;Written Tu...,...,,,,,,,Appropriate in length,Easy,,
3,4,I am learning to code,18-24 years old,"Student, full-time",,Apples,,Some college/university study without earning ...,"Other online resources (e.g., videos, blogs, f...",Stack Overflow;How-to videos;Interactive tutorial,...,,,,,,,Too long,Easy,,
4,5,I am a developer by profession,18-24 years old,"Student, full-time",,Apples,,"Secondary school (e.g. American high school, G...","Other online resources (e.g., videos, blogs, f...",Technical documentation;Blogs;Written Tutorial...,...,,,,,,,Too short,Easy,,


In [7]:
#Data Exploration Findings
    #- There are many null values that I will need to account for before modeling    
    #- There are many columns with object data types that I will need to account for before modeling
    #- There are some columns that have multiple choice answers that have a ';' delimiter
    #- Many rows have null values because different sections of the survey are optional, so many respondents only partially completed the survey
#Survey Exploration Findings
    #- Many of the people surveyed should be excluded based on the business question
    #- There is a question where you answer "Apples" if you are paying attention. All rows without "Apples" in that column should be deleted
    #- There are many questions that are unlikely to have a strong relationship with business question (ex: section dedicated to Stack Overflow usage, survey length)

In [8]:
#Data Preparation

#Keeping only respondents who passed the attention check (answered "Apples" in column "Check")
passed_attention_check = raw_data['Check'].eq("Apples")
data_apples = raw_data.loc[passed_attention_check]
data_apples.info()

#Result: the row count is unchanged, so failed attention checks were most likely filtered out before the dataset was published

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65437 entries, 0 to 65436
Columns: 114 entries, ResponseId to JobSat
dtypes: float64(13), int64(1), object(100)
memory usage: 57.4+ MB


In [9]:
#Keeping only respondents who code as part of their work and who are currently working

#MainBranch answers that say the respondent currently codes professionally in some capacity
pro_dev_responses = [
    'I am a developer by profession',
    'I am not primarily a developer, but I write code sometimes as part of my work/studies',
]

#Employment answers that disqualify a respondent. Employment is a multi-select question and some
#respondents picked every option, so rejecting on any disqualifying answer is safer than accepting on a qualifying one
pro_emp_responses = [
    'Not employed, but looking for work',
    'Not employed, and not looking for work',
    'Student, full-time',
    'Retired',
    'I prefer not to say',
]

#Step 1: filter on whether they code professionally
codes_professionally = data_apples['MainBranch'].isin(pro_dev_responses)
pro_data = data_apples[codes_professionally]

#Step 2: filter on employment status
#Employment answers are ';' delimited, so split them into one column per selected option
pro_data_employment = pro_data['Employment'].str.split(';', expand = True)
pro_data_employment_keys = pro_data_employment.keys()
#A respondent is rejected when any of their selected options is in the exclusion list
has_excluded_status = pro_data_employment.isin(pro_emp_responses).any(axis = 1)
pro_data_employment_mask = ~has_excluded_status
pro_data = pro_data[pro_data_employment_mask]

pro_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49262 entries, 0 to 65435
Columns: 114 entries, ResponseId to JobSat
dtypes: float64(13), int64(1), object(100)
memory usage: 43.2+ MB


In [10]:
#Verifying that rows were filtered correctly
#Assertions make the check fail loudly instead of relying on reading the printed counts

#Every remaining respondent must have one of the accepted MainBranch answers
assert pro_data['MainBranch'].isin(pro_dev_responses).all(), "Unexpected MainBranch answer left after filtering"

#No remaining respondent may have selected any of the excluded employment statuses
remaining_employment = pro_data['Employment'].str.split(';', expand = True)
assert not remaining_employment.isin(pro_emp_responses).any(axis = 1).any(), "Excluded employment status left after filtering"

#Counts per MainBranch answer for a visual check
print(pro_data['MainBranch'].value_counts())

I am a developer by profession                                                           44968
I am not primarily a developer, but I write code sometimes as part of my work/studies     4294
Name: MainBranch, dtype: int64


In [11]:
#Attempting to drop all rows that contain at least one NaN
DropNAData = pro_data.dropna()
DropNAData.info()

#This approach cannot be used: no rows remain afterwards, so every row has a NaN in at least one column

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Columns: 114 entries, ResponseId to JobSat
dtypes: float64(13), int64(1), object(100)
memory usage: 0.0+ bytes


In [12]:
#Count of NaN values in each column

nan_counts = pro_data.isna().sum()
nan_counts

#nan_counts shows that many columns are responsible for the NaN values.

ResponseId                 0
MainBranch                 0
Age                        0
Employment                 0
RemoteWork               229
                       ...  
JobSatPoints_11        21679
SurveyLength            6705
SurveyEase              6686
ConvertedCompYearly    26863
JobSat                 21925
Length: 114, dtype: int64

In [13]:
#Removing columns unrelated to business question

#Columns to discard, grouped by theme
unrelated_columns = [
    #Attention check and survey-quality questions
    'Check',
    'SurveyLength',
    'SurveyEase',
    #Opinions on AI and on its future usage
    'AIChallenges',
    'AIEthics',
    'AIThreat',
    'AINextMuch more integrated',
    'AINextNo change',
    'AINextMore integrated',
    'AINextLess integrated',
    'AINextMuch less integrated',
    #New software preferences
    'BuyNewTool',
    'BuildvsBuy',
    'TechEndorse',
    #Technologies the respondent wants to work with or admires
    'LanguageWantToWorkWith',
    'DatabaseWantToWorkWith',
    'LanguageAdmired',
    'DatabaseAdmired',
    'PlatformWantToWorkWith',
    'PlatformAdmired',
    'WebframeWantToWorkWith',
    'WebframeAdmired',
    'EmbeddedWantToWorkWith',
    'EmbeddedAdmired',
    'MiscTechWantToWorkWith',
    'MiscTechAdmired',
    'ToolsTechWantToWorkWith',
    'ToolsTechAdmired',
    'NEWCollabToolsWantToWorkWith',
    'NEWCollabToolsAdmired',
    'OfficeStackAsyncWantToWorkWith',
    'OfficeStackAsyncAdmired',
    'OfficeStackSyncWantToWorkWith',
    'OfficeStackSyncAdmired',
    'AISearchDevWantToWorkWith',
    'AISearchDevAdmired',
    #Stack Overflow usage
    'NEWSOSites',
    'SOVisitFreq',
    'SOAccount',
    'SOPartFreq',
    'SOHow',
    'SOComm',
    #Further opinions on AI
    'AISent',
    'AIBen',
    'AIAcc',
    'AIComplex',
    'AIToolInterested in Using',
    'AIToolNot interested in Using',
    #Job satisfaction
    'JobSatPoints_1',
    'JobSatPoints_4',
    'JobSatPoints_5',
    'JobSatPoints_6',
    'JobSatPoints_7',
    'JobSatPoints_8',
    'JobSatPoints_9',
    'JobSatPoints_10',
    'JobSatPoints_11',
    'JobSat',
]

col_filtered_data = pro_data.drop(columns = unrelated_columns)

#Notes on column removal justification:

#Check - mark Apples if you are paying attention. This has already been used for filtering so this is no longer required
#SurveyLength - question regarding quality of survey
#SurveyEase - question regarding quality of survey
#AIChallenges - question about opinions on AI and is unrelated to professional coding experience
#AIEthics - question about opinions on AI and is unrelated to professional coding experience
#AIThreat - question about opinions on AI and is unrelated to professional coding experience
#AINextMuch more integrated - question about opinions on AI usage in the future and is unrelated to current professional coding
#AINextNo change - question about opinions on AI usage in the future and is unrelated to current professional coding
#AINextMore integrated - question about opinions on AI usage in the future and is unrelated to current professional coding
#AINextLess integrated - question about opinions on AI usage in the future and is unrelated to current professional coding
#AINextMuch less integrated - question about opinions on AI usage in the future and is unrelated to current professional coding
#BuyNewTool - question is about new software preferences and is unrelated to professional coding
#BuildvsBuy - question is about new software preferences and is unrelated to professional coding
#TechEndorse - question is about new software preferences and is unrelated to professional coding
#LanguageWantToWorkWith - question is not about professional coding experience
#DatabaseWantToWorkWith - question is not about professional coding experience
#LanguageAdmired - question is not about professional coding experience
#DatabaseAdmired - question is not about professional coding experience 
#PlatformWantToWorkWith - question is not about professional coding experience 
#PlatformAdmired - question is not about professional coding experience 
#WebframeWantToWorkWith - question is not about professional coding experience 
#WebframeAdmired - question is not about professional coding experience  
#EmbeddedWantToWorkWith - question is not about professional coding experience  
#EmbeddedAdmired - question is not about professional coding experience  
#MiscTechWantToWorkWith - question is not about professional coding experience
#MiscTechAdmired - question is not about professional coding experience
#ToolsTechWantToWorkWith - question is not about professional coding experience
#ToolsTechAdmired - question is not about professional coding experience
#NEWCollabToolsWantToWorkWith - question is not about professional coding experience
#NEWCollabToolsAdmired - question is not about professional coding experience
#OfficeStackAsyncWantToWorkWith - question is not about professional coding experience
#OfficeStackAsyncAdmired - question is not about professional coding experience
#OfficeStackSyncWantToWorkWith - question is not about professional coding experience
#OfficeStackSyncAdmired - question is not about professional coding experience
#AISearchDevWantToWorkWith - question is not about professional coding experience
#AISearchDevAdmired - question is not about professional coding experience
#NEWSOSites - question is about Stack Overflow and not about professional coding experience
#SOVisitFreq - question is about Stack Overflow and not about professional coding experience
#SOAccount - question is about Stack Overflow and not about professional coding experience
#SOPartFreq - question is about Stack Overflow and not about professional coding experience
#SOHow - question is about Stack Overflow and not about professional coding experience
#SOComm - question is about Stack Overflow and not about professional coding experience
#AISent - question about opinions on AI and is unrelated to professional coding experience
#AIBen - question about opinions on AI and is unrelated to professional coding experience
#AIAcc - question about opinions on AI and is unrelated to professional coding experience
#AIComplex - question about opinions on AI and is unrelated to professional coding experience
#AIToolInterested in Using - question about opinions on AI and is unrelated to professional coding experience
#AIToolNot interested in Using - question about opinions on AI and is unrelated to professional coding experience
#JobSatPoints_1, _4, _5, _6, _7, _8, _9, _10, _11 - questions are about job satisfaction and are unrelated to professional coding experience
#JobSat - question is about job satisfaction and is unrelated to professional coding experience








In [14]:
#Keeping only respondents that answered the TeamsQuestions portion of the survey (TBranch is 'Yes')
#That section matters because a few of its questions give insight into how long they have had their job,
#their independence and their organizational influence

answered_teams_section = col_filtered_data['TBranch'].eq('Yes')
teams_filtered_data = col_filtered_data[answered_teams_section]

In [15]:
#Re-checking the NaN count per column after the column removal and the TeamsQuestions filter
teams_filtered_data.isna().sum()

ResponseId                            0
MainBranch                            0
Age                                   0
Employment                            0
RemoteWork                            7
CodingActivities                      9
EdLevel                               0
LearnCode                            55
LearnCodeOnline                    4806
TechDoc                            7818
YearsCode                            77
YearsCodePro                        134
DevType                              63
OrgSize                             333
PurchaseInfluence                   347
Country                               0
Currency                            247
CompTotal                          6022
LanguageHaveWorkedWith              109
DatabaseHaveWorkedWith             3447
PlatformHaveWorkedWith             6684
WebframeHaveWorkedWith             5809
EmbeddedHaveWorkedWith            18417
MiscTechHaveWorkedWith             8694
ToolsTechHaveWorkedWith            2098


In [12]:
#Modeling

#Temporary notation


In [17]:
#Evaluation