# Stack Overflow Developer Survey 2022

A Stack Overflow Developer Survey é uma pesquisa anual realizada pela plataforma Stack Overflow, que coleta informações sobre a comunidade de desenvolvedores. A pesquisa abrange uma variedade de tópicos, como linguagens de programação, ferramentas, práticas de desenvolvimento e satisfação profissional. Os resultados são analisados e publicados em um relatório que fornece insights sobre tendências e percepções dos desenvolvedores. A pesquisa é uma fonte importante de informações para profissionais de tecnologia e empresas de desenvolvimento de software.

Com esse projeto, pretendemos analisar os dados da pesquisa de 2022 a fim de construir e comparar regressores para a predição de salários de desenvolvedores de software. Para isso, utilizaremos técnicas de aprendizado de máquina e estatística.

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [75]:
# load raw data
# remember to download the data from
# https://www.kaggle.com/datasets/dheemanthbhat/stack-overflow-annual-developer-survey-2022
# https://insights.stackoverflow.com/survey

raw = pd.read_csv('raw/survey_results_public.csv')

# select columns of interest

target_col = 'ConvertedCompYearly'
features_cols = ['Employment', 'RemoteWork', 'EdLevel', 'YearsCode', 'YearsCodePro', 'Country', 'Age']
#
# maybe add 'Age'	'Gender'	'Trans'	'Sexuality'	'Ethnicity'	'Accessibility'
#

# remove rows with missing data

raw = raw[raw[target_col].notnull()]
raw = raw[raw[features_cols].notnull().all(axis=1)]

raw = raw[raw['Age'] != 'Prefer not to say']

# split multi label columns

raw['Employment'] = raw['Employment'].str.split(';')

# drop columns that are not of interest

raw = raw.drop(raw.columns.difference(features_cols + [target_col]), axis=1)

print('Total number of samples in the dataset ', raw.shape)
raw

Total number of samples in the dataset  (37698, 8)


Unnamed: 0,Employment,RemoteWork,EdLevel,YearsCode,YearsCodePro,Country,Age,ConvertedCompYearly
2,"[Employed, full-time]","Hybrid (some remote, some in-person)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",14,5,United Kingdom of Great Britain and Northern I...,25-34 years old,40205.0
3,"[Employed, full-time]",Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",20,17,Israel,35-44 years old,215232.0
8,"[Employed, full-time]","Hybrid (some remote, some in-person)","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",6,6,Netherlands,25-34 years old,49056.0
10,"[Employed, full-time]","Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",5,2,United Kingdom of Great Britain and Northern I...,18-24 years old,60307.0
11,"[Employed, full-time, Independent contractor, ...",Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",12,10,United States of America,35-44 years old,194400.0
...,...,...,...,...,...,...,...,...
73114,"[Employed, full-time, Independent contractor, ...","Hybrid (some remote, some in-person)","Associate degree (A.A., A.S., etc.)",7,2,Germany,18-24 years old,41058.0
73116,"[Employed, full-time]","Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",21,16,United States of America,35-44 years old,115000.0
73118,"[Employed, full-time]",Fully remote,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",4,3,Nigeria,25-34 years old,57720.0
73119,"[Employed, full-time]","Hybrid (some remote, some in-person)","Bachelor’s degree (B.A., B.S., B.Eng., etc.)",5,1,United States of America,25-34 years old,70000.0


In [82]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MultiLabelBinarizer

# preprocess categorical data

one_hot_transformer = Pipeline([
    ('onehot', OneHotEncoder())
])

# Custom transformer to apply multi-label binarization
class MultiHotEncoder(BaseEstimator, TransformerMixin):
    """Wraps `MultiLabelBinarizer` in a form that can work with `ColumnTransformer`. Note
    that input X has to be a `pandas.DataFrame`.
    """
    def __init__(self):
        self.mlbs = list()
        self.n_columns = 0
        self.categories_ = self.classes_ = list()

    def fit(self, X:pd.DataFrame, y=None):
        for i in range(X.shape[1]): # X can be of multiple columns
            mlb = MultiLabelBinarizer()
            mlb.fit(X.iloc[:,i])
            self.mlbs.append(mlb)
            self.classes_.append(mlb.classes_)
            self.n_columns += 1
        return self

    def transform(self, X:pd.DataFrame):
        if self.n_columns == 0:
            raise ValueError('Please fit the transformer first.')
        if self.n_columns != X.shape[1]:
            raise ValueError(f'The fit transformer deals with {self.n_columns} columns '
                             f'while the input has {X.shape[1]}.'
                            )
        result = list()
        for i in range(self.n_columns):
            result.append(self.mlbs[i].transform(X.iloc[:,i]))

        result = np.concatenate(result, axis=1)
        return result

    def get_feature_names_out(self, input_features=None):
        cats = self.categories_
        if input_features is None:
            input_features = self.columns
        elif len(input_features) != len(self.categories_):
            raise ValueError(
                "input_features should have length equal to number of "
                "features ({}), got {}".format(len(self.categories_),
                                               len(input_features)))

        feature_names = []
        for i in range(len(cats)):
            names = [input_features[i] + "_" + str(t) for t in cats[i]]
            feature_names.extend(names)

        return np.asarray(feature_names, dtype=object)

multilabel_transformer = Pipeline([
    ('multihot', MultiHotEncoder())
])

ordinal_transformer = Pipeline([
    ('ordinal', OrdinalEncoder())
])

one_hot_cols = ['Country', 'RemoteWork']
multilabel_cols = ['Employment']
ordinal_cols = ['EdLevel', 'Age']

column_trans_preprocessor = ColumnTransformer(
    [('one_hot', one_hot_transformer, one_hot_cols),
     ('multihot', multilabel_transformer, multilabel_cols),
     ('ordinal', ordinal_transformer, ordinal_cols),
     ('pass', 'passthrough', ['YearsCode', 'YearsCodePro'])],
    remainder='drop')

column_trans_preprocessor.fit_transform(raw)

column_trans_preprocessor.get_feature_names_out()

# Apply one-hot encoding to categorical columns


# Label encoding for ordinal columns

# Apply mapping to 'Age'

#data_encoded['Age'] = raw['Age'].map(age_mapping)

# drop old columns

# concatenate the original data with the new one-hot encoded columns

#data_encoded


ValueError: For a sparse output, all columns should be a numeric or convertible to a numeric.

In [104]:
# normalize the variables (????)

In [105]:
from sklearn.model_selection import train_test_split

# Split the data into train, validation, and test sets with proportions 70:10:20
X_train, X_val_test, y_train, y_val_test = train_test_split(
    raw_features,
    raw_target,
    test_size=0.3,
    random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_val_test,
    y_val_test,
    test_size=0.67,
    random_state=42
)

# check the shapes of the resulting train, validation, and test sets
# X and y have the same shape

print('Train set shape: ', X_train.shape)
print('Validation set shape: ', X_val.shape)
print('Test set shape: ', X_test.shape)

Train set shape:  (26423, 184)
Validation set shape:  (3737, 184)
Test set shape:  (7588, 184)


In [106]:
# stolen code didn't check

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

regressor = LinearRegression()  # Create an instance of the regressor
regressor.fit(X_train, y_train)  # Train the model

y_pred = regressor.predict(X_test)  # Predict the target variable

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R-squared:', r2)

ValueError: could not convert string to float: 'Less than 1 year'