## **Estimating the salary of a Data Scientist in Brazil**
Based on the 2019 survey conducted by the Data Hackers community.

In [None]:
# imports

import pandas as pd
from sklearn.linear_model import LinearRegression

In [None]:
# load the raw survey responses into a DataFrame

dataset = '/kaggle/input/pesquisa-data-hackers-2019/datahackers-survey-2019-anonymous-responses.csv'
df = pd.read_csv(filepath_or_buffer=dataset)

In [None]:
# keep only the survey questions used below; these strings are used verbatim
# as column labels, so they must match exactly (including the 'degreee' spelling)

columns = [
    "('P3', 'living_in_brasil')",
    "('P8', 'degreee_level')",
    "('P17', 'time_experience_data_science')",
    "('P18', 'time_experience_before')",
    "('P19', 'is_data_science_professional')",
    "('P16', 'salary_range')",
]

df = df[columns]

In [None]:
# row filtering: one boolean condition per requirement, combined in a single pass

# the salary range must have been answered
has_salary = df["('P16', 'salary_range')"].notna()

# only data science professionals living in Brazil
is_professional = df["('P19', 'is_data_science_professional')"] == 1
lives_in_brazil = df["('P3', 'living_in_brasil')"] == 1

# exclude respondents who answered that they have no formal degree
not_without_degree = df["('P8', 'degreee_level')"] != 'Não tenho graduação formal'

df = df[has_salary & is_professional & lives_in_brazil & not_without_degree]

# the two filter columns are constant now, so they are no longer needed
df = df.drop(columns=["('P3', 'living_in_brasil')", "('P19', 'is_data_science_professional')"])

In [None]:
# salary mapping: replace each salary-range label by one numeric value
# (the middle of the bracket; the two open-ended brackets use their bound)

salary_to_num = {
    'Menos de R$ 1.000/mês': 1000,
    'de R$ 1.001/mês a R$ 2.000/mês': 1500,
    'de R$ 2.001/mês a R$ 3000/mês': 2500,
    'de R$ 3.001/mês a R$ 4.000/mês': 3500,
    'de R$ 4.001/mês a R$ 6.000/mês': 5000,
    'de R$ 6.001/mês a R$ 8.000/mês': 7000,
    'de R$ 8.001/mês a R$ 12.000/mês': 10000,
    'de R$ 12.001/mês a R$ 16.000/mês': 14000,
    'de R$ 16.001/mês a R$ 20.000/mês': 18000,
    'de R$ 20.001/mês a R$ 25.000/mês': 22500,
    'Acima de R$ 25.001/mês': 25000}

salary_labels = df["('P16', 'salary_range')"]
salary_values = salary_labels.map(salary_to_num)

# Series.map() silently yields NaN for any label missing from the dict (this
# also happens when the cell is re-run on already-converted numbers). Fail
# here with the offending labels instead of producing NaN targets.
unmapped = salary_labels[salary_values.isna() & salary_labels.notna()].unique()
if len(unmapped):
    raise ValueError('Unmapped salary ranges: {}'.format(list(unmapped)))

df["('P16', 'salary_range')"] = salary_values

In [None]:
# experience mapping: replace each experience-range label by a representative
# number of years; both "no experience" answers become 0

exp_to_num = {
    'Menos de 1 ano': 0.5,
    'de 1 a 2 anos': 1.5,
    'de 2 a 3 anos': 2.5,
    'de 4 a 5 anos': 4.5,
    'de 6 a 10 anos': 8,
    'Mais de 10 anos': 14,
    'Não tenho experiência na área de dados': 0,
    'Não tive experiência na área de TI/Engenharia de Software antes de começar a trabalhar na área de dados': 0}

# the same mapping applies to both experience questions
for exp_col in ("('P17', 'time_experience_data_science')",
                "('P18', 'time_experience_before')"):
    exp_labels = df[exp_col]
    exp_values = exp_labels.map(exp_to_num)

    # Series.map() silently yields NaN for any label missing from the dict.
    # Values that were already missing are left untouched, but a non-null
    # label without a mapping is reported explicitly.
    unmapped = exp_labels[exp_values.isna() & exp_labels.notna()].unique()
    if len(unmapped):
        raise ValueError('Unmapped experience labels in {}: {}'.format(exp_col, list(unmapped)))

    df[exp_col] = exp_values

In [None]:
# degree mapping: encode each degree level as a number (renamed further
# down to 'Years of academic study')

degree_encoding = {
    'Estudante de Graduação': 2.5,
    'Graduação/Bacharelado': 5,
    'Pós-graduação': 6,
    'Mestrado': 7,
    'Doutorado ou Phd': 11}

degree_labels = df["('P8', 'degreee_level')"]
degree_values = degree_labels.map(degree_encoding)

# Series.map() silently yields NaN for any label missing from the dict.
# Values that were already missing are left untouched, but a non-null
# label without a mapping is reported explicitly.
unmapped = degree_labels[degree_values.isna() & degree_labels.notna()].unique()
if len(unmapped):
    raise ValueError('Unmapped degree levels: {}'.format(list(unmapped)))

df["('P8', 'degreee_level')"] = degree_values

In [None]:
# replace the raw survey column labels with human-readable names

readable_names = {
    "('P8', 'degreee_level')": 'Years of academic study',
    "('P17', 'time_experience_data_science')": 'Experience in data science (in years)',
    "('P18', 'time_experience_before')": 'Experience in related fields (in years)',
    "('P16', 'salary_range')": 'Monthly salary (in R$)',
}

df = df.rename(columns=readable_names)

In [None]:
# fitting the model: linear regression of the monthly salary on the
# three numeric features

X_train = df[['Years of academic study',
              'Experience in data science (in years)',
              'Experience in related fields (in years)']]

y_train = df['Monthly salary (in R$)']

# `normalize=True` is intentionally not passed: the parameter was deprecated
# in scikit-learn 1.0 and removed in 1.2, where passing it raises TypeError.
# For ordinary least squares it did not change the predictions.
model = LinearRegression().fit(X_train, y_train)

In [None]:
# estimating the salary of a recent graduate without experience

predict = pd.DataFrame({
    'Years of academic study': [5],
    'Experience in data science (in years)': [0],
    'Experience in related fields (in years)': [0]})

# model.predict() returns an array with one entry per input row; pick the
# single entry explicitly, because int() on a 1-D array is deprecated since
# NumPy 1.25
print('Monthly salary = R$', int(model.predict(predict)[0]))

In [None]:
# estimating the salary of a person with a degree and 3 years of experience in a related field

predict = pd.DataFrame({
    'Years of academic study': [5],
    'Experience in data science (in years)': [0],
    'Experience in related fields (in years)': [3]})

# model.predict() returns an array with one entry per input row; pick the
# single entry explicitly, because int() on a 1-D array is deprecated since
# NumPy 1.25
print('Monthly salary = R$', int(model.predict(predict)[0]))

In [None]:
# estimating the salary of someone with a master's degree and 3 years of experience in data science

predict = pd.DataFrame({
    'Years of academic study': [7],
    'Experience in data science (in years)': [3],
    'Experience in related fields (in years)': [0]})

# model.predict() returns an array with one entry per input row; pick the
# single entry explicitly, because int() on a 1-D array is deprecated since
# NumPy 1.25
print('Monthly salary = R$', int(model.predict(predict)[0]))

In [None]:
# estimating the salary of a very experienced data scientist with a PhD

predict = pd.DataFrame({
    'Years of academic study': [11],
    'Experience in data science (in years)': [15],
    'Experience in related fields (in years)': [0]})

# model.predict() returns an array with one entry per input row; pick the
# single entry explicitly, because int() on a 1-D array is deprecated since
# NumPy 1.25
print('Monthly salary = R$', int(model.predict(predict)[0]))