In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair: comparing version strings is
# lexicographic, so e.g. "0.9" would wrongly count as newer than "0.20".
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None, "unrecognised scikit-learn version string"
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20)

# Common imports
import pandas as pd
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for every figure: 14 for axis labels, 12 for tick labels
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"

input_file = "ds_salaries.csv"
data = pd.read_csv(input_file, header = 0)
data.head()
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
import matplotlib.pyplot as plt

# Scatter the raw salary column against work_year on an explicit figure/axes pair
work_years = data['work_year']
fig, ax = plt.subplots()
ax.scatter(work_years, data['salary'])

# One x tick per integer between the smallest and the largest work_year
first_year = int(min(work_years))
last_year = int(max(work_years))
ax.set_xticks(range(first_year, last_year + 1, 1))

# Axis labels and title
ax.set_xlabel('work_year')
ax.set_ylabel('salary')
ax.set_title('salary vs work_year')

# Render the figure
plt.show()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# One histogram per numeric column, 50 bins each, on a 20x15 inch figure
data.hist(bins=50, figsize=(20,15))

plt.show()

# Clean up the data set


#### Check if there are any missing values in our columns

In [None]:
# Check for missing values

# Number of missing entries in every column
missing_per_column = data.isna().sum()
# Keep only the columns that have at least one missing value
columns_with_miss = missing_per_column[missing_per_column != 0]
# Report how many columns are affected
print('Columns with missing values:', len(columns_with_miss))
# Show the affected columns, the one with the most missing values first
columns_with_miss.sort_values(ascending=False)

#### Drop the redundant columns: salary, salary_currency (both covered by salary_in_usd).

In [None]:
# Drop the two salary columns that salary_in_usd already covers.
# Rebinding `data` (rather than dropping in place) together with
# errors='ignore' keeps the cell re-runnable: running it a second time is a
# no-op instead of a KeyError for the already-removed columns.
data = data.drop(columns=['salary', 'salary_currency'], errors='ignore')
data

### Transform String data to Integer values
#### First, print each column's possible values to see what we are working with

In [None]:
# Every column whose dtype is not int64 gets its value frequencies printed
non_int_columns = [name for name in data.columns if data[name].dtype != 'int64']
for name in non_int_columns:
    print(data[name].value_counts())
# company_location is printed once more on its own at the end of the output
print(data['company_location'].value_counts())

#### Codify simple, ordinal data
experience_level (entry, mid, senior, executive)

In [None]:
# Encode the experience levels as increasing integers (EN=1 ... EX=4).
# The result is assigned back to the column: replace(..., inplace=True) on the
# Series returned by data[...] is chained assignment, which recent pandas
# versions warn about and which leaves `data` unchanged under Copy-on-Write.
# infer_objects() gives a fully mapped column an integer dtype even when
# replace() itself keeps the object dtype.
data['experience_level'] = data['experience_level'].replace({
    'EN': 1,
    'MI': 2,
    'SE': 3,
    'EX': 4,
}).infer_objects()

company_size (S, M, L)

In [None]:
# Encode the company sizes as increasing integers (S=1, M=2, L=3).
# The result is assigned back to the column: replace(..., inplace=True) on the
# Series returned by data[...] is chained assignment, which recent pandas
# versions warn about and which leaves `data` unchanged under Copy-on-Write.
# infer_objects() gives a fully mapped column an integer dtype even when
# replace() itself keeps the object dtype.
data['company_size'] = data['company_size'].replace({
    'S': 1,
    'M': 2,
    'L': 3
}).infer_objects()

remote_ratio (0, 50, 100) ->  (0, 1, 2)

In [None]:
# Rescale the remote ratio from (0, 50, 100) to the ordinal codes (0, 1, 2).
# The result is assigned back to the column: replace(..., inplace=True) on the
# Series returned by data[...] is chained assignment, which recent pandas
# versions warn about and which leaves `data` unchanged under Copy-on-Write.
data['remote_ratio'] = data['remote_ratio'].replace({
    0: 0,
    50: 1,
    100: 2
})

#### Codify non-ordinal data
by converting a column to multiple binary value columns

In [None]:
def one_hot_encode_column(data, column):
    """Return a new frame in which `column` is replaced by indicator columns.

    One indicator column named ``<column>_<value>`` is created per distinct
    value of `column` and appended after the remaining columns. The frame
    passed in is left untouched.
    """
    indicator_columns = pd.get_dummies(data[column], prefix=column)
    remaining_columns = data.drop(columns=[column])
    return pd.concat([remaining_columns, indicator_columns], axis=1)
# Default threshold: values occurring fewer times than this are binned as "Other"
min_occurences = 50


def bin_values(data, column, min_count=None):
    """Replace the rare values of `column` with the label "Other".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify; its `column` is overwritten in place.
    column : str
        Name of the column whose rare values are binned.
    min_count : int, optional
        Values occurring fewer than `min_count` times are binned. Defaults to
        the module-level `min_occurences`.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with `column` updated.
    """
    if min_count is None:
        min_count = min_occurences
    # Number of occurrences of every distinct value
    counts = data[column].value_counts()
    # Values that occur fewer than `min_count` times
    rare_values = counts.index[counts < min_count]
    # Assign the result back to the column: an inplace replace on the Series
    # returned by data[column] is chained assignment, which leaves `data`
    # unchanged when pandas runs with Copy-on-Write.
    data[column] = data[column].mask(data[column].isin(rare_values), "Other")
    return data

employment_type
FT    2973
PT      15
CT       9
FL       7

In [None]:
# Replace employment_type with one indicator column per employment type
data = one_hot_encode_column(data, 'employment_type')
data

job_title simplification (reduce it to 4 main job categories, plus "Other" for every title that matches none of them)

In [None]:
def map_job_title_to_category(job_title):
    """Map a free-form job title to a coarse job category.

    The title is matched case-insensitively against a fixed list of keywords;
    the first keyword contained in the title decides the category. Titles
    containing none of the keywords are mapped to "Other".
    """
    # Order matters: earlier keywords win when a title contains several of them
    keyword_to_category = (
        ("analyst", "Data Analyst"),
        ("machine", "Machine Learning Engineer"),
        ("scientist", "Data Scientist"),
        ("engineer", "Data Engineer"),
    )
    lowered_title = job_title.lower()
    for keyword, category in keyword_to_category:
        if keyword in lowered_title:
            return category
    return "Other"

# Create a new column called "job_category" and remove the old job_title column
data["job_category"] = data["job_title"].apply(map_job_title_to_category)
data = data.drop(columns='job_title')
# Show how many rows fall into each category. This has to be the last
# expression of the cell: in the middle of a cell its result is discarded.
data["job_category"].value_counts()



In [None]:
# Bin job categories with fewer than `min_occurences` rows as "Other", then
# replace job_category with one indicator column per remaining category
data = bin_values(data, 'job_category')
data = one_hot_encode_column(data, 'job_category')

In [None]:
# Inspect the frame after encoding job_category
data

employee_residence (bin all countries with fewer than `min_occurences` occurrences into "Other")

In [None]:
# Bin residence countries with fewer than `min_occurences` rows as "Other", then
# replace employee_residence with one indicator column per remaining country
data = bin_values(data, 'employee_residence')
data = one_hot_encode_column(data, 'employee_residence')

In [None]:
# Inspect the frame after encoding employee_residence
data

company_location (bin all countries with fewer than `min_occurences` occurrences into "Other")

In [None]:
# Bin company countries with fewer than `min_occurences` rows as "Other", then
# replace company_location with one indicator column per remaining country
data = bin_values(data, 'company_location')
data = one_hot_encode_column(data, 'company_location')

In [None]:
# Inspect the frame after encoding company_location
data

In [None]:
# Count how many times each distinct row (all columns combined) occurs
data.value_counts()

# Test set

In [None]:
# Seed NumPy's global random generator to make this notebook's output identical at every run
np.random.seed(42)

In [None]:
from sklearn.model_selection import train_test_split

# Purely random split: 20% of the rows go to the test set, with a fixed seed
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# stratify by experience level since that is one of the most important attributes
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields positional row numbers, so the rows are selected with .iloc.
# .loc would treat them as index labels, which only happens to work while
# `data` still carries its default 0..n-1 index.
for train_index, test_index in split.split(data, data["experience_level"]):
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]

In [None]:
# Share of each experience level in the stratified test set
strat_test_set["experience_level"].value_counts() / len(strat_test_set)

In [None]:
# Share of each experience level in the stratified training set
strat_train_set["experience_level"].value_counts() / len(strat_train_set)

#### Check distribution of Experience_level in different sets: Overall, Stratified, Randomized

In [None]:
def income_cat_proportions(data):
    """Return the share of rows of `data` that fall into each experience level.

    The result is a Series indexed by the distinct `experience_level` values;
    each entry is that value's row count divided by the total number of rows.
    """
    level_counts = data["experience_level"].value_counts()
    total_rows = len(data)
    return level_counts / total_rows

# Random (unstratified) split used as the reference for the comparison
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Experience-level proportions in the full data and in both test sets
proportions_by_sample = {
    "Overall": income_cat_proportions(data),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}
compare_props = pd.DataFrame(proportions_by_sample).sort_index()

# Relative deviation, in percent, of each test set from the overall proportions
overall_props = compare_props["Overall"]
compare_props["Rand. %error"] = 100 * compare_props["Random"] / overall_props - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / overall_props - 100

In [None]:
# Proportions per experience level for each sampling strategy, with the
# percentage error of the random and stratified test sets vs. the full data
compare_props

In [None]:
# Look for the correlation between the salary and the rest of the attributes
# NOTE(review): from here on `data` is a copy of the stratified training set,
# no longer the full data set used by the cells above
data = strat_train_set.copy()
corr_matrix = data.corr()
# Correlation of every column with salary_in_usd, largest value first
corr_matrix["salary_in_usd"].sort_values(ascending=False)