In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as  sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [64]:
df = pd.read_csv('ds_salaries.csv')

In [65]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [66]:
#Finding the unique values of all the variables
print('Unique Values:')
columns = list(df.columns)
for col in columns:
    u = df[col].nunique()
    print(f'{col} = {u}')

Unique Values:
work_year = 4
experience_level = 4
employment_type = 4
job_title = 93
salary = 815
salary_currency = 20
salary_in_usd = 1035
employee_residence = 78
remote_ratio = 3
company_location = 72
company_size = 3


Based on the unique values, we can conclude the following:
- There are 4 types of employement along with work_year and experience_level.
- There are 93 different types of job profiles in data science.
- Company size can be L - "Large", M - "Medium" and S - "small."
- There are 3 types of working culture, which can be remote, hybrid, in-office.


In [67]:
#finding different unique values in columns
column = df[['experience_level', 'employment_type', 'remote_ratio']]
for col in column:
    Unique = df[col].unique()
    print(f'{col} = {Unique}')

experience_level = ['SE' 'MI' 'EN' 'EX']
employment_type = ['FT' 'CT' 'FL' 'PT']
remote_ratio = [100   0  50]


In [68]:
# replacing abbreviations with relevant and more clear name
df['experience_level'] = df['experience_level'].replace(['EN', 'MI', 'SE', 'EX'],['Entry-Level', 'Mid-Level', 'Senior-Level', 'Executive-Level'])
df['employment_type'] = df['employment_type'].replace(['PT', 'FT', 'FL', 'CT'], 
                                                      ['Part-Time', 'Full-Time', 'Freelance', 'Contract'])
df['remote_ratio'] = df['remote_ratio'].replace([100, 0, 50], ['remote', 'on-site', 'hybrid'])

df['company_size'] = df['company_size'].replace(["L", "M", "S"], ['Large', 'Medium', 'Small'])

In [69]:
df.shape

(3755, 11)

In [70]:
df["company_size"].unique()

array(['Large', 'Small', 'Medium'], dtype=object)

In [71]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
work_year,3755.0,2022.373635,0.691448,2020.0,2022.0,2022.0,2023.0,2023.0
salary,3755.0,190695.571771,671676.500508,6000.0,100000.0,138000.0,180000.0,30400000.0
salary_in_usd,3755.0,137570.38988,63055.625278,5132.0,95000.0,135000.0,175000.0,450000.0


- On the basis of descriptive statistics of the dataframe, we can conclude that we have data points of 4 years starting from the year 2020 to 2023.
- Minimum salary in the field of Data science is 6000 with maximum being as high as 30400000.

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   object
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(3), object(8)
memory usage: 322.8+ KB


## *DATA PREPROCESSING*

In [76]:

# Define a custom function to assign the salary range
def assign_salary_range(salary):
    if 0 < salary <= 450000:
        range_start = int(salary / 15000) * 15000
        range_end = range_start + 15000
        return f'{range_start}-{range_end}'
    else:
        return 'Other'
    
# Create the new column based on the salary range
df['salary_range'] = df['salary_in_usd'].apply(assign_salary_range)




In [77]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd



# Columns to encode
columns_to_encode = ['experience_level', 'employment_type', 'job_title', 'remote_ratio', 'company_location', 'company_size','salary_range']

# Encoding labels
label_encoder = LabelEncoder()
for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column])



In [78]:

columns_to_drop = ['work_year', 'salary', 'salary_in_usd', 'employee_residence', 'salary_currency', 'salary_range']
X = df.drop(columns=columns_to_drop, axis = 1)


In [79]:
y = df["salary_range"]

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

# Random Forest Regression
random_forest = RandomForestRegressor()
random_forest.fit(X_train, y_train)

# Predicting on the test set
linear_regression_preds = linear_regression.predict(X_test)
random_forest_preds = random_forest.predict(X_test)

# Evaluating the models
linear_regression_rmse = mean_squared_error(y_test, linear_regression_preds, squared=False)
random_forest_rmse = mean_squared_error(y_test, random_forest_preds, squared=False)

print("Linear Regression RMSE:", linear_regression_rmse)
print("Random Forest RMSE:", random_forest_rmse)

Linear Regression RMSE: 9.776715423458153
Random Forest RMSE: 9.394350986587215


In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score as acs
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)


y_train = np.array(y_train)
y_train = y_train.reshape(-1)


y_test = np.array(y_test)
y_test = y_test.reshape(-1)


# KNN


kn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
kn.fit(X_train, y_train)


# Training and testing set from Decision tree


dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)


# Naive Bayes

nb = GaussianNB()
nb.fit(X_train, y_train)

# Random forest


rfc = RandomForestClassifier(n_estimators=50, max_depth=15)
rfc.fit(X_train, y_train)



predictionKN = kn.predict(X_test)
predictionDT = dt.predict(X_test)
predictionNB = nb.predict(X_test)
predictionRFC = rfc.predict(X_test)

    

scoreKN = acs(y_test, predictionKN)
scoreDT = acs(y_test, predictionDT)
scoreNB = acs(y_test, predictionNB)
scoreRFC = acs(y_test, predictionRFC)

print(scoreKN, scoreDT, scoreNB, scoreRFC)


0.12379110251450677 0.12379110251450677 0.02127659574468085 0.11798839458413926


In [82]:
import xgboost as xgb

# Init classifier
xgb_cl = xgb.XGBClassifier()

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)

# Score
acs(y_test, preds)


0.12379110251450677