In [99]:
import numpy as np
import math, copy
import matplotlib.pyplot as plt
import pandas as pd

In [100]:
df = pd.read_csv('Data Science Jobs Salaries.csv')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021e,EN,FT,Data Science Consultant,54000,EUR,64369,DE,50,DE,L
1,2020,SE,FT,Data Scientist,60000,EUR,68428,GR,100,US,L
2,2021e,EX,FT,Head of Data Science,85000,USD,85000,RU,0,RU,M
3,2021e,EX,FT,Head of Data,230000,USD,230000,RU,50,RU,L
4,2021e,EN,FT,Machine Learning Engineer,125000,USD,125000,US,100,US,S


In [101]:
dfcopy = df

In [102]:
df.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [103]:
df.shape

(245, 11)

In [104]:
df.describe()

Unnamed: 0,salary,salary_in_usd,remote_ratio
count,245.0,245.0,245.0
mean,502541.8,99868.012245,69.183673
std,2276230.0,83983.326949,37.593421
min,4000.0,2876.0,0.0
25%,60000.0,45896.0,50.0
50%,103000.0,81000.0,100.0
75%,174000.0,130000.0,100.0
max,30400000.0,600000.0,100.0


In [105]:
df.isna().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [106]:
df.duplicated().sum()

1

In [107]:
df.drop_duplicates(inplace = True)

In [108]:
df.drop(columns = {'salary_currency', 'salary'}, inplace = True)
df.head(2)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021e,EN,FT,Data Science Consultant,64369,DE,50,DE,L
1,2020,SE,FT,Data Scientist,68428,GR,100,US,L


In [109]:
df.work_year.value_counts()

2021e    178
2020      66
Name: work_year, dtype: int64

In [110]:
df.work_year = df.work_year.str.replace('e', '').astype(int)

In [111]:
df.work_year.value_counts()

2021    178
2020     66
Name: work_year, dtype: int64

### Preparing data

#### Convert data from categorical to numeric

In [112]:
from sklearn.preprocessing import LabelEncoder
leb = LabelEncoder()

In [113]:
df.company_size.value_counts()

L    131
S     58
M     55
Name: company_size, dtype: int64

In [114]:
df.company_size = leb.fit_transform(df.company_size)
df.company_size.value_counts()

0    131
2     58
1     55
Name: company_size, dtype: int64

In [115]:
df.employee_residence.value_counts()

US    92
IN    22
DE    18
FR    13
GB    13
CA     9
ES     7
JP     4
NL     4
GR     4
BR     4
PL     3
PT     3
TR     3
IT     3
PK     3
RU     3
MX     2
DK     2
RO     2
AT     2
NG     2
SG     2
HU     2
VN     2
LU     1
MD     1
SI     1
HK     1
CN     1
HR     1
BE     1
CL     1
KE     1
IR     1
NZ     1
CO     1
BG     1
RS     1
PR     1
JE     1
AE     1
UA     1
PH     1
MT     1
Name: employee_residence, dtype: int64

In [116]:
df.employee_residence = leb.fit_transform(df.employee_residence)
df.employee_residence.value_counts()

43    92
18    22
9     18
12    13
13    13
5      9
11     7
22     4
29     4
14     4
4      4
33     3
35     3
41     3
20     3
32     3
38     3
27     2
10     2
36     2
1      2
28     2
39     2
17     2
44     2
24     1
25     1
40     1
15     1
7      1
16     1
2      1
6      1
23     1
19     1
30     1
8      1
3      1
37     1
34     1
21     1
0      1
42     1
31     1
26     1
Name: employee_residence, dtype: int64

In [117]:
df.experience_level.value_counts()

MI    102
SE     77
EN     54
EX     11
Name: experience_level, dtype: int64

In [118]:
df.experience_level = leb.fit_transform(df.experience_level)
df.experience_level.value_counts()

2    102
3     77
0     54
1     11
Name: experience_level, dtype: int64

In [119]:
df.employment_type = leb.fit_transform(df.employment_type)
df.employment_type.value_counts()

2    230
3      7
0      4
1      3
Name: employment_type, dtype: int64

In [120]:
df.job_title.value_counts()

Data Scientist                              58
Data Engineer                               38
Machine Learning Engineer                   20
Data Analyst                                20
Research Scientist                          10
Data Science Consultant                      7
Big Data Engineer                            6
Data Science Manager                         5
Lead Data Engineer                           5
BI Data Analyst                              5
AI Scientist                                 5
Principal Data Scientist                     5
Director of Data Science                     4
Machine Learning Scientist                   4
Data Engineering Manager                     3
Business Data Analyst                        3
ML Engineer                                  3
Data Analytics Engineer                      3
Lead Data Analyst                            3
Data Analytics Manager                       3
Computer Vision Engineer                     3
Computer Visi

In [121]:
df.job_title = leb.fit_transform(df.job_title)
df.job_title.value_counts()

20    58
15    38
32    20
11    20
41    10
17     7
6      6
19     5
29     5
4      5
1      5
39     5
23     4
34     4
16     3
7      3
31     3
12     3
28     3
13     3
9      3
10     2
30     2
38     2
8      2
33     2
27     2
26     2
18     2
22     2
25     1
35     1
3      1
24     1
40     1
0      1
42     1
2      1
14     1
36     1
5      1
37     1
21     1
Name: job_title, dtype: int64

In [122]:
df.company_location.value_counts()

US    108
DE     18
IN     17
GB     16
FR     11
CA     11
ES      7
JP      4
AT      3
NL      3
PL      3
DK      3
TR      3
BR      2
GR      2
PK      2
LU      2
CN      2
NG      2
RU      2
PT      2
MX      2
UA      1
IL      1
HR      1
AS      1
VN      1
MD      1
IT      1
CH      1
AE      1
SI      1
BE      1
CL      1
IR      1
NZ      1
CO      1
KE      1
HU      1
SG      1
MT      1
Name: company_location, dtype: int64

In [123]:
df.company_location = leb.fit_transform(df.company_location)
df.company_location.value_counts()

39    108
10     18
19     17
14     16
13     11
5      11
12      7
22      4
2       3
29      3
32      3
11      3
37      3
4       2
15      2
31      2
24      2
8       2
28      2
34      2
33      2
27      2
38      1
18      1
16      1
1       1
40      1
25      1
21      1
6       1
0       1
36      1
3       1
7       1
20      1
30      1
9       1
23      1
17      1
35      1
26      1
Name: company_location, dtype: int64

In [124]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2021,0,2,17,64369,9,50,10,0
1,2020,3,2,20,68428,14,100,39,0
2,2021,1,2,27,85000,38,0,34,1
3,2021,1,2,26,230000,38,50,34,0
4,2021,0,2,32,125000,43,100,39,2


### Data

In [125]:
#temp = df.drop(columns = {'salary_in_usd'})

lis_train = df[['work_year', 	'experience_level',	'employment_type',	'job_title', 'employee_residence',	
              'remote_ratio',	'company_location',	'company_size']].apply(lambda x: [x['work_year'], x['experience_level'],	x['employment_type'],	x['job_title'], x['employee_residence'],	
              x['remote_ratio'],	x['company_location'],	x['company_size']], axis= 1)
lis_train

0        [2021, 0, 2, 17, 9, 50, 10, 0]
1      [2020, 3, 2, 20, 14, 100, 39, 0]
2        [2021, 1, 2, 27, 38, 0, 34, 1]
3       [2021, 1, 2, 26, 38, 50, 34, 0]
4      [2021, 0, 2, 32, 43, 100, 39, 2]
                     ...               
240    [2020, 3, 2, 20, 43, 100, 39, 0]
241    [2021, 2, 2, 39, 43, 100, 39, 0]
242    [2020, 0, 2, 20, 43, 100, 39, 2]
243     [2020, 0, 0, 7, 43, 100, 39, 0]
244     [2021, 3, 2, 19, 18, 50, 19, 0]
Length: 244, dtype: object

In [126]:
df[['work_year', 	'experience_level',	'employment_type',	'job_title', 'employee_residence',	
              'remote_ratio',	'company_location',	'company_size']].to_numpy()

array([[2021,    0,    2, ...,   50,   10,    0],
       [2020,    3,    2, ...,  100,   39,    0],
       [2021,    1,    2, ...,    0,   34,    1],
       ...,
       [2020,    0,    2, ...,  100,   39,    2],
       [2020,    0,    0, ...,  100,   39,    0],
       [2021,    3,    2, ...,   50,   19,    0]], dtype=int64)

In [127]:
dftrain

Unnamed: 0,work_year,experience_level,employment_type,job_title,employee_residence,remote_ratio,company_location,company_size
0,2021,0,2,17,9,50,10,0
1,2020,3,2,20,14,100,39,0
2,2021,1,2,27,38,0,34,1
3,2021,1,2,26,38,50,34,0
4,2021,0,2,32,43,100,39,2
...,...,...,...,...,...,...,...,...
240,2020,3,2,20,43,100,39,0
241,2021,2,2,39,43,100,39,0
242,2020,0,2,20,43,100,39,2
243,2020,0,0,7,43,100,39,0


In [136]:
import pandas as pd

# Assuming you have a DataFrame called df
ddf = pd.DataFrame({'column1': [2104, 1416, 852],
                   'column2': [5, 3, 2],
                   'column3': [1, 2, 1],
                   'column4': [45, 40, 35]})

# Convert DataFrame to the desired list format
data = np.array(ddf.transpose().values.tolist())

# Print the list
print(data)

[[2104 1416  852]
 [   5    3    2]
 [   1    2    1]
 [  45   40   35]]


In [137]:
dftrain = df[['work_year', 	'experience_level',	'employment_type',	'job_title', 'employee_residence',	
              'remote_ratio', 'company_location',	'company_size']]
x_train = np.array(dftrain.transpose().values.tolist())

In [130]:
y_train = df.salary_in_usd
y_train.head()

0     64369
1     68428
2     85000
3    230000
4    125000
Name: salary_in_usd, dtype: int64

### Model Function

In [131]:
def predict_single_loop(x, w, b): 
    n = x.shape[0]
    p = 0
    for i in range(n):
        p_i = x[i] * w[i]  
        p = p + p_i         
    p = p + b                
    return p

In [132]:
def compute_cost(X, y, w, b): 
    m = X.shape[0]
    cost = 0.0
    for i in range(m):                                
        f_wb_i = np.dot(X[i], w) + b           #(n,)(n,) = scalar (see np.dot)
        cost = cost + (f_wb_i - y[i])**2       #scalar
    cost = cost / (2 * m)                      #scalar    
    return cost

In [133]:
def compute_gradient(X, y, w, b): 
    m,n = X.shape           #(number of examples, number of features)
    dj_dw = np.zeros((n,))
    dj_db = 0.

    for i in range(m):                             
        err = (np.dot(X[i], w) + b) - y[i]   
        for j in range(n):                         
            dj_dw[j] = dj_dw[j] + err * X[i, j]    
        dj_db = dj_db + err                        
    dj_dw = dj_dw / m                                
    dj_db = dj_db / m                                
        
    return dj_db, dj_dw

In [140]:
b_filal, w_filal = compute_gradient(x_train, y_train, (10,10,10,10,10,10,10, 10), 10)
#Compute and display gradient 
tmp_dj_db, tmp_dj_dw = compute_gradient(x_train, x_train, w_filal, b_filal)

ValueError: shapes (244,) and (8,) not aligned: 244 (dim 0) != 8 (dim 0)