In [38]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score 

In [41]:
data = pd.read_csv("ds_salaries.csv")
y = data['company_size']
data = data.drop(['company_size'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.1, random_state=42)
cols_to_lowercase = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location']
for col in cols_to_lowercase:
    data[col] = data[col].str.lower()
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cols_to_lowercase),
    remainder='passthrough'
)
mlp = MLPClassifier(random_state=321,
                    solver="sgd", 
                    activation="tanh", 
                    alpha=0.01, 
                    hidden_layer_sizes=(2, ), 
                    max_iter=2000, 
                    tol=0.00000001)
pipe = make_pipeline(column_trans, mlp)
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test) 
score = accuracy_score(y_test, predictions)
score

0.6065573770491803

In [49]:
data = pd.read_csv("ds_salaries.csv")
y = data['company_size']
data = data.drop(['company_size'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.1, random_state=42)
cols_to_lowercase = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location']
for col in cols_to_lowercase:
    data[col] = data[col].str.lower()
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cols_to_lowercase),
    remainder='passthrough'
)
mlp = MLPClassifier(random_state=321,
                    solver="sgd", 
                    activation="tanh", 
                    alpha=1, 
                    hidden_layer_sizes=(2, ), 
                    max_iter=2000, 
                    tol=0.00000001)
pipe = make_pipeline(column_trans, mlp)
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test) 
score = accuracy_score(y_test, predictions)
score



0.6065573770491803

In [59]:
data = pd.read_csv("ds_salaries.csv")
y = data['company_size']
data = data.drop(['company_size'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.1, random_state=42)
cols_to_lowercase = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location']
for col in cols_to_lowercase:
    data[col] = data[col].str.lower()
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cols_to_lowercase),
    remainder='passthrough'
)
mlp = MLPClassifier(random_state=321,
                    solver="sgd", 
                    activation="tanh", 
                    alpha=0.001, 
                    hidden_layer_sizes=(2, ), 
                    max_iter=1000, 
                    tol=0.0001)
pipe = make_pipeline(column_trans, mlp)
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test) 
score = accuracy_score(y_test, predictions)
score

0.6065573770491803

In [57]:
data = pd.read_csv("ds_salaries.csv")
y = data['company_size']
data = data.drop(['company_size'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.1, random_state=42)
cols_to_lowercase = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location']
for col in cols_to_lowercase:
    data[col] = data[col].str.lower()
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cols_to_lowercase),
    remainder='passthrough'
)
mlp = MLPClassifier(random_state=321,
                    solver="sgd", 
                    activation="tanh", 
                    alpha=0.001, 
                    hidden_layer_sizes=(4, ), 
                    max_iter=1000, 
                    tol=0.0001)
pipe = make_pipeline(column_trans, mlp)
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test) 
score = accuracy_score(y_test, predictions)
score

0.6229508196721312

In [69]:
data = pd.read_csv("ds_salaries.csv")
y = data['company_size']
data = data.drop(['company_size'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.1, random_state=42)
cols_to_lowercase = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location']
for col in cols_to_lowercase:
    data[col] = data[col].str.lower()
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cols_to_lowercase),
    remainder='passthrough'
)
mlp = MLPClassifier(random_state=321,
                    solver="sgd", 
                    activation="tanh", 
                    alpha=0.001, 
                    hidden_layer_sizes=(7, 5), 
                    max_iter=1000, 
                    tol=0.000001)
pipe = make_pipeline(column_trans, mlp)
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test) 
score = accuracy_score(y_test, predictions)
score



0.6229508196721312

In [66]:
from sklearn.preprocessing import StandardScaler 
data = pd.read_csv("ds_salaries.csv")
y = data['company_size']
data = data.drop(['company_size'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.1, random_state=42)
cols_to_lowercase = ['experience_level', 'employment_type', 'job_title', 'salary_currency', 'employee_residence', 'company_location']
for col in cols_to_lowercase:
    data[col] = data[col].str.lower()
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cols_to_lowercase),
    remainder='passthrough'
)
pipe.fit(X_train, y_train)

In [97]:
def salary_to_cag(salary):
    step = 10000
    mean_salary_begin = data.salary.mean() - step
    mean_salary_end = data.salary.mean() + step
    if salary < 50000:
        return 'Low'
    if salary < 70000:
        return 'Lower then mean'
    if salary < 150000:
        return 'High'
    if salary > 125000:
        return 'Higher than mean'
    return 'Mean'

In [98]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [99]:
data = pd.read_csv('ds_salaries.csv', index_col='db_id')
df = data.loc[:, ['company_location', 'job_title', 'experience_level', 'salary_in_usd']]
X = df.drop('salary_in_usd', axis='columns')
y = df.salary_in_usd
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['company_location', 'job_title', 'experience_level']),
    remainder='passthrough'
)
clf = DecisionTreeRegressor( random_state=42)
pipe = make_pipeline(column_trans, clf)
pipe.fit(X_train, y_train)
importances = clf.feature_importances_

In [100]:
from sklearn.metrics import accuracy_score 
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
data = pd.read_csv('ds_salaries.csv', index_col='db_id')
data['salary_in_usd'] = data['salary_in_usd'].apply(salary_to_cag)
df = data.loc[:, ['company_location', 'job_title', 'experience_level', 'salary_in_usd']]
X = df.drop('salary_in_usd', axis='columns')
y = df.salary_in_usd
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['company_location', 'job_title', 'experience_level']),
    remainder='passthrough'
)
clf = DecisionTreeClassifier(random_state=42)
pipe = make_pipeline(column_trans, clf)
pipe.fit(X_train, y_train)
importances = clf.feature_importances_
predictions = pipe.predict(X_test) 
score = accuracy_score(y_test, predictions)
score

0.5409836065573771

In [107]:
data = pd.read_csv('ds_salaries.csv', index_col='db_id')
data['salary_in_usd'] = data['salary_in_usd'].apply(salary_to_cag)
df = data.loc[:, ['company_location', 'job_title', 'experience_level', 'salary_in_usd']]
X = df.drop('salary_in_usd', axis='columns')
y = df.salary_in_usd
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, random_state=42)
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['company_location', 'job_title', 'experience_level']),
    remainder='passthrough'
)
mlp = MLPClassifier(random_state=321,
                    solver="sgd", 
                    activation="tanh", 
                    alpha=0.01, 
                    hidden_layer_sizes=(12, ), 
                    max_iter=5000, 
                    tol=0.0001)
pipe = make_pipeline(column_trans, mlp)
pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test) 
score2 = accuracy_score(y_test, predictions)
score2

0.5737704918032787

1