In [140]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import tqdm

In [141]:
# Load the salary dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    'salary_prediction.csv',
    index_col=0,
)

In [142]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,country,role,employer_industry,years_experience,yearly_compensation,Q11_Part_1,Q11_Part_2,Q11_Part_3,Q11_Part_4,Q11_Part_5,Q11_Part_6
2,30-34,Other,Other,Other,5-10,"10-20,000",0,0,0,0,0,1
3,30-34,United States of America,Data Scientist,Other,0-1,"0-10,000",1,0,0,0,0,0
5,22-24,India,Data Analyst,Other,0-1,"0-10,000",0,0,0,0,0,1
7,35-39,Other,Other,Academics/Education,10-15,"10-20,000",0,0,0,0,0,1
8,18-21,India,Other,Other,0-1,"0-10,000",0,1,0,0,0,0


In [143]:
# One-hot encode every column except the label; the label column is kept
# separately as the prediction target.
features = pd.get_dummies(df.drop(columns=['yearly_compensation']))
target = df['yearly_compensation']

In [144]:
# Salary brackets listed in ascending order; the list order defines the
# ordering of the categorical dtype below.
categ = ['0-10,000', '10-20,000', '20-30,000', '30-40,000', '40-50,000',
         '50-60,000', '60-70,000', '70-80,000', '80-90,000', '90-100,000',
         '100-125,000', '125-150,000', '150-200,000', '200-250,000', '>250,000']
# Use the pandas-qualified name: a bare `CategoricalDtype` is not imported in
# this notebook and raises NameError on a fresh kernel.
cat_type = pd.CategoricalDtype(categories=categ, ordered=True)
# Values not listed in `categ` become NaN after this cast.
target = target.astype(cat_type)

In [155]:
def train_models(features, target, base_model, params):
    """
    Fit one binary classifier per threshold of an ordered categorical target.

    For every category except the last, a fresh ``base_model(**params)`` is
    fitted to separate rows whose target lies strictly above that category
    (label 1) from rows at or below it (label 0).

    Parameters
    ----------
    features : feature matrix passed unchanged to ``fit``.
    target : pandas Series with a categorical dtype; the order of
        ``target.cat.categories`` defines the thresholds.
    base_model : callable returning an unfitted estimator with ``fit(X, y)``.
    params : dict of keyword arguments forwarded to ``base_model``.

    Returns
    -------
    list
        Fitted models, one per threshold (``len(categories) - 1`` entries),
        in category order.

    Raises
    ------
    ValueError
        If ``target`` contains missing values, which cannot be assigned to
        either side of a threshold.
    """
    categories = target.cat.categories
    # Integer position of each row's category; -1 marks a missing value.
    codes = target.cat.codes
    if (codes < 0).any():
        raise ValueError("target contains missing values (not in its categories)")

    n_thresholds = max(len(categories) - 1, 0)
    models = []
    for i in tqdm.tqdm(range(n_thresholds), total=n_thresholds):
        # 1 when the row's category comes after the i-th one, else 0.
        binary_target = (codes > i).astype(int)
        mod = base_model(**params)
        mod.fit(features, binary_target)
        models.append(mod)
    return models

In [157]:
# Train the threshold classifiers with logistic regression as the base model.
models = train_models(
    features,
    target,
    LogisticRegression,
    {"random_state": 42, "solver": "lbfgs"},
)

14it [00:05,  2.57it/s]


In [169]:
import pickle

In [170]:
# Persist the list of fitted models to disk.
with open(file='salary_model.pcl', mode='wb') as f:
    pickle.dump(models, file=f)

In [175]:
# Read the models back to check the round trip.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
with open(file='salary_model.pcl', mode='rb') as f:
    try_models = pickle.load(file=f)

In [161]:
from functools import reduce  # no longer used here; kept in case other cells rely on the name


def prod(x, y):
    """Return ``x * y`` (binary multiplication helper)."""
    return x * y


def seq_predict_sample(sample, models):
    """
    Combine a chain of binary classifiers into a distribution over categories.

    Each model's ``predict_proba(sample)`` is read for its first row only:
    column 0 is taken as the "negative" probability and column 1 as the
    "positive" probability. Category ``i`` receives the product of the
    positive probabilities of models ``0..i-1`` times the negative probability
    of model ``i``; the final category receives the product of all positive
    probabilities.

    NOTE(review): this product treats each model's positive probability as
    conditional on having passed the previous thresholds, while
    ``train_models`` fits every model on the full data set - confirm that this
    is the intended interpretation.

    Parameters
    ----------
    sample : single-row input accepted by each model's ``predict_proba``.
    models : sequence of fitted binary classifiers, in threshold order.

    Returns
    -------
    tuple
        ``(index of the most probable category, list of category probabilities)``.
        The list has ``len(models) + 1`` entries; with no models it is ``[1.0]``.
    """
    bin_probabilities = []
    # Running product of the positive probabilities seen so far. Starting from
    # 1.0 keeps the multiplication order of a left fold and avoids recomputing
    # each prefix product from scratch.
    reach_prob = 1.0
    for mod in models:
        first_row = mod.predict_proba(sample)[0]
        neg_prob, pos_prob = first_row[0], first_row[1]
        bin_probabilities.append(prod(reach_prob, neg_prob))
        reach_prob = prod(reach_prob, pos_prob)
    bin_probabilities.append(reach_prob)
    return np.argmax(bin_probabilities), bin_probabilities

In [147]:
# Single all-zero row with the same columns as `features`. The width is taken
# from `features` directly so this cell does not depend on `cols`, which is
# only defined in a later cell.
sample = pd.DataFrame(np.zeros((1, len(features.columns))), columns=features.columns)

In [135]:
# Q11 answer text -> name of the indicator column that the answer switches on.
q11_map = dict([
    ("None of these activities are an important part of my role at work",
     "Q11_Part_6"),
    ("Do research that advances the state of the art of machine learning",
     "Q11_Part_5"),
    ("Build prototypes to explore applying machine learning to new areas",
     "Q11_Part_4"),
    ("Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data",
     "Q11_Part_3"),
    ("Build and/or run a machine learning service that operationally improves my product or workflows",
     "Q11_Part_2"),
    ("Analyze and understand data to influence product or business decisions",
     "Q11_Part_1"),
])

In [136]:
# Names of the model's input columns, in training order.
cols = features.columns.tolist()
def form_input_to_sample(age, country, industry, role, experience, q11):
    """
    Build a single-row feature frame from form answers.

    The row starts as all zeros over the module-level ``cols`` and gets a 1 in
    the column of each selected category (``age_<age>``, ``country_<country>``,
    ``employer_industry_<industry>``, ``role_<role>``,
    ``years_experience_<experience>``) and in the column that ``q11_map``
    assigns to each selected Q11 answer.

    Parameters
    ----------
    age, country, industry, role, experience : str
        Category values; each must correspond to an existing column.
    q11 : iterable of str
        Selected Q11 answer texts; each must be a key of ``q11_map``.

    Returns
    -------
    pandas.DataFrame
        One row with exactly the columns in ``cols``.

    Raises
    ------
    KeyError
        If a Q11 answer is not in ``q11_map`` or a resulting column name is
        not in ``cols``. Without this check an unknown value would silently
        append a new column and the frame would no longer match the columns
        the models were trained on.
    """
    active_columns = [
        'age_' + age,
        'country_' + country,
        'employer_industry_' + industry,
        'role_' + role,
        'years_experience_' + experience,
    ]
    active_columns.extend(q11_map[value] for value in q11)

    known_columns = set(cols)
    unknown = [name for name in active_columns if name not in known_columns]
    if unknown:
        raise KeyError(f"Unknown feature column(s): {unknown}")

    sample = pd.DataFrame(np.zeros((1, len(cols))), columns=cols)
    for name in active_columns:
        sample[name] = 1
    return sample
    

In [167]:
# Example form submission converted to a model-ready row.
sample_tf = form_input_to_sample(
    age='18-21',
    country='Brazil',
    industry='Other',
    role='Student',
    experience='>15',
    q11=['Do research that advances the state of the art of machine learning'],
)

In [168]:
# Most probable category index and the full distribution for the example row.
seq_predict_sample(sample=sample_tf, models=models)

(0,
 [0.7046607899818838,
  0.2562497759234744,
  0.03838824648734072,
  0.0006833248069617659,
  1.735696577707625e-05,
  4.915676697242235e-07,
  1.3585357841939851e-08,
  6.445176941933233e-10,
  3.428091875728108e-11,
  2.5360167291155782e-12,
  1.9447067370598507e-13,
  5.518411312459705e-15,
  1.0884639473254323e-16,
  1.522572106228656e-18,
  7.698024167015982e-21])