In [1]:
DF_PATH = "../data/processed/cleaned_data.pkl"
ROLE_COL=['DevType']
TEC_COLS = ['LanguageHaveWorkedWith',
                  'DatabaseHaveWorkedWith',
                  'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith',
                  'ToolsTechHaveWorkedWith',
                  'NEWCollabToolsHaveWorkedWith']
MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [2]:
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly
import os 
import pathlib as path 

import mlflow 
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from  sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import kneighbors_graph

from matplotlib import pyplot as plt

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()
    
    return quality_scores

In [4]:
df = pd.read_pickle(DF_PATH)

In [5]:
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end",...,skills_group_20,skills_group_21,skills_group_22,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,2,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,6,0,2,2,0,0,2
8,0,0,0,0,0,0,0,0,0,1,...,0,0,0,3,0,0,0,0,0,2
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,2,1,0,2,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,0,0,0,0,0,0,1,0,0,0,...,0,0,0,2,0,0,0,2,0,0
83435,0,0,0,0,0,0,0,0,0,0,...,0,0,0,3,1,0,0,0,0,1
83436,0,0,1,0,1,0,0,0,0,0,...,0,0,0,5,0,1,2,2,1,3
83437,0,0,0,0,0,0,1,0,0,0,...,0,0,0,5,0,1,2,0,2,3


In [6]:
roles_df = df['DevType'].copy()
roles_df.sum(axis=0)

Academic researcher                               1708
Data or business analyst                          1658
Data scientist or machine learning specialist     2460
Database administrator                            1210
DevOps specialist                                 3056
Developer, QA or test                             1135
Developer, back-end                              17084
Developer, desktop or enterprise applications     4845
Developer, embedded applications or devices       2138
Developer, front-end                              8932
Developer, full-stack                            20655
Developer, game or graphics                        899
Developer, mobile                                 4751
Engineer, data                                    1941
Scientist                                         1046
System administrator                              2069
dtype: int64

In [8]:
#Resample roles
samples_per_class = 1200
resampled_roles =[]
for role_col in roles_df.columns:
    sub_df= roles_df.loc[roles_df[role_col]==1].copy()
    
    if len(sub_df) < samples_per_class:
        #unsample
        sub_df= sub_df.sample(samples_per_class, replace=True, random_state=0)
    else:
        #Downloadsample
        sub_df= sub_df.sample(samples_per_class, random_state =0)
    resampled_roles.append(sub_df)    

In [9]:
#contrust dfs
roles_df= pd.concat(resampled_roles)
df= df.loc[roles_df.index].copy()

In [10]:
roles_df.sum(axis=0)

Academic researcher                              2280
Data or business analyst                         1965
Data scientist or machine learning specialist    2576
Database administrator                           1765
DevOps specialist                                2170
Developer, QA or test                            1514
Developer, back-end                              5710
Developer, desktop or enterprise applications    2690
Developer, embedded applications or devices      1773
Developer, front-end                             2614
Developer, full-stack                            5602
Developer, game or graphics                      1441
Developer, mobile                                2155
Engineer, data                                   2046
Scientist                                        1910
System administrator                             2110
dtype: int64

# SPlit to train & test  

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(df.drop('DevType',axis=1),df['DevType'],random_state=0)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop('DevType',axis=1),df['DevType'],random_state=0)


In [12]:
#training model 
#intiailtize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

In [13]:
rf_clf= make_pipeline(RobustScaler(),
                     PCA(n_components=0.95),RandomForestClassifier(n_jobs=8,verbose=1,random_state=0))
rf_clf.fit(X_train.values,Y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   19.1s finished


Pipeline(steps=[('robustscaler', RobustScaler()),
                ('pca', PCA(n_components=0.95)),
                ('randomforestclassifier',
                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])

In [14]:
#evaluate on train set
predictions = pd.DataFrame(rf_clf.predict(X_train.values),columns= Y_train.columns)
train_scores = {score.__name__:calculate_quality(Y_train,predictions,score) 
                for score in [accuracy_score,precision_score,recall_score,f1_score]}
train_scores =pd.concat(train_scores,axis =1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.9s finished


In [16]:
#Evaluate on test set
predictions = pd.DataFrame(rf_clf.predict(X_test.values), columns=Y_test.columns)
test_scores = {score.__name__:calculate_quality(Y_test,predictions,score)
              for score in [accuracy_score,precision_score,recall_score,f1_score]}
test_scores = pd.concat(test_scores, axis=1)
mean_test_scores = test_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished


In [17]:
print(mean_test_scores)
test_scores.sort_values('precision_score')

accuracy_score     92.671875
precision_score    94.156875
recall_score       50.010625
f1_score           64.225000
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, full-stack",83.81,78.56,61.09,68.73
"Developer, back-end",79.27,83.87,38.02,52.32
"Developer, front-end",89.92,89.66,33.86,49.16
Data scientist or machine learning specialist,94.1,90.47,62.96,74.25
Academic researcher,94.92,93.58,60.25,73.3
"Developer, desktop or enterprise applications",91.02,94.54,34.99,51.08
"Developer, mobile",93.92,95.15,47.75,63.59
Scientist,97.1,95.41,75.56,84.33
"Developer, embedded applications or devices",94.56,95.86,38.94,55.38
DevOps specialist,92.73,96.48,35.96,52.39


In [21]:
#log 
#Data details 
data_details= {"data_path":DF_PATH,
              "training_indices":X_train.index.to_list(),
              "test_indices":X_test.index.to_list(),
              "features_names":X_train.columns.droplevel(0).tolist(),
              "targets_names":Y_train.columns.tolist()}
with open(os.path.join(LOG_PATH,LOG_DATA_PKL),"wb") as output_file:
    pickle.dump(data_details,output_file)

In [22]:
#model
model ={"model_description": "Random Forest: with PCA - Basic",
         "model_details": str(rf_clf),
         "model_object": rf_clf}
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL),"wb") as output_file:
    pickle.dump(model, output_file)

In [23]:
#preformance details
classes_metrcis ={"train_scores":train_scores,
                 "test_scores":test_scores}
with open(os.path.join(LOG_PATH,LOG_METRICS_PKL),"wb") as output_file:
    pickle.dump(classes_metrcis, output_file)
    

In [24]:
#start a new run and track
with mlflow.start_run(experiment_id=exp.experiment_id,
                     run_name=model["model_description"]):
    mlflow.log_artifacts(LOG_PATH)
    for metrcis, score in mean_test_scores.items():
        mlflow.log_metric(metrcis, score)

AttributeError: 'NoneType' object has no attribute 'experiment_id'

In [25]:
#Hyperparameter tuning 
hpt_rf_clf = make_pipeline(RobustScaler(),PCA(),RandomForestClassifier(n_jobs=8,verbose=1,random_state=0))

In [26]:
tuned_parameters=[{
    'pca__n_components': [0.7, 0.85, 0.95],
    'randomforestclassifier__n_estimators': [250, 500],
    'randomforestclassifier__max_depth':    [3, 10, None],}]

In [27]:
hpt_rf_clf=GridSearchCV(hpt_rf_clf,tuned_parameters)
hpt_rf_clf.fit(X_train.values,Y_train.values)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   48.9s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   54.2s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   56.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    6.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    8.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed: 

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.8s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    8.8s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   11.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   10.6s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   24.6s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   28.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.2s
[Paral

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.6s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    4.1s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    3.9s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   21.3s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   43.6s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   49.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.7s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    3.1s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n

[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   16.5s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   21.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.4s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    3.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   16.9s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   22.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.8s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    2.5s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 con

[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   52.3s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   59.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    3.3s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   22.4s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   52.7s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:  1.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.4s
[Paral

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    3.3s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    8.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   32.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   35.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    3.3s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    3.8s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    7.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   35.5s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   46.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    1.9s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    6.8s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   32.6s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   43.8s finished
[P

GridSearchCV(estimator=Pipeline(steps=[('robustscaler', RobustScaler()),
                                       ('pca', PCA()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(n_jobs=8,
                                                               random_state=0,
                                                               verbose=1))]),
             param_grid=[{'pca__n_components': [0.7, 0.85, 0.95],
                          'randomforestclassifier__max_depth': [3, 10, None],
                          'randomforestclassifier__n_estimators': [250, 500]}])