In [1]:
import os
import joblib

project_dir = os.getcwd()
data_dir = os.path.join(project_dir, "data")
model_dir = os.path.join(project_dir, "model")

In [2]:
import pandas as pd
from tqdm import tqdm

pd.options.display.max_colwidth = 255
tqdm.pandas()

In [3]:
X_train = joblib.load(f"{data_dir}/x_train.pkl")
X_val = joblib.load(f"{data_dir}/x_val.pkl")
y_train = joblib.load(f"{data_dir}/y_train.pkl")
y_val = joblib.load(f"{data_dir}/y_val.pkl")
y_classes = joblib.load(f"{data_dir}/y_classes.pkl")

### Train model

In [4]:
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier

xgb_classifier = XGBClassifier(max_depth=5,
                               eta=0.2,
                               gamma=4,
                               min_child_weight=6,
                               subsample=0.8,
                               early_stopping_rounds=10,
                               num_round=200,
                               n_jobs=-1)

clf = OneVsRestClassifier(xgb_classifier)
clf.fit(X_train, y_train)



Parameters: { "early_stopping_rounds", "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stopping_rounds", "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stopping_rounds", "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you 

Parameters: { "early_stopping_rounds", "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stopping_rounds", "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "early_stopping_rounds", "num_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you 

OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            early_stopping_rounds=10,
                                            enable_categorical=False, eta=0.2,
                                            gamma=4, gpu_id=None,
                                            importance_type=None,
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=5,
                                            min_child_weight=6, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=-1,
     

In [5]:
joblib.dump(clf, f"{model_dir}/one_vs_rest_classifier.pkl")

['C:\\Users\\sotir\\Documents\\git\\satori-case-study\\model/one_vs_rest_classifier.pkl']

In [6]:
clf = joblib.load(f"{model_dir}/one_vs_rest_classifier.pkl")

In [7]:
y_pred = clf.predict(X_val)

In [8]:
joblib.dump(y_pred, f"{model_dir}/y_pred.pkl")

['C:\\Users\\sotir\\Documents\\git\\satori-case-study\\model/y_pred.pkl']

### Evaluate model

#### Precision, Recall, F-1 score

In [9]:
from sklearn.metrics import precision_recall_fscore_support as score

In [10]:
precision, recall, fscore, support = score(y_val, y_pred)

print(f"precision: {precision}")
print(f"recall: {recall}")
print(f"fscore: {fscore}")

precision: [0.68735806 0.97493509 0.95869614 0.84954312 0.89045074 0.9007286
 0.94295369 0.85416667 0.70720175 0.83797702 0.76610542 0.92309581
 0.7852476  0.87271538 0.88250465 0.67351547 0.92669097 0.97412418
 0.96672764 0.80987912]
recall: [0.07576769 0.85819141 0.84436142 0.45759169 0.6618028  0.60712093
 0.60129283 0.6827174  0.41653931 0.65934488 0.38798777 0.65492492
 0.62211759 0.71395094 0.69431861 0.44071803 0.78644506 0.86139312
 0.85918873 0.60977528]
fscore: [0.13649004 0.91284581 0.89790371 0.59480303 0.75928678 0.7253392
 0.73432743 0.7588789  0.52427973 0.73800544 0.5151049  0.76622371
 0.69422826 0.78539005 0.77718209 0.5327975  0.8508274  0.91429688
 0.90979138 0.69572463]


#### Hamming loss

In [11]:
from sklearn.metrics import hamming_loss

hamming = []

for i, (test, pred) in enumerate(zip(y_val.T, y_pred.T)):
    hamming.append(hamming_loss(test, pred))

In [12]:
metric_df = pd.DataFrame(data=[precision, recall, fscore, hamming],
                         index=["Precision", "Recall", "F-1 score", "Hamming loss"],
                         columns=y_classes)

In [13]:
metric_df

Unnamed: 0,.net,android,angularjs,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby-on-rails,sql
Precision,0.687358,0.974935,0.958696,0.849543,0.890451,0.900729,0.942954,0.854167,0.707202,0.837977,0.766105,0.923096,0.785248,0.872715,0.882505,0.673515,0.926691,0.974124,0.966728,0.809879
Recall,0.075768,0.858191,0.844361,0.457592,0.661803,0.607121,0.601293,0.682717,0.416539,0.659345,0.387988,0.654925,0.622118,0.713951,0.694319,0.440718,0.786445,0.861393,0.859189,0.609775
F-1 score,0.13649,0.912846,0.897904,0.594803,0.759287,0.725339,0.734327,0.758879,0.52428,0.738005,0.515105,0.766224,0.694228,0.78539,0.777182,0.532797,0.850827,0.914297,0.909791,0.695725
Hamming loss,0.027002,0.017526,0.00459,0.021732,0.011422,0.05457,0.024202,0.021493,0.052311,0.025796,0.018522,0.053914,0.079872,0.036001,0.019829,0.02459,0.032055,0.012374,0.005173,0.022313
