 # Technical Roles Analysis

 ## Script notebook

 This notebook contains the script used in our extended study about technical
 roles.

 ## Libraries

In [None]:
# Report the interpreter and core library versions so results can be tied to a
# specific environment.
# NOTE(review): no import cell is visible before this point; `platform`, `np`,
# `pd` and `sklearn` must already be in scope - confirm.
environment = (
    ("Python", platform.python_version()),
    ("Numpy", np.__version__),
    ("Pandas", pd.__version__),
    ("Scikit learn", sklearn.__version__),
)
for library, version in environment:
    print(f"{library}: {version}")


In [None]:
# Experiment-wide settings: everything a reader might want to tune lives here.
SEED = 42             # random state shared by the estimators and the CV splitter
FOLDS = 10            # number of cross-validation folds
CORR_THRESHOLD = 0.7  # threshold handed to find_correlation
REPO_THRESHOLD = 5    # minimum gh_repos for a developer to be kept

# Single root for every input file; change it here to relocate the dataset.
DATA_DIR = "/data"

DEPENDENCIES_PATH = f"{DATA_DIR}/repo_dependencies.csv"
DESCRIPTIONS_PATH = f"{DATA_DIR}/repo_descriptions.csv"
# NOTE(review): language statistics are read from repo_commits.csv - confirm
# that this file name is intended.
LANGUAGES_PATH = f"{DATA_DIR}/repo_commits.csv"
DEVELOPERS_PATH = f"{DATA_DIR}/developers.csv"
DEVELOPERS_FS_PATH = f"{DATA_DIR}/developers-with-fullstack.csv"

# Bounds handed to apply_bag_of_words for each text source (passed as max,
# then min). Presumably document-frequency limits - verify against the helper.
BIO_MIN = 0.01
BIO_MAX = 0.2
DESC_MIN = 0.04
DESC_MAX = 0.15
NAMES_MIN = 0.03
NAMES_MAX = 0.25
TOPICS_MIN = 0.01
TOPICS_MAX = 0.25


 ## Pre-Processing Steps

 ### Author information

In [None]:
# Load the labelled developers and clean their free-text GitHub bio.
authors_ds = pd.read_csv(DEVELOPERS_PATH, delimiter=",")

# NOTE(review): missing bios are only replaced by "" in a later cell, so the
# strip helpers presumably tolerate NaN - confirm.
bio_without_html = authors_ds["gh_bio"].apply(strip_html_tags)
authors_ds["gh_bio"] = bio_without_html.apply(strip_numbers)


In [None]:
# Number of developers loaded, before any filtering.
len(authors_ds)


In [None]:
# Keep developers with at least REPO_THRESHOLD repositories and replace every
# missing value with an empty string.
has_enough_repos = authors_ds["gh_repos"] >= REPO_THRESHOLD
filtered_authors = authors_ds[has_enough_repos].fillna("")


In [None]:
# Column-wise sum over every column from "Backend" onwards (the role labels),
# i.e. how many of the kept developers carry each role.
filtered_authors.loc[:, "Backend":].sum()


In [None]:
# Count developers for every observed combination of role labels.
role_columns = ["Backend", "Frontend", "Mobile", "DevOps", "DataScientist"]
(
    filtered_authors
    .drop(["gh_bio", "gh_repos"], axis=1)
    .groupby(role_columns)
    .count()
    .reset_index()
)


In [None]:
# Bag of words over the developer bios; element 0 of the result is used as the
# vocabulary and element 1 as the document-term matrix in the cells below.
bio_text = filtered_authors["gh_bio"].values.astype("U")
bio_bw = apply_bag_of_words(bio_text, BIO_MAX, BIO_MIN)

bio_vocabulary_size = len(bio_bw[0])
print(f"{bio_vocabulary_size} words were selected for developer bio "
      "after Bag of Words.")


In [None]:
# One row per developer (indexed by login), one normalized column per bio word.
bio_columns = [word + " (Bio)" for word in bio_bw[0]]
bio_matrix = normalize(bio_bw[1].toarray())
bio_ds = pd.DataFrame(
    data=bio_matrix,
    index=filtered_authors.gh_login,
    columns=bio_columns,
)


 ### Repositories descriptions

In [None]:
# Load the repository descriptions, blank out missing values, then strip HTML
# tags and numbers from each free-text column.
descriptions_ds = pd.read_csv(DESCRIPTIONS_PATH, delimiter=",").fillna("")

for text_column in ("repo_desc", "repo_tags", "repo_name"):
    without_html = descriptions_ds[text_column].apply(strip_html_tags)
    descriptions_ds[text_column] = without_html.apply(strip_numbers)


In [None]:
# Concatenate, per developer, the text of all of their repositories.
desc_ds = (
    descriptions_ds
    .groupby("gh_login")
    .agg(lambda texts: " ".join(texts))
)

# Right join with bio_ds to include developers without repositories, keeping
# only the first three columns (the aggregated text columns).
# Developers with no repository get NaN from the join; fill them with "" so
# the later `.astype("U")` does not turn them into the literal token "nan"
# (the language and dependency joins fill their gaps in the same way).
desc_ds = (
    desc_ds
    .join(bio_ds, how="right")
    .iloc[:, :3]
    .fillna("")
)



In [None]:
# Bag of words for each repository text source, each with its own bounds.
desc_text = desc_ds["repo_desc"].values.astype("U")
topics_text = desc_ds["repo_tags"].values.astype("U")
names_text = desc_ds["repo_name"].values.astype("U")

repo_desc_bw = apply_bag_of_words(desc_text, DESC_MAX, DESC_MIN)
repo_topics_bw = apply_bag_of_words(topics_text, TOPICS_MAX, TOPICS_MIN)
repo_names_bw = apply_bag_of_words(names_text, NAMES_MAX, NAMES_MIN)


In [None]:
# Turn each bag-of-words result into a normalized feature frame indexed like
# desc_ds; the suffix records which text source a word came from.
rdesc_ds, rtopics_ds, rnames_ds = (
    pd.DataFrame(
        data=normalize(bag[1].toarray()),
        columns=[word + suffix for word in bag[0]],
        index=desc_ds.index,
    )
    for bag, suffix in (
        (repo_desc_bw, " (desc.)"),
        (repo_topics_bw, " (topic)"),
        (repo_names_bw, " (name)"),
    )
)


In [None]:
# Report the vocabulary size selected for each repository text source.
selected_vocabularies = (
    ("description", repo_desc_bw),
    ("topics", repo_topics_bw),
    ("names", repo_names_bw),
)
for source, bag in selected_vocabularies:
    print(f"{len(bag[0])} words were selected for repository {source} "
          "after Bag of Words.")


 ### Languages

In [None]:
# Load the per-repository language statistics; missing values count as 0.
lang_ds = pd.read_csv(LANGUAGES_PATH, delimiter=",").fillna(0)


In [None]:
# Aggregate the language columns per developer: "_rate" columns are averaged,
# "_author" and "_total" columns are summed.
logins = lang_ds.gh_login
is_rate = lang_ds.columns.str.endswith("_rate")
is_author = lang_ds.columns.str.endswith("_author")
is_total = lang_ds.columns.str.endswith("_total")

lang_rate = (
    lang_ds.loc[:, is_rate]
    .assign(gh_login=logins)
    .groupby(["gh_login"])
    .mean()
)
lang_author = (
    lang_ds.loc[:, is_author]
    .assign(gh_login=logins)
    .groupby(["gh_login"])
    .sum()
)
lang_total = (
    lang_ds.loc[:, is_total]
    .assign(gh_login=logins)
    .groupby(["gh_login"])
    .sum()
)

# Right join with the (column-less) bio_ds index to include developers without
# repositories; their language values become 0.
developer_index = bio_ds.iloc[:, :0]
lang_rate = lang_rate.join(developer_index, how="right").fillna(0)
lang_author = lang_author.join(developer_index, how="right").fillna(0)
lang_total = lang_total.join(developer_index, how="right").fillna(0)


In [None]:
# Combine the three language views, then drop the columns reported by
# find_correlation (Spearman, CORR_THRESHOLD).
lang_ds = lang_rate.join([lang_author, lang_total])

dropped_languages = find_correlation(lang_ds, "spearman", CORR_THRESHOLD)
lang_ds = lang_ds.drop(columns=list(dropped_languages.keys()))


In [None]:
# Drop language columns that hold at most one distinct value.
language_distinct_counts = lang_ds.nunique()
unique_languages = language_distinct_counts[language_distinct_counts <= 1].index
lang_ds = lang_ds.drop(columns=unique_languages)


In [None]:
# Replace the machine suffixes with readable labels, e.g. "_rate" -> " (rate)".
language_labels = {}
for suffix, label, frame in (
    ("_author", " (author)", lang_author),
    ("_rate", " (rate)", lang_rate),
    ("_total", " (total)", lang_total),
):
    for column in frame.columns:
        language_labels[column] = column.replace(suffix, label)

lang_ds = lang_ds.rename(columns=language_labels)


 ### Dependencies

In [None]:
# Load the per-repository dependency flags.
dependencies_ds = pd.read_csv(DEPENDENCIES_PATH, sep=",")


In [None]:
# Collapse the per-repository rows into one row per developer: a dependency is
# True for a developer when any of their repositories uses it.
# `columns=` replaces the positional axis argument (`.drop("repo_name", 1)`),
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
dependencies_ds = (
    dependencies_ds
    .drop(columns="repo_name")
    .groupby("gh_login")
    .any()
)

# Right join with the (column-less) bio_ds index to include developers without
# repositories; they get False for every dependency.
deps_ds = (
    dependencies_ds
    .join(bio_ds.iloc[:, :0], how="right")
    .fillna(False)
)


In [None]:
# Drop the dependency columns reported by find_correlation (Spearman,
# CORR_THRESHOLD).
dropped_dependencies = find_correlation(deps_ds, "spearman", CORR_THRESHOLD)
deps_ds = deps_ds.drop(columns=list(dropped_dependencies.keys()))


In [None]:
# Drop dependency columns that hold at most one distinct value.
dependency_distinct_counts = deps_ds.nunique()
unique_dependencies = dependency_distinct_counts[dependency_distinct_counts <= 1].index
deps_ds = deps_ds.drop(columns=unique_dependencies)


In [None]:
# Readable labels for the dependency columns: "_dep" -> " (dep.)".
dependency_labels = {}
for column in deps_ds.columns:
    dependency_labels[column] = column.replace("_dep", " (dep.)")

deps_ds = deps_ds.rename(columns=dependency_labels)


  ### Finishing up dataset setup

In [None]:
# Feature matrix: bio words plus every other feature block, joined on login.
feature_blocks = [rdesc_ds, rtopics_ds, rnames_ds, lang_ds, deps_ds]
X = bio_ds.join(feature_blocks)

# Label matrix: every column from "Backend" onwards.
Y = filtered_authors.loc[:, "Backend":]

# NOTE(review): labels are re-indexed positionally, which assumes X kept the
# row order of filtered_authors - confirm.
Y.index = X.index


 As a result our dataset ends up as:

In [None]:
# Overall dataset shape, followed by the number of columns per feature group.
feature_rows, feature_columns = X.shape[0], X.shape[1]
label_rows, label_columns = Y.shape[0], Y.shape[1]
print(f"Features: {feature_rows} rows, {feature_columns} columns")
print(f"Dependent Vars.: {label_rows} rows, {label_columns} columns")

group_sizes = (
    ("\nLanguages", lang_ds),
    ("Dependencies", deps_ds),
    ("Description", rdesc_ds),
    ("Names", rnames_ds),
    ("Topics", rtopics_ds),
    ("Short bio", bio_ds),
)
for group, frame in group_sizes:
    print(f"{group}: {frame.shape[1]}")


 ## Classification

 For all classifications, we relied on two classification algorithms and a
 stratified baseline.

In [None]:
# Base estimators.
rf = RandomForestClassifier(n_estimators=500, random_state=SEED)
# `strategy` must be passed by keyword: scikit-learn no longer accepts it
# positionally.
baseline = DummyClassifier(strategy="stratified", random_state=SEED)
nb_baseline = MultinomialNB()

# Cross-validation splitter shared by every experiment.
# `random_state` only takes effect together with `shuffle=True`; recent
# scikit-learn raises ValueError when it is set without shuffling.
# NOTE(review): shuffling changes the fold composition compared to an
# unshuffled run; to reproduce unshuffled folds use KFold(n_splits=FOLDS).
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# One-vs-rest wrappers used for the multilabel role targets.
rf_clf = OneVsRestClassifier(rf)
baseline_clf = OneVsRestClassifier(baseline)
nb_clf = OneVsRestClassifier(nb_baseline)


 ### RQ.1: How accurate are machine learning classifiers in identifying
 ### technical roles?

In [None]:
# Evaluate the three classifiers with the same folds and micro averaging
# (order: random forest, stratified baseline, naive Bayes).
(br_scores, br_folds), (b_scores, b_folds), (nb_scores, nb_folds) = (
    classify(X, Y, skf, model, average="micro")
    for model in (rf_clf, baseline_clf, nb_clf)
)


 BR Results for identifying developers who **work** in each role:

In [None]:
# Print one report per classifier, each preceded by its banner.
rq1_reports = (
    ("******** Random Forest ********", br_scores),
    ("\n******** Naive Bayes ********", nb_scores),
    ("\n******** Baseline ********", b_scores),
)
for banner, scores in rq1_reports:
    print(banner)
    classify_report(scores, Y.columns)


 ### RQ.2: What are the most relevant features to identify technical roles?

In [None]:
# Rank feature importances using a fresh, unfitted copy of the random forest.
unfitted_rf = clone(rf)
var_imp = feature_importances_rank(X, Y, unfitted_rf)


In [None]:
# Rank the rows within each role (descending, ties broken by first occurrence).
# NOTE(review): groupby(...).rank() returns one column per ranked column; this
# assignment presumably relies on exactly one such column - confirm.
ranks_within_role = var_imp.groupby("role").rank(method="first", ascending=False)
var_imp["order"] = ranks_within_role

# Last ten dependency rows of each role.
# NOTE(review): tail(10) depends on the row order produced by
# feature_importances_rank - confirm it matches the intended selection.
dependency_rows = var_imp[var_imp["category"] == "Dependency"]
dependency_rows.groupby("role").tail(10)



In [None]:
# Display the result of the top_10_features helper for the importance ranking.
top_10_features(var_imp)


 We analyzed the distribution of every feature identified as important in
 our RandomForest ranking. To do so, we plotted a histogram for each of the
 10 features present in the feature importance ranking of each role.

In [None]:
# One histogram figure per role, built from that role's ranked features.
for role_name in Y.columns:
    role_features = build_histogram_data(X, Y, var_imp, role_name)
    role_figure = plot_histogram_data(role_features, role_name)
    print(role_figure)


 ### RQ.3: Do technical roles influence each other during classification?

 To answer this question, we applied the classifier chain multilabel strategy
 over all possible orderings of the roles.

In [None]:
# Evaluate a classifier chain for every possible ordering of the role labels
# and collect the scores of each run.
Y_rq3 = Y.loc[:, :]
role_names = np.array(Y_rq3.columns.tolist())
permutations = itertools.permutations(range(0, Y_rq3.shape[1]))

# Score keys copied from each run: overall metrics, then per-role metrics.
summary_metrics = ("precision", "recall", "f1", "auc", "jaccard", "hamming_loss")
per_role_metrics = ("precision", "recall", "f1")

iterations = []
for run, label_order in enumerate(permutations, start=1):
    label_order = list(label_order)
    order = role_names[label_order]
    print(f"============= {order} =============")

    chain_clf = ClassifierChain(rf, order=label_order, random_state=SEED)
    cc_scores, _ = classify(X, Y_rq3, skf, chain_clf, average="micro")
    # Pass the label names, as every other classify_report call does
    # (previously the whole DataFrame was passed here).
    classify_report(cc_scores, Y_rq3.columns)

    # Chain position -> role name; distinct variable names avoid shadowing
    # the run counter stored under "index".
    iteration = {position: role for position, role in enumerate(order)}
    iteration["index"] = run
    iteration.update({metric: cc_scores[metric] for metric in summary_metrics})
    for role in Y_rq3.columns:
        iteration.update({
            f"{metric}_{role}": cc_scores[f"{metric}_{role}"]
            for metric in per_role_metrics
        })
    iterations.append(iteration)

cc_dataset = build_cc_data(iterations, br_scores)


In [None]:
# Rows holding the overall metrics.
general_metrics = ["Precision", "Recall", "F1", "AUC", "Jaccard", "Hamming Loss"]
is_general = np.any(
    [cc_dataset.metric == metric for metric in general_metrics],
    axis=0,
)
cc_general = cc_dataset[is_general]

# Rows whose metric name mentions one of the roles.
role_keywords = ["Backend", "Frontend", "Mobile", "DevOps", "DataScientist"]
mentions_role = np.any(
    [cc_dataset.metric.str.contains(keyword) for keyword in role_keywords],
    axis=0,
)
cc_by_role = cc_dataset[mentions_role]


In [None]:
# Same line plot for both metric groups; only the data and the number of facet
# columns differ (2 for the overall metrics, 3 for the per-role metrics).
for plot_data, facet_columns in ((cc_general, 2), (cc_by_role, 3)):
    chain_plot = (
        ggplot(plot_data, aes(x="index", y="value"))
        + geom_line()
        + geom_hline(yintercept=0, linetype="dashed")
        + facet_wrap("~ metric", ncol=facet_columns)
        + labs(x="Classifier Chains permutations", y="Metric value")
        + theme_bw()
    )
    print(chain_plot)


 ### RQ.4: How effectively can we identify full-stack developers?

 We applied the same labeling process to identify FullStack developers in the
 Stack Overflow developer pool, generating a new dataset that includes the
 `FullStack` role.

In [None]:
# Load the developers that include the FullStack label, then apply the same
# repository filter and bio cleaning used for the main dataset.
fs_authors_ds = pd.read_csv(DEVELOPERS_FS_PATH, delimiter=",")

fs_has_enough_repos = fs_authors_ds["gh_repos"] >= REPO_THRESHOLD
fs_filtered_authors = fs_authors_ds[fs_has_enough_repos].fillna("")

fs_bio_without_html = fs_filtered_authors["gh_bio"].apply(strip_html_tags)
fs_filtered_authors["gh_bio"] = fs_bio_without_html.apply(strip_numbers)

# Bag of words over the bios, turned into a normalized frame indexed by login.
fs_bio_text = fs_filtered_authors["gh_bio"].values.astype("U")
fs_bio_bw = apply_bag_of_words(fs_bio_text, BIO_MAX, BIO_MIN)

fs_bio_columns = [word + " (Bio)" for word in fs_bio_bw[0]]
fs_bio_ds = pd.DataFrame(
    data=normalize(fs_bio_bw[1].toarray()),
    index=fs_filtered_authors.gh_login,
    columns=fs_bio_columns,
)


In [None]:
# Concatenate, per developer, the text of all of their repositories.
fs_desc_ds = (
    descriptions_ds
    .groupby("gh_login")
    .agg(lambda texts: " ".join(texts))
)

# Right join with fs_bio_ds to include developers without repositories,
# keeping only the first three columns (the aggregated text columns).
# Developers with no repository get NaN from the join; fill them with "" so
# `.astype("U")` below does not turn them into the literal token "nan".
fs_desc_ds = (
    fs_desc_ds
    .join(fs_bio_ds, how="right")
    .iloc[:, :3]
    .fillna("")
)

# Bag of words for each repository text source, each with its own bounds.
fs_repo_desc_bw = apply_bag_of_words(
    fs_desc_ds.repo_desc.values.astype("U"), DESC_MAX, DESC_MIN)
fs_repo_topics_bw = apply_bag_of_words(
    fs_desc_ds.repo_tags.values.astype("U"), TOPICS_MAX, TOPICS_MIN)
fs_repo_names_bw = apply_bag_of_words(
    fs_desc_ds.repo_name.values.astype("U"), NAMES_MAX, NAMES_MIN)

# Normalized feature frames indexed like fs_desc_ds; the suffix records which
# text source a word came from.
fs_rdesc_ds = pd.DataFrame(
    data=normalize(fs_repo_desc_bw[1].toarray()),
    columns=[b + " (desc.)" for b in fs_repo_desc_bw[0]],
    index=fs_desc_ds.index
)
fs_rtopics_ds = pd.DataFrame(
    data=normalize(fs_repo_topics_bw[1].toarray()),
    columns=[b + " (topic)" for b in fs_repo_topics_bw[0]],
    index=fs_desc_ds.index
)
fs_rnames_ds = pd.DataFrame(
    data=normalize(fs_repo_names_bw[1].toarray()),
    columns=[b + " (name)" for b in fs_repo_names_bw[0]],
    index=fs_desc_ds.index
)


In [None]:
# Re-align the per-developer language aggregates to the full-stack developer
# index; developers without language data get 0.
fs_developer_index = fs_bio_ds.iloc[:, :0]
fs_lang_rate = lang_rate.join(fs_developer_index, how="right").fillna(0)
fs_lang_author = lang_author.join(fs_developer_index, how="right").fillna(0)
fs_lang_total = lang_total.join(fs_developer_index, how="right").fillna(0)

# Combine the three views and drop the columns reported by find_correlation.
fs_lang_ds = fs_lang_rate.join([fs_lang_author, fs_lang_total])
fs_dropped_languages = find_correlation(fs_lang_ds, "spearman", CORR_THRESHOLD)
fs_lang_ds = fs_lang_ds.drop(columns=list(fs_dropped_languages.keys()))

# Replace the machine suffixes with readable labels.
fs_language_labels = {}
for suffix, label, frame in (
    ("_author", " (author)", fs_lang_author),
    ("_rate", " (rate)", fs_lang_rate),
    ("_total", " (total)", fs_lang_total),
):
    for column in frame.columns:
        fs_language_labels[column] = column.replace(suffix, label)
fs_lang_ds = fs_lang_ds.rename(columns=fs_language_labels)

# Drop columns that hold at most one distinct value.
fs_lang_distinct_counts = fs_lang_ds.nunique()
fs_lang_unique_cols = fs_lang_distinct_counts[fs_lang_distinct_counts <= 1].index
fs_lang_ds = fs_lang_ds.drop(columns=fs_lang_unique_cols)


In [None]:
# Re-align the per-developer dependency flags to the full-stack developer
# index; developers without dependency data get False everywhere.
fs_deps_ds = (
    dependencies_ds
    .join(fs_bio_ds.iloc[:, :0], how="right")
    .fillna(False)
)

# Drop the columns reported by find_correlation.
fs_dropped_dependencies = find_correlation(
    fs_deps_ds, "spearman", CORR_THRESHOLD)
fs_deps_ds = fs_deps_ds.drop(columns=list(fs_dropped_dependencies.keys()))

# Readable labels: "_dep" -> " (dep.)".
fs_dependency_labels = {}
for column in fs_deps_ds.columns:
    fs_dependency_labels[column] = column.replace("_dep", " (dep.)")
fs_deps_ds = fs_deps_ds.rename(columns=fs_dependency_labels)

# Drop columns that hold at most one distinct value.
fs_deps_distinct_counts = fs_deps_ds.nunique()
fs_deps_unique_cols = fs_deps_distinct_counts[fs_deps_distinct_counts <= 1].index
fs_deps_ds = fs_deps_ds.drop(columns=fs_deps_unique_cols)


In [None]:
# Feature matrix for the full-stack experiment: bio words plus every other
# feature block, joined on login.
fs_feature_blocks = [
    fs_rdesc_ds, fs_rtopics_ds, fs_rnames_ds, fs_lang_ds, fs_deps_ds,
]
X_fs = fs_bio_ds.join(fs_feature_blocks)

# Label matrix: every column from "Backend" onwards.
# NOTE(review): labels are re-indexed positionally, which assumes X_fs kept the
# row order of fs_filtered_authors - confirm.
Y_fs = fs_filtered_authors.loc[:, "Backend":]
Y_fs.index = X_fs.index

fs_feature_rows, fs_feature_columns = X_fs.shape[0], X_fs.shape[1]
fs_label_rows, fs_label_columns = Y_fs.shape[0], Y_fs.shape[1]
print(f"Features: {fs_feature_rows} rows, {fs_feature_columns} columns")
print(f"Dependent Vars.: {fs_label_rows} rows, {fs_label_columns} columns")


 The new `FullStack` developers are distributed as follows:

In [None]:
# Among developers labelled FullStack, count each Backend/Frontend combination.
is_fullstack = Y_fs["FullStack"] == 1
(
    Y_fs[is_fullstack]
    .groupby(["Backend", "Frontend"])
    .count()
)


In [None]:
# Evaluate the three classifiers on the full-stack dataset; only the scores are
# kept (order: random forest, stratified baseline, naive Bayes).
(fs_br_scores, _), (fs_b_scores, _), (fs_nb_scores, _) = (
    classify(X_fs, Y_fs, skf, model, average="micro")
    for model in (rf_clf, baseline_clf, nb_clf)
)


 BR Results for identifying developers who **work** in each role, including
 `FullStack` role:

In [None]:
# Print one report per classifier, each preceded by its banner.
fs_reports = (
    ("******** Random Forest ********", fs_br_scores),
    ("\n******** Naive Bayes ********", fs_nb_scores),
    ("\n******** Baseline ********", fs_b_scores),
)
for banner, scores in fs_reports:
    print(banner)
    classify_report(scores, Y_fs.columns)


 As we can observe, FullStack developers add a lot of noise to the dataset as
 the results for both `Backend` and `Frontend` are significantly lower than
 before.

 This happens because developers who are FullStack do not describe themselves
 as Backend and/or Frontend, as these roles are implicit to the FullStack
 definition.

 Therefore, we decided to set the `Backend` and `Frontend` labels to 1
 whenever a developer is labelled as `FullStack`:

In [None]:
# Roles implied by the FullStack label.
fs_roles = ["Backend", "Frontend"]

# Y_fs was sliced out of fs_filtered_authors; take an explicit copy before the
# in-place assignment so the relabelling neither triggers pandas'
# chained-assignment warning nor writes through to the source frame.
Y_fs = Y_fs.copy()
Y_fs.loc[Y_fs.FullStack == 1, fs_roles] = 1

# Re-run the three classifiers against the relabelled targets.
fs_br_scores, _ = classify(X_fs, Y_fs, skf, rf_clf, average="micro")
fs_b_scores, _ = classify(X_fs, Y_fs, skf, baseline_clf, average="micro")
fs_nb_scores, _ = classify(X_fs, Y_fs, skf, nb_clf, average="micro")


In [None]:
# Print one report per classifier for the relabelled targets.
fs_relabelled_reports = (
    ("******** Random Forest ********", fs_br_scores),
    ("\n******** Naive Bayes ********", fs_nb_scores),
    ("\n******** Baseline ********", fs_b_scores),
)
for banner, scores in fs_relabelled_reports:
    print(banner)
    classify_report(scores, Y_fs.columns)


 As expected, the results increased significantly after we reassigned both
 `Backend` and `Frontend` based on the label values at
 `FullStack`.

In [None]:
# Out-of-fold random forest predictions for both label sets.
# Column names are derived from the label frames ("<role>_pred") so they always
# follow the actual label order instead of a hand-maintained list.
y_pred = pd.DataFrame(
    cross_val_predict(rf_clf, X, Y, cv=skf),
    columns=[f"{role}_pred" for role in Y.columns],
    index=Y.index
)
y_fs_pred = pd.DataFrame(
    cross_val_predict(rf_clf, X_fs, Y_fs, cv=skf),
    columns=[f"{role}_pred" for role in Y_fs.columns],
    index=Y_fs.index
)
