In [35]:
DF_PATH = "~/OneDrive/Desktop/iti/data/processed/1_preprocessed_df.pkl"
ROLES_PATH = "https://raw.githubusercontent.com/Deena-Gergis/e2e_ds_project/master/data/raw/roles_short_names.csv" 
FIG_DIR = "~/OneDrive/Desktop/iti/reports/figures"

NA_STRING = 'Not Specified'
TRANSPARENT_STRING = 'rgba(0, 0, 0, 0)'

ROLE_COLS      = ['DevType']
TECH_COLS      = ['LanguageWorkedWith',
                  'DatabaseWorkedWith',
                  'PlatformWorkedWith',
                  'WebframeWorkedWith',
                  'MiscTechWorkedWith',
                  'MiscTechWorkedWith',
                  'NEWCollabToolsWorkedWith']

In [36]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import os

import plotly 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.figure_factory as ff

from sklearn.manifold import TSNE
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering

from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

_________

## Read data and preprocess

In [37]:
# Read Data 
raw_df = pd.read_pickle(DF_PATH)
roles_names = pd.read_csv(ROLES_PATH, sep=';')

### Onehot enconde

In [38]:
df = raw_df.copy()
encoded_dfs = {}
for col in ROLE_COLS + TECH_COLS:
    binarizer = MultiLabelBinarizer()
    encoded_df = pd.DataFrame(binarizer.fit_transform(df[col]),
                               columns=binarizer.classes_,
                               index=df[col].index)
    encoded_dfs[col] = encoded_df

In [39]:
# Merge 1-hot encoded 
df = pd.concat(encoded_dfs, axis=1)
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,NEWCollabToolsWorkedWith,NEWCollabToolsWorkedWith,NEWCollabToolsWorkedWith,NEWCollabToolsWorkedWith,NEWCollabToolsWorkedWith,NEWCollabToolsWorkedWith,NEWCollabToolsWorkedWith,NEWCollabToolsWorkedWith,NEWCollabToolsWorkedWith,NEWCollabToolsWorkedWith
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,Designer,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices",...,Facebook Workplace,Github,Gitlab,"Google Suite (Docs, Meet, etc)",Jira,Microsoft Azure,Microsoft Teams,Slack,Stack Overflow for Teams,Trello
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,1,0,1,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64458,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0


_________

## Display jobs frequency

In [40]:
jobs_freq = df['DevType'].copy().sum().sort_values().reset_index()

In [41]:
jobs_freq

Unnamed: 0,index,0
0,Marketing or sales professional,642
1,Senior executive/VP,1320
2,"Engineer, site reliability",1940
3,Scientist,2086
4,Product manager,2497
5,Engineering manager,2724
6,"Developer, game or graphics",2789
7,Educator,2928
8,Academic researcher,3552
9,"Engineer, data",3738


## Display skills frequency 

In [42]:
# Calculate the sum for all cols except Devtypes
skills_freq = df.copy().drop('DevType', axis=1).sum().reset_index()
skills_freq.columns = ['group', 'skill', 'freq']

In [43]:
skills_freq.sort_values('freq', ascending=False)

Unnamed: 0,group,skill,freq
92,NEWCollabToolsWorkedWith,Github,43786
10,LanguageWorkedWith,JavaScript,38822
7,LanguageWorkedWith,HTML/CSS,36181
20,LanguageWorkedWith,SQL,31413
47,PlatformWorkedWith,Linux,29600
...,...,...,...
82,MiscTechWorkedWith,Puppet,945
26,DatabaseWorkedWith,Couchbase,937
45,PlatformWorkedWith,IBM Cloud or Watson,876
75,MiscTechWorkedWith,Chef,733


In [44]:
fig = px.treemap(skills_freq, 
                 path=['group', 'skill'], 
                 color_continuous_scale='deep',
                 values='freq',color='freq')

fig.update_layout(width=1400, height=700)
fig.show()


## Create Jobs & Skills Heatmap

In [45]:
sorted_roles = df['DevType'].sum().sort_values().index.tolist()
sorted_skills = df.drop('DevType', axis=1).sum().sort_values(ascending=False).droplevel(level=0).index.tolist()

In [46]:
skills = []

# For each role, calculate the percentage of each skill
for role in sorted_roles:
    # Filter for the role
    role_mask = (df[('DevType', role)] == 1)

    #For each skill column, calculate mean of the one-hot-encoded -> percentage
    skills_role = pd.concat({tech_col: df.loc[role_mask, tech_col].mean() * 100
                             for tech_col in TECH_COLS})

    # Append to original list
    skills.append(skills_role)

# Concat and format
skills = pd.concat(skills, axis=1)
skills.columns = sorted_roles
skills = skills.reset_index(level=0, drop=True)
skills = skills.loc[sorted_skills]
skills = skills.T

In [47]:
skills

Unnamed: 0,Github,JavaScript,HTML/CSS,SQL,Linux,Windows,Slack,MySQL,Python,Jira,...,Facebook Workplace,IBM DB2,Drupal,Unreal Engine,Haskell,Puppet,Couchbase,IBM Cloud or Watson,Chef,Julia
Marketing or sales professional,70.872274,71.028037,76.635514,57.788162,42.367601,47.196262,45.638629,61.370717,38.629283,28.971963,...,11.370717,5.76324,9.968847,6.853583,4.205607,4.517134,6.23053,7.476636,4.672897,4.049844
Senior executive/VP,75.075758,72.878788,69.166667,65.075758,58.863636,44.318182,59.393939,49.545455,45.984848,43.939394,...,5.681818,5.227273,6.060606,4.318182,3.409091,4.848485,5.227273,5.0,3.939394,2.727273
"Engineer, site reliability",79.845361,65.876289,58.505155,61.701031,74.639175,39.43299,60.56701,53.762887,59.896907,56.340206,...,5.721649,3.71134,5.206186,3.14433,3.969072,9.278351,3.659794,3.298969,8.041237,2.010309
Scientist,77.756472,48.465964,47.027804,43.6721,67.162033,50.0,45.493768,39.213806,67.689358,32.118888,...,3.739214,3.499521,3.691275,4.026846,4.554171,2.301055,3.259827,3.978907,2.157239,5.465005
Product manager,75.090108,73.007609,69.603524,63.115739,53.664397,50.620745,53.344013,52.422907,42.010412,44.53344,...,4.565479,3.92471,5.246296,3.203845,2.282739,3.003604,3.123748,2.803364,2.643172,1.561874
Engineering manager,75.991189,69.346549,60.682819,60.425844,61.600587,42.621145,61.453744,47.099853,46.512482,59.030837,...,4.038179,3.817915,4.662261,2.679883,2.679883,4.515419,4.001468,2.790015,4.331865,1.578561
"Developer, game or graphics",79.455002,67.228397,62.459663,49.26497,52.922194,64.25242,49.40839,48.045895,43.850843,42.344926,...,3.908211,2.617426,3.657225,16.09896,3.155253,2.402295,3.011832,2.294729,2.007888,1.398351
Educator,78.278689,68.40847,66.803279,56.625683,56.93306,51.263661,51.434426,53.85929,45.252732,40.471311,...,4.337432,3.790984,4.678962,3.381148,4.20082,2.834699,2.766393,2.698087,2.288251,2.185792
Academic researcher,77.730856,54.19482,53.462838,46.706081,62.190315,49.099099,45.213964,47.015766,59.037162,30.152027,...,3.997748,3.040541,3.293919,2.956081,4.504505,2.111486,2.505631,3.350225,1.52027,4.335586
"Engineer, data",75.307651,56.902087,52.594971,67.17496,62.841091,49.518459,52.220439,49.678973,65.088283,49.277689,...,4.11985,4.333868,3.210273,2.70198,2.835741,2.808989,3.504548,3.263777,2.3542,2.247191


In [48]:
fig = go.Figure(data=go.Heatmap(z=skills, x=skills.columns,y=skills.index, colorscale='magma', ygap=1))
fig.update_layout(width=2500, height=700)
fig.show()


## Create Jobs dendrogram

In [58]:
roles_names

Unnamed: 0,Original name,Short name
0,"Developer, back-end",Back-end dev
1,"Developer, full-stack",Full-stack dev
2,"Developer, front-end",Front-end dev
3,"Developer, desktop or enterprise applications",Desktop dev
4,"Developer, mobile",Mobile dev
5,DevOps specialist,DevOps
6,Database administrator,Database admin
7,Designer,Designer
8,System administrator,System admin
9,"Developer, embedded applications or devices",Embedded dev


In [65]:
roles_short_dict = roles_names.set_index("Original name")["Short name"].to_dict()

short_labels = [roles_short_dict[role] for role in sorted_roles]

fig = ff.create_dendrogram(skills, labels=short_labels, orientation='left', color_threshold=0)
fig.update_layout(height=700, width=1000, showlegend=False)
fig.show()

KeyError: 'Short name'

## Normalizing features 

In [66]:
std_skills = StandardScaler().fit_transform(skills)
std_skills = pd.DataFrame(std_skills, columns=skills.columns, index=skills.index)

In [67]:
fig = go.Figure(data=go.Heatmap(z=std_skills, x=skills.columns,y=skills.index, colorscale='magma', ygap=1))
fig.update_layout(width=2500, height=700)
fig.show()
fig.write_html(os.path.join(FIG_DIR, 'normalized_heatmap.html'))

FileNotFoundError: [Errno 2] No such file or directory: '~\\OneDrive\\Desktop\\iti\\reports\\figures\\normalized_heatmap.html'

## Job profiles

In [68]:
sorted_roles

['Marketing or sales professional',
 'Senior executive/VP',
 'Engineer, site reliability',
 'Scientist',
 'Product manager',
 'Engineering manager',
 'Developer, game or graphics',
 'Educator',
 'Academic researcher',
 'Engineer, data',
 'Developer, QA or test',
 'Data scientist or machine learning specialist',
 'Data or business analyst',
 'Developer, embedded applications or devices',
 'System administrator',
 'Designer',
 'Database administrator',
 'DevOps specialist',
 'Developer, mobile',
 'Developer, desktop or enterprise applications',
 'Developer, front-end',
 'Developer, full-stack',
 'Developer, back-end']

In [69]:
#role = np.random.choice(sorted_roles)
# role = 'Developer, embedded applications or devices'
# role = 'Data scientist or machine learning specialist'
# role = 'Developer, back-end'
role = 'Product manager'

In [70]:
single_role_skills = pd.concat([skills.loc[role], std_skills.loc[role]], axis=1)
single_role_skills.columns = ['percentage', 'specificity']
single_role_skills = single_role_skills.sort_values('percentage')

In [71]:
#single_role_skills

In [72]:
threshold = 25

single_role_skills = single_role_skills[single_role_skills['percentage'] > threshold]

fig = px.bar(df, 
             y=single_role_skills.index, 
             x=single_role_skills['percentage'], 
             color=single_role_skills['specificity'], 
             color_continuous_scale='orrd', 
             range_color=[std_skills.values.min(),std_skills.values.max()],
             orientation='h')

fig.update_layout(width=800, height=800, title=role)
fig.show()