In [None]:
from IPython.display import display

import json
import os

import plotly.graph_objects as go
import plotly.express as px
import pandas as pd

# Directory the notebook is executed from (the absolute form of the current working directory).
nlp_dir = os.path.abspath('')

# Exported figures go to <parent of nlp_dir>/tex/assets; create it on first run.
assets_dir = os.path.join(
  os.path.dirname(nlp_dir),
  'tex',
  'assets',
)
os.makedirs(assets_dir, exist_ok=True)

In [None]:
# Load all records from <parent of nlp_dir>/results/all.json into a DataFrame.
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
results_path = os.path.join(os.path.dirname(nlp_dir), 'results', 'all.json')
with open(results_path, encoding='utf-8') as file:
  results = json.load(file)
df = pd.DataFrame(results)

# Normalize to monthly salary: values of 15000 or more are divided by 12
# (presumably yearly figures -- TODO confirm the threshold), smaller values are kept.
# Series.map is used instead of DataFrame.applymap, which is deprecated since pandas 2.1.
df['salary'] = df['salary'].map(lambda salaries: [v if v < 15000 else v / 12 for v in salaries])

# Lift the two fields of the nested `location` record into their own columns,
# then drop the now-redundant nested column.
df['cities'] = df['location'].map(lambda v: v['cities'])
df['states'] = df['location'].map(lambda v: v['states'])
df = df.drop(columns=['location'])

In [None]:
def occurrences(df, col, name):
  """Count how often each individual value occurs in a column of iterables.

  Args:
    df: DataFrame holding the column to analyse.
    col: Name of a column whose cells are iterables (e.g. lists) of values.
    name: Header to use for the value column of the returned frame.

  Returns:
    A DataFrame with the columns `name` and 'Count', ordered by descending count.
  """
  flattened = []
  for cell in df[col]:
    for item in cell:
      flattened.append(item)

  counts = pd.Series(flattened).value_counts(sort=True)
  return pd.DataFrame({name: counts.index, 'Count': counts.values})

In [None]:
# Count rows per value of the `language` column, most frequent first.
lang_data = pd.Series(df['language'].tolist()).value_counts(sort=True)

# Two-column frame (Language, Count) in the shape the chart cell expects.
lang_results = pd.DataFrame({
  'Language': lang_data.index,
  'Count': lang_data.values,
})

In [None]:
# Pie chart of the language distribution.
# `names` selects the column used for the sector labels and legend entries.
# (`labels` in plotly express is a dict of display-name overrides, so passing
# the column name there left the sectors unlabelled.)
fig = px.pie(
  lang_results,
  names='Language',
  values='Count',
  color='Language',
)
# Transparent plot and paper backgrounds.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
fig.write_image(os.path.join(assets_dir, 'language-pie-chart.pdf'), width=800, height=480)
fig.show()

In [None]:
# Number of rows in the data set; used later as the denominator for percentages.
total_count = df.shape[0]
total_count

In [None]:
# Frequency table of every individual certification found in the `certifications` column.
certifications_result = occurrences(df, col='certifications', name='Certification')
certifications_result

In [None]:
# Number of rows whose `certifications` entry is non-empty (truthy),
# i.e. rows with at least one certification -- not the number of certifications.
total_certifications = sum(bool(c) for c in df['certifications'])
total_certifications

In [None]:
# Bar chart of certification counts, ordered from least to most frequent,
# with the count printed above each bar and transparent backgrounds.
fig = (
  px.bar(
    certifications_result.sort_values(by='Count'),
    x='Certification',
    y='Count',
    color='Certification',
    text='Count',
  )
  .update_traces(textposition='outside')
  .update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
)

# Export for the report, then render inline.
fig.write_image(
  os.path.join(assets_dir, 'certifications-bar-chart.pdf'),
  width=800,
  height=480,
)
fig.show()

In [None]:
# Per-row salary aggregates; rows with an empty salary list get None.
# Series.map is used instead of DataFrame.applymap, which is deprecated since pandas 2.1.
df['avg_salary'] = df['salary'].map(lambda v: sum(v) / len(v) if v else None)
df['max_salary'] = df['salary'].map(lambda v: max(v) if v else None)

# Mean of the per-row average salary for each state. A row listing several
# states is exploded first, so it contributes to every state it names.
average_salary = (
  df.explode('states')[['states', 'avg_salary']]
  .groupby('states')
  .mean()
  .reset_index()
  .sort_values(by='avg_salary')
)
average_salary

In [None]:
# Overall mean of the per-row average salary, ignoring rows without salary data.
rows_with_salary = df['avg_salary'].dropna()
total_average_salary = rows_with_salary.mean()
total_average_salary

In [None]:
# Bar chart of the average salary per state. The columns are renamed only for
# display; the bar labels are the averages rounded to whole numbers.
fig = (
  px.bar(
    average_salary.rename(columns={"avg_salary": "Average Salary", "states": "State"}),
    x='State',
    y='Average Salary',
    color='State',
    text=average_salary['avg_salary'].round(),
  )
  .update_traces(textposition='outside')
  .update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
)

# Export for the report, then render inline.
fig.write_image(
  os.path.join(assets_dir, 'average-salary-bar-chart.pdf'),
  width=800,
  height=480,
)
fig.show()

In [None]:
# Flatten the per-row employment type lists and count each label.
employment_labels = [value for values in df['employment_type'] for value in values]
employment_type_result = pd.Series(employment_labels).value_counts(sort=True).to_dict()

# Guarantee an entry for each of the four expected categories, defaulting to 0
# when a category never occurs in the data.
for key in ['full-time', 'part-time', 'permanent', 'temporary']:
  employment_type_result.setdefault(key, 0)

employment_type_result

In [None]:
data = employment_type_result

# The four categories are plotted as two contrasting pairs.
full_time_part_time = ['full-time', 'part-time']
permanent_temporary = ['permanent', 'temporary']

full_time_part_time_data = [data[key] for key in full_time_part_time]
permanent_temporary_data = [data[key] for key in permanent_temporary]

# One bar trace per pair; the counts double as the bar labels.
fig = go.Figure()
fig.add_trace(
  go.Bar(
    name='full-time vs. part-time',
    x=full_time_part_time,
    y=full_time_part_time_data,
    text=full_time_part_time_data,
  )
)
fig.add_trace(
  go.Bar(
    name='permanent vs. temporary',
    x=permanent_temporary,
    y=permanent_temporary_data,
    text=permanent_temporary_data,
  )
)
fig.update_traces(textposition='outside')
fig.update_layout(
  barmode='group',
  plot_bgcolor='rgba(0,0,0,0)',
  paper_bgcolor='rgba(0,0,0,0)',
)

# Export for the report, then render inline.
fig.write_image(
  os.path.join(assets_dir, 'employment-type-bar-chart.pdf'),
  width=800,
  height=480,
)
fig.show()

In [None]:
# The 50 most frequent entries of the `experience` column.
occurrences(df, col='experience', name='Experience').head(50)

In [None]:
# Frequency table of every city mentioned in the `cities` column.
occurrences(df, col='cities', name='City')

In [None]:
# Number of mentions per state.
state_count = occurrences(df, 'states', 'State')

# Population figures are read from state_population.json next to the notebook
# (a JSON object mapping state -> population) and turned into a two-column frame.
population_path = os.path.join(nlp_dir, 'state_population.json')
with open(population_path) as file:
  state_population = json.load(file)
state_population = pd.DataFrame(list(state_population.items()), columns=['State', 'Population'])

# Default (inner) merge: only states present in both frames are kept.
state_result = state_count.merge(state_population, on='State')
state_result['Count per Capita'] = state_result['Count'] / state_result['Population']

state_result

In [None]:
# Bar chart of the absolute count per state, ordered from least to most
# frequent, with the count printed above each bar and transparent backgrounds.
fig = (
  px.bar(
    state_result.sort_values(by='Count'),
    x='State',
    y='Count',
    color='State',
    text='Count',
  )
  .update_traces(textposition='outside')
  .update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
)

# Export for the report, then render inline.
fig.write_image(
  os.path.join(assets_dir, 'location-bar-chart.pdf'),
  width=800,
  height=480,
)
fig.show()

In [None]:
data = state_result.sort_values(by='Count per Capita')

# Bar labels: the per-capita ratio scaled by one million and rounded to one
# decimal place (multiply by 10,000,000, round to an integer, divide by 10).
per_capita_labels = (data['Count per Capita'] * 10000000).round() / 10

fig = (
  px.bar(
    data,
    x='State',
    y='Count per Capita',
    color='State',
    text=per_capita_labels,
  )
  .update_traces(textposition='outside')
  .update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
)

# Export for the report, then render inline.
fig.write_image(
  os.path.join(assets_dir, 'location-per-capita-bar-chart.pdf'),
  width=800,
  height=480,
)
fig.show()

In [None]:
# Frequency table of the values in the `education_type` column.
education_data = occurrences(df, col='education_type', name='Education Type')

# Share of all rows (total_count) that mention each education type.
education_data['Percentage'] = education_data['Count'].div(total_count)

education_data

In [None]:
# Bar chart of education type counts; each bar is labelled with its share of
# all rows, formatted as a percentage with one decimal place.
fig = (
  px.bar(
    education_data,
    x='Education Type',
    y='Count',
    color='Education Type',
    text=education_data['Percentage'].map('{:.1%}'.format),
  )
  .update_traces(textposition='outside')
  .update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
)

# Export for the report, then render inline.
fig.write_image(
  os.path.join(assets_dir, 'education-bar-chart.pdf'),
  width=800,
  height=480,
)
fig.show()

In [None]:
# Pie chart of the education type distribution.
# `names` selects the column used for the sector labels and legend entries.
# (`labels` in plotly express is a dict of display-name overrides, so passing
# the column name there left the sectors unlabelled.)
fig = px.pie(
  education_data,
  names='Education Type',
  values='Count',
  color='Education Type',
)
# Transparent plot and paper backgrounds.
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
fig.write_image(os.path.join(assets_dir, 'education-pie-chart.pdf'), width=800, height=480)
fig.show()