# 2. Skill Demand

### Importar módulos y librerías

In [2]:
import ast 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


### Cargar datos

In [3]:
# Fetch the 'lukebarousse/data_jobs' dataset with `datasets.load_dataset` and
# materialise its 'train' split as a pandas DataFrame for the rest of the analysis.
dataset = load_dataset("lukebarousse/data_jobs")
df = dataset["train"].to_pandas()

### Limpieza de datos

In [4]:
# Coerce the posting date column to pandas datetimes (a no-op if it already is one).
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

# Turn the textual skill lists into real Python lists so they can be exploded later.
# Only `str` values are parsed: missing values pass through unchanged, and values
# that were already parsed on a previous run of this cell are left alone. The
# earlier `pd.notna(x)` guard made the cell fail on re-run, because `pd.notna` on a
# list returns an array and `ast.literal_eval` rejects non-string input.
# NOTE(review): assumes every string in job_skills is a valid Python literal
# (presumably a stringified list) -- confirm against the dataset card.
df['job_skills'] = df['job_skills'].apply(
    lambda skills: ast.literal_eval(skills) if isinstance(skills, str) else skills
)

### Filtrar para puestos en Argentina

In [5]:
# Keep only the postings whose job_country is exactly 'Argentina'
# (rows with a missing country compare as False and are dropped).
df_ARG = df.loc[df['job_country'].eq('Argentina')]

### .explode

The .explode() method is designed to expand entries in a list-like column across multiple rows, making each element in the list a separate row.

In [7]:
# Expand each list in job_skills into one row per skill; the original index label
# is repeated for every skill that came from the same posting.
df_skills = df_ARG.explode(column='job_skills')

# Preview just the title next to its exploded skill.
df_skills.loc[:, ['job_title', 'job_skills']]

Unnamed: 0,job_title,job_skills
236,Científico de Datos LLM,
417,Data Science SR Puerto Madero,java
417,Data Science SR Puerto Madero,scala
863,Data Engineer Jr,python
863,Data Engineer Jr,azure
...,...,...
785668,Jr Data Analyst,excel
785668,Jr Data Analyst,tableau
785668,Jr Data Analyst,power bi
785668,Jr Data Analyst,spss


### Contar skills por puesto de trabajo

Groups DataFrame by job_skills and job_title_short, counting the occurrences of each skill within each job title

Then resets the index of the Series to turn it back into a DataFrame, names the column holding the count 'skill_count', and sorts the rows by that count in descending order.

The final DataFrame, df_skills_count, shows the frequency of each skill with each job title.

In [8]:
# Build the skill-frequency table in a single chain:
#   1. count the rows for every (job_skills, job_title_short) pair,
#   2. turn the resulting Series back into a DataFrame with the tally in 'skill_count',
#   3. order the pairs from most to least frequent.
df_skills_count = (
    df_skills
    .groupby(['job_skills', 'job_title_short'])
    .size()
    .reset_index(name='skill_count')
    .sort_values(by='skill_count', ascending=False)
)

df_skills_count

Unnamed: 0,job_skills,job_title_short,skill_count
1054,sql,Data Engineer,1424
822,python,Data Engineer,1401
823,python,Data Scientist,1048
1055,sql,Data Scientist,930
71,aws,Data Engineer,803
...,...,...,...
11,airtable,Machine Learning Engineer,1
1245,word,Cloud Engineer,1
10,airtable,Data Analyst,1
1264,zoom,Senior Data Engineer,1


### Crear lista de top 3 puestos de trabajo

In [9]:
# Take the first three distinct job titles in the order they appear in
# df_skills_count, then put them in alphabetical order for display.
# NOTE(review): "top" therefore depends on the row order of df_skills_count
# (sorted by skill_count above), not on the number of postings per title --
# confirm this is the intended ranking.
job_titles = sorted(df_skills_count['job_title_short'].drop_duplicates().head(3))

job_titles

['Data Analyst', 'Data Engineer', 'Data Scientist']

### Barplot para visualizar las skills según cantidad de apariciones

Se muestran las 5 skills más solicitadas para cada uno de los 3 puestos de trabajo obtenidos anteriormente.

In [1]:
# Set the theme *before* creating the figure so it is applied to this figure as well.
sns.set_theme(style='ticks')

# One stacked panel per job title. squeeze=False + ravel() keeps `ax` a 1-D array
# of Axes even when there is a single job title.
fig, ax = plt.subplots(len(job_titles), 1, squeeze=False)
ax = ax.ravel()

# Shared x-axis limit derived from the data being plotted (largest skill count among
# the selected job titles) instead of a hard-coded value sized for another dataset.
x_max = df_skills_count.loc[
    df_skills_count['job_title_short'].isin(job_titles), 'skill_count'
].max()

for i, job_title in enumerate(job_titles):
    # The first five rows for this title; this is the top five only while
    # df_skills_count is sorted by skill_count in descending order.
    # The [::-1] reversal together with invert_yaxis() below puts the most
    # requested skill at the top of the panel.
    df_plot = df_skills_count[df_skills_count['job_title_short'] == job_title].head(5)[::-1]
    sns.barplot(data=df_plot, x='skill_count', y='job_skills', ax=ax[i], hue='skill_count', palette='light:seagreen')
    ax[i].set_title(job_title)
    ax[i].invert_yaxis()
    ax[i].set_ylabel('')
    ax[i].set_xlabel('')
    # hue is only used for colouring, so drop the legend when seaborn creates one.
    legend = ax[i].get_legend()
    if legend is not None:
        legend.remove()
    ax[i].set_xlim(0, x_max * 1.05)  # same scale on every panel, with 5% headroom

# The data was filtered to Argentina, so the title must say so.
fig.suptitle('Counts of Skills Requested in Argentina Job Postings', fontsize=15)
fig.tight_layout(h_pad=0.5)  # fix the overlap between panels
plt.show()

NameError: name 'plt' is not defined