In [None]:
!pip -q install --upgrade seaborn

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud

In [None]:
# Global plot styling: colour palette first, then the grid style.
PALETTE_NAME = 'Spectral'
sns.set_palette(PALETTE_NAME)
sns.set_style("whitegrid")

# Read the three job-listing CSVs (analyst, engineer, scientist) in that order.
df_analyst, df_engineer, df_scientist = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/data-analyst-jobs/DataAnalyst.csv',
        '../input/data-engineer-jobs/DataEngineer.csv',
        '../input/data-scientist-jobs/DataScientist.csv',
    )
)

# Stack the three frames into one; each frame's original row labels are kept here.
source_frames = [df_analyst, df_engineer, df_scientist]
df = pd.concat(source_frames)

In [None]:
# Drop columns that are not used anywhere in the analysis below.
df = df.drop(['Unnamed: 0', 'index', 'Headquarters', 'Type of ownership', 'Competitors', 'Easy Apply', 'Revenue'], axis=1)

# Replace the '-1' placeholder with 'Unknown'. The pattern is anchored so only cells
# whose whole value is '-1' are rewritten; an unanchored '-1' would also corrupt
# free text such as "5-10 years" inside the job descriptions.
df = df.replace(r'^-1$', 'Unknown', regex=True)

# Listings without a salary estimate cannot be used for the salary analysis.
df = df[df['Salary Estimate'] != 'Unknown']

# Renumber rows 0..n-1 after the concat and the filtering above.
df = df.reset_index(drop=True)

In [None]:
# Code adapted from https://www.kaggle.com/taha07/data-scientists-jobs-analysis-visualization
hours_per_week = 40
weeks_per_year = 52


def _parse_salary_estimate(salary_estimate):
    """Split one salary-estimate string into ``(lower, upper)`` bounds in 1000$/year.

    Two layouts are handled, matching the string operations below:

    * yearly ranges with a ``K`` suffix followed by a parenthesised source,
      e.g. ``"$37K-$66K (Glassdoor est.)"``;
    * hourly ranges containing ``"Per Hour"``, e.g. ``"$10-$26 Per Hour(...)"``,
      converted to a yearly figure with ``hours_per_week`` and ``weeks_per_year``.

    Both bounds are truncated to whole integers (so 20.8 becomes 20).

    Raises:
        ValueError: if the string does not follow one of the two layouts.
    """
    salary_estimate = salary_estimate.replace("$", "")
    lower, upper = salary_estimate.split("-")

    if "Per Hour" in salary_estimate:
        upper, _ = upper.split("Per")
        upper = upper.strip()
        # hourly rate -> yearly salary in thousands of dollars
        lower = int(lower) * hours_per_week * weeks_per_year * (1/1000)
        upper = int(upper) * hours_per_week * weeks_per_year * (1/1000)
    else:
        lower = lower.replace("K", "")
        upper, _ = upper.split("(")
        upper = upper.replace("K", "").strip()

    return int(lower), int(upper)


# Parse every listing once and assign whole columns positionally. This avoids one
# slow `df.loc[i, ...]` write per row and does not require the index to be 0..n-1.
salary_bounds = [_parse_salary_estimate(text) for text in df["Salary Estimate"]]
df["salary_estimate_lower_bound"] = [float(lower) for lower, _ in salary_bounds]
df["salary_estimate_upper_bound"] = [float(upper) for _, upper in salary_bounds]

# The raw text column is no longer needed once the numeric bounds exist.
df = df.drop('Salary Estimate', axis=1)

In [None]:
# Keep only the text before the first newline of each company name; whatever follows
# the newline is discarded. Non-string values (e.g. missing names) pass through
# unchanged. Splitting at most once means a name with several newlines no longer
# raises ValueError, and mapping the column avoids per-row `df.loc` writes that
# depend on a 0..n-1 index.
df["Company Name"] = df["Company Name"].map(
    lambda name: name.split("\n", 1)[0] if isinstance(name, str) else name
)

In [None]:
# Collapse multi-line job descriptions onto one line: every newline becomes a space.
raw_descriptions = df["Job Description"]
df["Job Description"] = raw_descriptions.replace(to_replace="\n", value=" ", regex=True)

### Which sectors & industries need employees?

In [None]:
# Ten most frequent sectors, ignoring listings whose sector is 'Unknown'.
# errors='ignore' keeps the cell from raising KeyError when no listing has an
# 'Unknown' sector.
df_sectors = (
    df['Sector']
    .value_counts()
    .drop(['Unknown'], errors='ignore')
    .sort_values(ascending=False)
    .head(n=10)
)

fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
# Draw on the axes created above explicitly instead of relying on the current axes.
df_sectors.plot.pie(autopct='%1.1f%%', ylabel="", ax=axs);

### Locations and Companies with the highest number of jobs in Data Science

In [None]:
# Fifteen locations and fifteen companies with the most job listings.
df_locations = df['Location'].value_counts().sort_values(ascending=False).head(n=15)
df_companies = df['Company Name'].value_counts().sort_values(ascending=False).head(n=15)

# Side-by-side bar charts: locations on the left, companies on the right.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(22, 5))
bar_panels = zip(axs, (df_locations, df_companies), ("Location", "Company"))
for panel_axes, listing_counts, panel_xlabel in bar_panels:
    listing_counts.plot.bar(ylabel="No. of job listings", xlabel=panel_xlabel, ax=panel_axes)

### Which of the Data Science positions is the most popular?

In [None]:
# Ten most frequent job titles, shown as a bar chart.
top_job_titles = df['Job Title'].value_counts().sort_values(ascending=False).head(n=10)
top_job_titles.plot.bar(figsize=(22, 5), ylabel="No. of job listings");

### Which Job Titles offer the highest salary?

In [None]:
# Work on a copy so the role labels do not leak into the main frame.
df_pay = df.copy()
df_pay['Role'] = 'Other'

# Lower-case the titles once instead of once per role.
job_titles_lower = df_pay['Job Title'].str.lower()

# (substring to look for in the lower-cased title, role label).
# Order matters: a later match overwrites an earlier one.
role_keywords = [
    ('data analyst', 'Data Analyst'),
    ('data engineer', 'Data Engineer'),
    ('data scientist', 'Data Scientist'),
    ('machine learning engineer', 'ML Engineer'),
    ('business intelligence analyst', 'BI Analyst'),
]
for title_keyword, role_label in role_keywords:
    # regex=False: the keywords are plain text. na=False: a missing title counts as
    # "no match" instead of producing a NaN mask that `.loc` would reject.
    is_role = job_titles_lower.str.contains(title_keyword, regex=False, na=False)
    df_pay.loc[is_role, 'Role'] = role_label

# Keep only listings that matched one of the roles, and only the columns being plotted.
df_pay = df_pay.loc[df_pay['Role'] != 'Other', ['Role', 'salary_estimate_upper_bound']]

# One density curve of the upper salary bound per role.
fig, axs = plt.subplots(figsize=(22, 5));
sns.kdeplot(data=df_pay, x="salary_estimate_upper_bound", hue="Role", ax=axs);
axs.set_xlabel("Salary (* 1000$/year)");

### The most popular keywords in Data Science job offers

In [None]:
def _keyword_pattern(keyword):
    """Compile a case-insensitive regex that matches ``keyword`` as its own token.

    A plain substring test counts "java" inside "javascript", "scala" inside
    "scalable", "aws" inside "laws", "hive" inside "archive" and "intern" inside
    "international". To avoid that:

    * a keyword starting with a letter or digit must not be preceded by one;
    * a keyword ending with a letter may take an optional plural "s" and must not
      be followed by another letter (so "masters" and "python3" still count);
    * a keyword ending with a digit must not be followed by a letter or digit;
    * keywords starting or ending with punctuation (".NET", "C++", "C#") get no
      boundary on that side, so "asp.net" still counts for ".NET".
    """
    import re  # function-scope import: `re` is not among this notebook's top-level imports

    prefix = r"(?<![a-z0-9])" if keyword[0].isalnum() else ""
    last_char = keyword[-1]
    if last_char.isalpha():
        suffix = r"s?(?![a-z])"
    elif last_char.isdigit():
        suffix = r"(?![a-z0-9])"
    else:
        suffix = ""
    return re.compile(prefix + re.escape(keyword) + suffix, re.IGNORECASE)


def _keyword_shares(keywords, category_column, df_keywords_list, num_listings):
    """Return the share of texts in ``df_keywords_list`` that mention each keyword.

    Args:
        keywords: display names of the keywords to look for.
        category_column: name of the output column holding the keyword names.
        df_keywords_list: iterable of job-description strings.
        num_listings: denominator used to turn hit counts into shares.

    Returns:
        DataFrame with columns ``[category_column, 'Count']`` where ``Count`` is
        ``hits / num_listings`` (a 0-1 fraction), sorted by ``Count`` descending.
    """
    rows = []
    for keyword in keywords:
        pattern = _keyword_pattern(keyword)
        # Each description is counted at most once per keyword.
        hits = sum(1 for text in df_keywords_list if pattern.search(text))
        rows.append((keyword, hits))
    shares = pd.DataFrame(rows, columns=[category_column, 'Count'])
    shares['Count'] = shares['Count'].div(1.0 * num_listings)
    return shares.sort_values(["Count"], axis=0, ascending=False)


def get_keywords_languages(df_keywords_list, num_listings):
    """Share of job descriptions mentioning each (programming) language.

    Returns a DataFrame with columns ``['Languages', 'Count']`` (``Count`` is a
    0-1 fraction of ``num_listings``), sorted by ``Count`` descending.
    """
    languages = ["Python", "C++", "MATLAB", ".NET", "C#", "JavaScript", "HTML", "Bash", "Java", "Scala", "SQL"]
    return _keyword_shares(languages, 'Languages', df_keywords_list, num_listings)


def get_keywords_tools(df_keywords_list, num_listings):
    """Share of job descriptions mentioning each big-data tool or technology.

    Returns a DataFrame with columns ``['Tools', 'Count']`` (``Count`` is a
    0-1 fraction of ``num_listings``), sorted by ``Count`` descending.
    """
    big_data  = ["Big Data", "ETL", "Hadoop", "Spark", "Impala", "Cassandra", "Kafka", "HDFS", "HBase", "Hive", "Kubernetes", "Kubeflow", "Airflow", "BigQuery"]
    return _keyword_shares(big_data, 'Tools', df_keywords_list, num_listings)


def get_keywords_cloud(df_keywords_list, num_listings):
    """Share of job descriptions mentioning each cloud platform or service.

    Returns a DataFrame with columns ``['Cloud', 'Count']`` (``Count`` is a
    0-1 fraction of ``num_listings``), sorted by ``Count`` descending.
    """
    # NOTE(review): "Route S3" is probably meant to be "Route 53", and "Dynamo DB"
    # will not match the usual spelling "DynamoDB" -- confirm before changing labels.
    cloud = ["AWS", "GCP","Azure", "Google Cloud", "S3","Redshift","EC2","Lambda","Route S3","Dynamo DB"]
    return _keyword_shares(cloud, 'Cloud', df_keywords_list, num_listings)


def get_keywords_exp_edu(df_keywords_list, num_listings):
    """Share of job descriptions mentioning each experience/education keyword.

    Returns a DataFrame with columns ``['Experience/Education', 'Count']``
    (``Count`` is a 0-1 fraction of ``num_listings``), sorted by ``Count`` descending.
    """
    exp_edu  = ["BSc", "MSc","PhD", "Full-Time", "Intern", "Junior", "Senior", "Remote", "Master", "Doctorate", "Bachelor", "Post-Doc"]
    return _keyword_shares(exp_edu, 'Experience/Education', df_keywords_list, num_listings)

In [None]:
def _plot_keyword_shares(position, keyword_counter, category_column, xlabel):
    """Bar chart of the ten most mentioned keywords for listings of one position.

    Args:
        position: text that must appear in the job title (case-insensitive,
            matched literally).
        keyword_counter: one of the ``get_keywords_*`` functions; called with the
            lower-cased job descriptions and the number of matching listings.
        category_column: name of the keyword column in the counter's result.
        xlabel: x-axis label of the chart.
    """
    # Build the title mask once. regex=False treats `position` as plain text and
    # na=False treats a missing title as "no match".
    matches_position = df['Job Title'].str.lower().str.contains(position.lower(), regex=False, na=False)
    job_keywords = [text.lower() for text in df.loc[matches_position, "Job Description"].tolist()]
    num_listings = len(job_keywords)

    top_keywords = keyword_counter(job_keywords, num_listings).head(10)
    # The counters return 0-1 fractions; scale to percent so the values agree with
    # the "%" in the y-axis label.
    top_keywords = top_keywords.assign(Count=top_keywords["Count"] * 100)

    plt.figure(figsize=(22, 6))
    sns.barplot(data=top_keywords, x=category_column, y="Count");
    plt.xlabel(xlabel)
    plt.ylabel(f"%  of {position} listings");


def plot_programming_languages(position):
    """Plot the percentage of ``position`` listings mentioning each (programming) language."""
    _plot_keyword_shares(position, get_keywords_languages, "Languages",
                         "Keywords related to (Programming) Languages")


def plot_tools_and_tech(position):
    """Plot the percentage of ``position`` listings mentioning each tool or technology."""
    _plot_keyword_shares(position, get_keywords_tools, "Tools",
                         "Keywords related to Tools and Technologies")


def plot_cloud(position):
    """Plot the percentage of ``position`` listings mentioning each cloud keyword."""
    _plot_keyword_shares(position, get_keywords_cloud, "Cloud",
                         "Keywords related to Cloud Computing and Storage")


def plot_exp_edu(position):
    """Plot the percentage of ``position`` listings mentioning each experience/education keyword."""
    _plot_keyword_shares(position, get_keywords_exp_edu, "Experience/Education",
                         "Keywords related to Experience/Education")

#### Data Analyst

In [None]:
# Programming-language keywords in listings whose title contains 'Data Analyst'.
plot_programming_languages(position='Data Analyst')

In [None]:
# Tool and technology keywords in listings whose title contains 'Data Analyst'.
plot_tools_and_tech(position='Data Analyst')

In [None]:
# Cloud computing and storage keywords in listings whose title contains 'Data Analyst'.
plot_cloud(position='Data Analyst')

In [None]:
# Experience and education keywords in listings whose title contains 'Data Analyst'.
plot_exp_edu(position='Data Analyst')

#### Data Engineer

In [None]:
# Programming-language keywords in listings whose title contains 'Data Engineer'.
plot_programming_languages(position='Data Engineer')

In [None]:
# Tool and technology keywords in listings whose title contains 'Data Engineer'.
plot_tools_and_tech(position='Data Engineer')

In [None]:
# Cloud computing and storage keywords in listings whose title contains 'Data Engineer'.
plot_cloud(position='Data Engineer')

In [None]:
# Experience and education keywords in listings whose title contains 'Data Engineer'.
plot_exp_edu(position='Data Engineer')

#### Data Scientist

In [None]:
# Programming-language keywords in listings whose title contains 'Data Scientist'.
plot_programming_languages(position='Data Scientist')

In [None]:
# Tool and technology keywords in listings whose title contains 'Data Scientist'.
plot_tools_and_tech(position='Data Scientist')

In [None]:
# Cloud computing and storage keywords in listings whose title contains 'Data Scientist'.
plot_cloud(position='Data Scientist')

In [None]:
# Experience and education keywords in listings whose title contains 'Data Scientist'.
plot_exp_edu(position='Data Scientist')

#### Machine Learning Engineer

In [None]:
# Programming-language keywords in listings whose title contains 'Machine Learning Engineer'.
plot_programming_languages(position='Machine Learning Engineer')

In [None]:
# Tool and technology keywords in listings whose title contains 'Machine Learning Engineer'.
plot_tools_and_tech(position='Machine Learning Engineer')

In [None]:
# Cloud computing and storage keywords in listings whose title contains 'Machine Learning Engineer'.
plot_cloud(position='Machine Learning Engineer')

In [None]:
# Experience and education keywords in listings whose title contains 'Machine Learning Engineer'.
plot_exp_edu(position='Machine Learning Engineer')