In [None]:
import os
import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show every file shipped under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read the three job-listing CSVs, in order: data analyst, business analyst,
# data scientist.
da, ba, ds = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/data-analyst-jobs/DataAnalyst.csv",
        "../input/business-analyst-jobs/BusinessAnalyst.csv",
        "../input/data-scientist-jobs/DataScientist.csv",
    )
)

### Data Cleaning

In [None]:
# Tag each source frame with its job family so the rows stay distinguishable
# once the three frames are stacked.
da["Job Type"] = "Data Analyst"
ba["Job Type"] = "Business Analyst"
ds["Job Type"] = "Data Scientist"

# Data cleaning: remove the extra "index" column from ba and ds.
# errors="ignore" keeps this cell re-runnable: on a second run the column is
# already gone and the drop becomes a no-op instead of raising KeyError.
ba = ba.drop(columns=["index"], errors="ignore")
ds = ds.drop(columns=["index"], errors="ignore")

# Combine the datasets. ignore_index=True gives the result a unique 0..n-1
# index; without it each frame keeps its own labels, the labels repeat, and
# any later label-based drop or index-aligned assignment hits unrelated rows.
df = pd.concat([da, ba, ds], ignore_index=True)

# Columns that the rest of the analysis does not use.
drop_columns = ['Unnamed: 0','Headquarters', 'Size', 'Founded', 'Revenue', 'Competitors', 'Easy Apply']
# drop() and dropna() return new frames, so the result has to be assigned
# back; a bare `df.dropna()` discards its result and changes nothing.
# Missing values are removed after the column drop so that only the retained
# columns decide which rows are kept.
df = df.drop(columns=drop_columns).dropna().reset_index(drop=True)

### Clean the Salary Column

In [None]:
# Every salary figure is a Glassdoor estimate given in the same "$<low>K-$<high>K" format,
# so we can use the average of the figures as the salary indicator.
# A regular expression is used to extract the figures:
import re
def get_avg_salary(salary):
    """Return the mean of the "$<n>K" figures found in a salary-estimate string.

    Parameters
    ----------
    salary : str
        Salary text such as "$37K-$66K (Glassdoor est.)".

    Returns
    -------
    float
        Mean of the extracted figures, or 0.0 when ``salary`` is not a string
        or contains no "$<n>K" figure. Callers treat 0 as "no salary".
    """
    if not isinstance(salary, str):
        # Missing values show up as NaN (a float); re.findall would raise
        # TypeError on them, so report "no salary" instead.
        return 0.0
    # Capture numbers only, so text between "$" and "K" (for example in
    # "$34-$53K") can never reach the numeric conversion and raise ValueError.
    amounts = [float(amount) for amount in re.findall(r"\$(\d+(?:\.\d+)?)K", salary)]
    if not amounts:
        return 0.0
    # Divide by the number of figures found rather than a fixed 2, so an
    # estimate with a single figure is not halved.
    return sum(amounts) / len(amounts)

# Derive the average salary of every listing from its salary-estimate text.
df["Avg Salary"] = df["Salary Estimate"].apply(get_avg_salary)
# get_avg_salary yields 0 when no salary figure is found; discard those rows.
# A boolean mask is used instead of df.drop(<index labels>) because the
# concatenated frame's index labels may repeat, and a label-based drop would
# then also remove unrelated rows that share a label with a zero-salary row.
# The index is rebuilt so that later positional and label-based operations
# agree.
df = df[df["Avg Salary"] != 0].reset_index(drop=True)

### Visualization

In [None]:
import plotly.express as px

# Salary distribution per job type. Columns are referenced by name, like the
# other charts in this notebook, instead of passing Series objects, and the
# figure gets a title so it can be read on its own.
fig = px.histogram(
    df,
    x="Avg Salary",
    color="Job Type",
    title="Average Salary Distribution by Job Type",
)
fig.show()
# As we can see, data scientists are more likely to earn a higher salary.

### Define a Good Job

In [None]:
# How do we define a good job?
# Good Job Indicator = Salary * 0.8 + Rating * 0.2
# Both inputs are min-max scaled to [0, 1] first, and the weights are applied
# as 80 and 20, so the indicator ranges from 0 to 100.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# fit_transform returns a plain array whose rows are in the same order as df.
# Its columns are assigned positionally: wrapping the array in a new DataFrame
# would give it a fresh 0..n-1 index, and the assignment would then align on
# index labels and attach scores to the wrong rows whenever df's index has
# gaps or repeated labels.
scaled = scaler.fit_transform(df[["Rating", "Avg Salary"]])
df["Rating Scaled"] = scaled[:, 0]
df["Avg Salary Scaled"] = scaled[:, 1]

df["Good Job Indicator"] = df["Rating Scaled"]*20 + df["Avg Salary Scaled"]*80
# Rows whose indicator could not be computed (NaN) are removed with dropna
# rather than a label-based drop, so only the affected rows disappear.
df = df.dropna(subset=["Good Job Indicator"])
df = df.sort_values(by="Good Job Indicator", ascending=False)
# Best 5 jobs. Kept as the last expression so the notebook actually shows it.
df.head()

In [None]:
import plotly.express as px

# Histogram of the composite job score, coloured by job type.
score_labels = dict(
    title="Job Score Distribution",
    xaxis_title="Job Score",
    yaxis_title="Num of Jobs",
)
fig = px.histogram(df, color="Job Type", x="Good Job Indicator")
fig.update_layout(**score_labels)
fig.show()

In [None]:
# Let's define jobs scoring at least GOOD_JOB_THRESHOLD as great jobs
# and check their company names, location, job description and industry.
# NOTE(review): the comment here used to say "more than 60" while the code
# filtered on >= 40; the code's value is kept. Confirm which was intended.
GOOD_JOB_THRESHOLD = 40
good_job_df = df[df["Good Job Indicator"] >= GOOD_JOB_THRESHOLD]
# print(good_job_df.shape) #1689 out of 9571 jobs.

### More Visualizations

In [None]:
# Count good jobs per location and keep the 20 locations with the most listings.
location_counts = good_job_df.groupby(by="Location").count().reset_index()
location_counts = location_counts.sort_values("Job Title", ascending=False)
city_rank = location_counts.head(20)[["Location", "Job Title"]]
# After count(), the "Job Title" column holds the number of listings.
city_rank = city_rank.rename(columns={"Job Title": "Num of Jobs"})

# Bar chart of listings per location, shaded by the same count.
fig = px.bar(city_rank, color='Num of Jobs', x='Location', y='Num of Jobs')
fig.show()

In [None]:
# Count good jobs per (location, job type) pair and keep the 20 largest pairs.
pair_counts = good_job_df.groupby(by=["Location", "Job Type"]).count().reset_index()
pair_counts = pair_counts.sort_values("Job Title", ascending=False)
city_rank = pair_counts.head(20)[["Location", "Job Title", "Job Type"]]
# After count(), the "Job Title" column holds the number of listings.
city_rank = city_rank.rename(columns={"Job Title": "Num of Jobs"})

# Bar chart of listings per location, with bars coloured by job type.
fig = px.bar(
    city_rank,
    title="Best Jobs Distribution among Cities",
    color="Job Type",
    x="Location",
    y="Num of Jobs",
)
fig.show()

In [None]:
# Count good jobs per (industry, job type) pair and keep the 20 largest pairs.
industry_counts = good_job_df.groupby(by=["Industry", "Job Type"]).count().reset_index()
industry_counts = industry_counts.sort_values("Job Title", ascending=False)
industry_rank = industry_counts.head(20)[["Industry", "Job Title", "Job Type"]]
# After count(), the "Job Title" column holds the number of listings.
industry_rank = industry_rank.rename(columns={"Job Title": "Num of Jobs"})
# Remove rows whose industry is the "-1" placeholder. This happens after the
# top-20 cut, so fewer than 20 rows may remain.
industry_rank = industry_rank[industry_rank["Industry"] != "-1"]

In [None]:

# Pie chart of the number of good jobs per industry, built from the whole
# industry ranking. The three identical `industry_rank[...]` expressions that
# used to sit here were removed: they were not the last expression in the
# cell, so nothing was displayed and their results were thrown away.
fig1 = px.pie(
    industry_rank,
    values='Num of Jobs',
    names='Industry',
    title='Good Jobs Industry Distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
    width=900,
    height=600,
)
fig1.show()

In [None]:
# Draw one industry pie chart for each job type present in the ranking.
job_types = industry_rank["Job Type"].unique().tolist()
colors = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

industry_rank_chart = []  # per-job-type slices of industry_rank, in job_types order
fig = []                  # the matching pie figures, in the same order
# Iterate over the job types actually found instead of a fixed range(0, 3):
# industry_rank only holds the top rows of the ranking, so a job type can be
# missing from it, and indexing job_types[2] would then raise IndexError.
for job_type in job_types:
    subset = industry_rank[industry_rank["Job Type"] == job_type]
    industry_rank_chart.append(subset)

    pie = px.pie(subset, values='Num of Jobs', names='Industry',
                 title=job_type + ' Industry Distribution',
                 width=600, height=500)
    # Put the labels inside the slices and outline each slice.
    pie.update_traces(textposition='inside', textinfo='label+percent',
                      marker=dict(colors=colors, line=dict(color='#000232', width=2)))
    fig.append(pie)
    pie.show()