In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px 

## **First, I'd like to start with my findings**

* The South region has the most states in the dataset; however, the highest number of job postings comes from the West region.
* The Northeast region has the fewest states in the dataset (only 3), yet it has almost as many job postings as the South region with its 7 states.
* California (West region) has the highest number of job postings of any state.
* Looking at the industry of the posting companies, IT Services and Staffing & Outsourcing companies take first place in open positions (unknown industries were filtered out).
* Private companies founded around the 2000s post the most (unknown founding years were filtered out).
* When it comes to salary, the derived mean salary is highest in the West region; however, the salary range in the West is also the widest.
* The smallest difference between the lower and upper salary boundaries is in Illinois (Midwest); California has the largest difference, which might drive up the results for the whole region.
* Real Estate shows the largest difference in salaries per sector, whereas Travel and Tourism shows the smallest.

## **And how did I get there?**

## **Load dataframe**

In [None]:
# Load the raw job postings, then discard the 'Unnamed: 0' column
# (presumably a row index that was saved with the CSV -- verify against the file).
raw_postings = pd.read_csv('/kaggle/input/data-analyst-jobs/DataAnalyst.csv')
dataanalysts = raw_postings.drop(columns=['Unnamed: 0'])

In [None]:
# Preview the first five rows to sanity-check the load.
dataanalysts.head(n=5)

## **Basic data cleaning**

In [None]:
# Basic text clean-up of the postings table. Every step is a no-op on data that
# has already been cleaned, so this cell can be re-run without raising.

# Drop the ' (Glassdoor est.)' suffix (and anything after it) from the salary text.
dataanalysts['Salary Estimate'] = (
    dataanalysts['Salary Estimate'].str.partition(' (Glassdoor est.)')[0]
)

# Flatten multi-line descriptions onto a single line.
dataanalysts['Job Description'] = (
    dataanalysts['Job Description'].str.replace('\n', ' ', regex=False)
)

# Keep only the first line of the company name; whatever follows the newline is dropped.
dataanalysts['Company Name'] = dataanalysts['Company Name'].str.partition('\n')[0]

# 'to' -> '-' and remove the word 'employees'. regex=False makes the replacements
# literal regardless of the pandas version's default, and strip() removes the
# whitespace that deleting 'employees' would otherwise leave behind.
dataanalysts['Size'] = (
    dataanalysts['Size']
    .str.replace('to', '-', regex=False)
    .str.replace('employees', '', regex=False)
    .str.strip()
)

# errors='ignore' keeps the cell re-runnable once these columns are already gone.
dataanalysts = dataanalysts.drop(columns=['Easy Apply', 'Competitors'], errors='ignore')


### **Load the lookup table that maps each state to its US census region (source: https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv)**

In [None]:
# State-to-region lookup table; its 'State Code' column is the join key for the merge
# with the postings table.
regions_csv_path = '../input/us-regions-states/us%20census%20bureau%20regions%20and%20divisions.csv'
regions = pd.read_csv(regions_csv_path)

## **Merge regions and states in dataframe**

In [None]:
# Attach the lookup table's columns to every posting. The join key is the last two
# characters of 'Location', matched against the lookup's 'State Code'.
posting_state_codes = dataanalysts['Location'].str[-2:]
dataanalysts = (
    dataanalysts
    # Remove lookup columns left over from a previous run of this cell, so re-running
    # it does not produce suffixed duplicates ('State_x', 'State_y', ...).
    .drop(columns=[col for col in regions.columns if col in dataanalysts.columns])
    .merge(
        regions,
        how='left',
        left_on=posting_state_codes,
        right_on=['State Code'],
        validate='many_to_one',  # fail loudly if a state code appears twice in `regions`
    )
    # Merging on an array-like key makes pandas add a helper 'key_0' column; nothing
    # downstream uses it.
    .drop(columns='key_0', errors='ignore')
)

## **Number of job postings per state**

In [None]:
# Postings per state. rename_axis + reset_index(name=...) yields the columns
# 'State' and 'Count' on every pandas version; the older
# reset_index().rename({'index': ...}) pattern produces the wrong column names on
# pandas >= 2.0, where value_counts() already names its index after the column.
barplot_state = (
    dataanalysts['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Count')
)
px.bar(barplot_state, x='State', y='Count', title="Number of job postings per state")

## **Number of job postings per state on a map**

In [None]:
# Count the postings for each state code.
postings_by_code = dataanalysts.groupby('State Code').size()
df_map = postings_by_code.reset_index(name='Number of postings')

In [None]:
# Add each state's region to the per-state counts. The (state code, region) pairs are
# de-duplicated *before* the join so the merge never expands to one row per posting,
# and the join key is named explicitly instead of relying on shared column names.
state_regions = dataanalysts[['State Code', 'Region']].drop_duplicates()
df_map = (
    df_map[['State Code', 'Number of postings']]  # explicit selection keeps re-runs clean
    .merge(state_regions, on='State Code')
    .reset_index(drop=True)
)

In [None]:

# Choropleth of posting counts per state; the region is carried along as hover text.
state_trace = go.Choropleth(
    locationmode='USA-states',
    locations=df_map['State Code'],
    z=df_map['Number of postings'].astype(int),
    text=df_map['Region'],
)
fig = go.Figure(data=state_trace)
fig.update_layout(title_text='Job Postings by States', geo_scope='usa')  # limit map scope to USA
fig.update_traces(
    hovertemplate='State: %{location} <br>Region: %{text} <br>Number of postings: %{z} <extra></extra>'
)


## **Number of states in one region**

In [None]:
# Number of distinct states observed in each region.
distinct_states = dataanalysts.groupby('Region')['State'].nunique()
states_in_region = distinct_states.reset_index(name='Count State')

In [None]:
# Bar chart of the state count per region; bars are coloured by that same count.
px.bar(
    states_in_region,
    x="Region",
    y="Count State",
    color="Count State",
    labels={'Count State': 'Count'},
    title="States Count within Region",
)

In [None]:
# Sunburst over the Region -> State hierarchy, built from the postings table.
px.sunburst(
    dataanalysts,
    path=['Region', 'State'],
    title='States and Region Breakdown based on the Number of Job Postings',
)

## **And overall number of job postings per region**

In [None]:
# Postings per region. rename_axis + reset_index(name=...) yields the columns
# 'Region' and 'Count' on every pandas version; the older
# reset_index().rename({'index': ...}) pattern produces the wrong column names on
# pandas >= 2.0, where value_counts() already names its index after the column.
barplot_region = (
    dataanalysts['Region']
    .value_counts()
    .rename_axis('Region')
    .reset_index(name='Count')
)
px.bar(barplot_region, x='Region', y='Count', title="Number of job postings per region")

## **And job postings per region displayed on a map**

In [None]:
# Total postings per region, joined back onto the per-state table so that every
# state row also carries its region's total.
postings_per_region = (
    dataanalysts
    .groupby('Region')
    .size()
    .reset_index(name='Number of postings per region')
)
df_map_region = (
    pd.merge(df_map, postings_per_region)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Hover text: the region name followed by the state's own posting count.
state_count_labels = df_map_region['Number of postings'].astype('str')
df_map_region['text'] = (
    df_map_region['Region'] + '<br>' + 'Number of postings per state: ' + state_count_labels
)

# Every state is coloured by the total number of postings of its region.
region_trace = go.Choropleth(
    locationmode='USA-states',  # how the entries of `locations` are interpreted
    locations=df_map_region['State Code'],
    z=df_map_region['Number of postings per region'].astype(int),  # colour-coded value
    text=df_map_region['text'],
)
fig = go.Figure(data=region_trace)
fig.update_layout(title_text='Job Postings per Region', geo_scope='usa')  # limit map scope to USA
fig.update_traces(
    hovertemplate=(
        'State: %{location} <br>Region: %{text} '
        '<br>Number of postings per region: %{z}<extra></extra>'
    )
)



## **Industry in which the posting companies are active**

In [None]:
# Share of postings per industry; rows carrying the '-1' placeholder are excluded.
known_industry = dataanalysts[dataanalysts['Industry'] != '-1']
fig = px.pie(known_industry, names='Industry')
fig.update_traces(textposition='inside')
# Hide slice labels that would need a font size below 12 to fit.
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')

## **Year of founding, number of job postings and type of ownership**

In [None]:
# Number of postings for every (founding year, ownership type) pair.
listings_by_year = (
    dataanalysts
    .groupby(['Founded', 'Type of ownership'])
    .size()
    .reset_index(name='Count')
)
# Rows with Founded == -1 are left out of the chart.
stacked_df = listings_by_year[listings_by_year['Founded'] != -1]

In [None]:
# Stacked bars: one bar per founding year, split by type of ownership.
px.bar(
    stacked_df,
    x="Founded",
    y="Count",
    color="Type of ownership",
    title='Number of Job Listings vs Year when a Firm was established',
)

## **Salary ranges comparisons**

### **Getting mean salary range**

In [None]:
# Parse salary estimates of the form '$<low>K-$<high>K' into numeric dollar bounds.
# One regex extraction is used instead of positional slicing, so stray whitespace or
# a trailing suffix after the range does not break the parse. Estimates that do not
# match the pattern yield NaN for both bounds.
salary_bounds = dataanalysts['Salary Estimate'].str.extract(
    r'\$(\d+(?:\.\d+)?)K\s*-\s*\$(\d+(?:\.\d+)?)K'
)
dataanalysts['lower_boundary'] = pd.to_numeric(salary_bounds[0]) * 1000
dataanalysts['upper_boundary'] = pd.to_numeric(salary_bounds[1]) * 1000

# Midpoint of the advertised range.
dataanalysts['salary range mean'] = (
    dataanalysts['lower_boundary'] + dataanalysts['upper_boundary']
) / 2

In [None]:
# Distribution of the per-posting salary midpoint, one box per region.
px.box(
    dataanalysts,
    x="Region",
    y="salary range mean",
    labels={'salary range mean': 'mean salary range'},
    title='Salary Range per Region',
)

### **Difference between lower and upper boundary**

In [None]:
# Width of the advertised salary range for each posting (upper minus lower bound).
dataanalysts['difference'] = dataanalysts['upper_boundary'].sub(dataanalysts['lower_boundary'])

In [None]:
# Mean salary-range width per state, sorted from narrowest to widest.
# NOTE(review): .iloc[1:] discards the first state in groupby (sorted-key) order --
# confirm this exclusion is intentional.
mean_difference_by_state = dataanalysts.groupby('State')['difference'].mean()
difference = (
    mean_difference_by_state
    .iloc[1:]
    .reset_index()
    .sort_values(by='difference')
)

In [None]:
# One bar per state showing the mean gap between the lower and upper salary bounds.
px.bar(
    difference,
    x='State',
    y='difference',
    title='Mean Difference between lower and upper Salary Boundary per State',
)

**And detailed breakdown**

In [None]:
# Mean upper bound, lower bound and range width per state, sorted by range width.
# The columns are selected with a list: selecting several groupby columns with a
# bare tuple is an error on pandas >= 2.0.
# NOTE(review): .iloc[1:] discards the first state in groupby (sorted-key) order --
# confirm this exclusion is intentional.
boundary_state = (
    dataanalysts
    .groupby('State')[['upper_boundary', 'lower_boundary', 'difference']]
    .mean()
    .iloc[1:]
    .reset_index()
    .sort_values(by='difference')
)

In [None]:
# Grouped bars: mean lower and upper salary bound side by side for every state.
state_bars = [
    go.Bar(name=label, x=boundary_state['State'], y=boundary_state[column])
    for label, column in (
        ('Lower Boundary', 'lower_boundary'),
        ('Upper Boundary', 'upper_boundary'),
    )
]
fig = go.Figure(data=state_bars)
# 'group' places the two bars of a state next to each other instead of stacking them.
fig.update_layout(
    barmode='group',
    title='Lower and upper mean Boundaries per State, ordered from Lowest to Highest Difference',
)

## **Breakdown of Salaries per Sector** 

In [None]:
# Mean upper bound, lower bound and range width per sector, sorted by range width.
# The columns are selected with a list: selecting several groupby columns with a
# bare tuple is an error on pandas >= 2.0.
boundary_sector = (
    dataanalysts
    .groupby('Sector')[['upper_boundary', 'lower_boundary', 'difference']]
    .mean()
    .sort_values(by='difference')
    .reset_index()
)
# Leave out the '-1' placeholder sector.
boundary_sector = boundary_sector[boundary_sector['Sector'] != '-1']

fig = go.Figure(data=[
    go.Bar(name='Lower Boundary', x=boundary_sector['Sector'], y=boundary_sector['lower_boundary']),
    go.Bar(name='Upper Boundary', x=boundary_sector['Sector'], y=boundary_sector['upper_boundary'])
])
# 'group' places the two bars of a sector next to each other instead of stacking them.
fig.update_layout(barmode='group', title = 'Lower and Upper Mean Boundaries by Sector from Lowest to Highest Difference')