In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import Libraries and Data**

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import textwrap
import plotly.graph_objects as go
import seaborn as sns

In [None]:
# Load the Data Engineer job listings and preview the first five rows.
eng_jobs = pd.read_csv(filepath_or_buffer='../input/data-engineer-jobs/DataEngineer.csv')
eng_jobs.head(n=5)

# **Preprocessing the Data**

Let's remove the columns we won't use in this analysis: `Job Description`, `Company Name` and `Competitors`.

In [None]:
# Drop the columns that are not used in the rest of the notebook.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises KeyError for
# columns that were already removed. The redundant axis=1 is dropped because
# `columns=` already names the axis.
eng_jobs = eng_jobs.drop(
    columns=['Job Description', 'Company Name', 'Competitors'],
    errors='ignore',
)

**Cleaning the Easy Apply column values**

In [None]:
# Treat every spelling of the -1 placeholder (float, int, string) as missing
# across the whole frame.
eng_jobs.replace(to_replace=[-1.0, -1, '-1'], value=np.nan, inplace=True)

# Turn 'Easy Apply' into a boolean flag; missing entries become False.
# NOTE(review): astype(bool) maps any non-empty value to True - confirm the
# column only ever holds a "true" marker or the -1 placeholder.
easy_apply_filled = eng_jobs['Easy Apply'].fillna(False)
eng_jobs['Easy Apply'] = easy_apply_filled.astype(bool)

**Cleaning the Salary Estimate column values**

In [None]:
# Remove the parentheses and the "Glassdoor est." / "Employer est." source tag
# so that only the salary range text remains.
# regex=False makes every pattern a literal string: under regex matching a
# lone '(' or ')' is not a valid pattern and the '.' in 'est.' would match any
# character. Being explicit also makes the result independent of the pandas
# version's default for `regex`.
eng_jobs['Salary Estimate'] = (
    eng_jobs['Salary Estimate']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('Glassdoor est.', '', regex=False)
    .str.replace('Employer est.', '', regex=False)
)

In [None]:
# Split the cleaned "<min>-<max>" salary text into its two bounds.
# expand=True returns one DataFrame column per part. The previous form unpacked
# the `.str` accessor directly, which depends on iterating StringMethods - that
# iteration was deprecated and then removed in pandas 2.0.
# n=1 splits on the first '-' only, so there are never more than two parts.
salary_bounds = eng_jobs['Salary Estimate'].str.split('-', n=1, expand=True)

# Strip surrounding spaces, '$' and 'K' from each bound; rows without a salary
# estimate (NaN) are stored as 0 so the columns can be integer typed.
eng_jobs['Mini Salary'] = salary_bounds[0].str.strip(' $K').fillna(0).astype(int)
eng_jobs['Max Salary'] = salary_bounds[1].str.strip(' $K').fillna(0).astype(int)

In [None]:
# The raw salary text is no longer needed once 'Mini Salary' / 'Max Salary'
# exist. Reassigning with errors='ignore' keeps the cell safe to re-run
# (the in-place drop raised KeyError the second time).
eng_jobs = eng_jobs.drop(columns=['Salary Estimate'], errors='ignore')

**Cleaning the Founded column values**

In [None]:
# Fill missing founding years with 2000 and store the column as integers.
# NOTE(review): 2000 is hard-coded; it is described as the most frequently
# occurring year - confirm against eng_jobs['Founded'].mode().
# The result is assigned back to the frame: calling replace(..., inplace=True)
# on the attribute-accessed column is chained assignment, which pandas warns
# about and which does not update the parent frame under Copy-on-Write.
eng_jobs['Founded'] = eng_jobs['Founded'].fillna(2000).astype(int)

**Cleaning Rating column values**

In [None]:
# Fill missing ratings with 3.9.
# NOTE(review): 3.9 is a hard-coded fill value; the notebook does not show how
# it was chosen - confirm it is the intended statistic (e.g. mean or mode).
# Assign the result back instead of mutating the column in place through
# attribute access, which is not propagated to the frame under Copy-on-Write.
eng_jobs['Rating'] = eng_jobs['Rating'].fillna(3.9)

**Cleaning the Revenue column values**

In [None]:
# Treat the 'Unknown / Non-Applicable' revenue bucket as missing.
# Assign the result back to the frame: an in-place replace on the selected
# column is chained assignment and does not update `eng_jobs` under
# Copy-on-Write.
eng_jobs['Revenue'] = eng_jobs['Revenue'].replace('Unknown / Non-Applicable', np.nan)
eng_jobs.head(5)

# **Visualization**

# **Sector**

In [None]:
# easy_sec: number of Easy Apply postings per sector.
easy_sec = (
    eng_jobs[eng_jobs['Easy Apply'] == True]
    .groupby('Sector')['Easy Apply']
    .count()
    .reset_index()
)
# Easy_sec (capital E): the eight sectors with the most Easy Apply postings,
# used by the bar chart below.
Easy_sec = easy_sec.sort_values(by='Easy Apply', ascending=False).head(8)

In [None]:
# Preview the per-sector counts. This is `easy_sec` (not sorted by count),
# not the top-8 frame `Easy_sec`.
easy_sec.head()

In [None]:
# Bar chart of the sectors with the most Easy Apply postings, with the count
# written above each bar.
fig, ax = plt.subplots(figsize=[16, 5])
sns.barplot(data=Easy_sec, x='Sector', y='Easy Apply', ax=ax)
ax.set_ylabel('Count Jobs')
ax.set_yticks(np.arange(0, 65, step=5))
# Use a dedicated loop variable. The loop previously reused the name `Easy_sec`,
# which replaced the DataFrame with an int and made the cell fail on re-run.
for index, count in enumerate(Easy_sec['Easy Apply'].astype(int)):
    # Small x offset roughly centres the label; +1 lifts it above the bar.
    ax.text(x=index - 0.1, y=count + 1, s=f"{count}", fontdict=dict(fontsize=10))
plt.show()

# **Location vs Salaries**

In [None]:
# Average minimum and maximum salary per location, keeping the 20 locations
# that rank highest by average minimum salary (ties broken by average maximum).
sala_city = (
    eng_jobs
    .groupby('Location')[['Mini Salary', 'Max Salary']]
    .mean()
    .sort_values(by=['Mini Salary', 'Max Salary'], ascending=False)
    .head(20)
)

In [None]:
# Average minimum and maximum salary for the 20 locations in `sala_city`.
fig = go.Figure()

fig.add_trace(go.Bar(x=sala_city.index, y=sala_city['Mini Salary'], name='Minimum salary'))
fig.add_trace(go.Bar(x=sala_city.index, y=sala_city['Max Salary'], name='Maximum Salary'))

# Draw the two series side by side. With barmode='stack' the bar height is
# minimum + maximum, a sum that is not a meaningful salary figure and makes the
# maximum impossible to read off the axis.
fig.update_layout(title='Top 20 cities with their minimum and maximum salaries', barmode='group')

fig.show()

**LOCATION VS JOB TITLES**

In [None]:
# Number of postings per location, keeping the ten locations with the most.
loc_job = eng_jobs.groupby('Location')['Job Title'].count().reset_index()
loc_job = loc_job.sort_values('Job Title', ascending=False).head(10)

fig, ax = plt.subplots(figsize=[16, 5])
sns.barplot(data=loc_job, x='Location', y='Job Title', ax=ax)
ax.set_ylabel('Count Jobs')
ax.set_yticks(np.arange(0, 250, step=20))
# Use a dedicated loop variable. Reusing `loc_job` here overwrote the DataFrame
# with an int, leaving the name unusable for any later cell.
for index, count in enumerate(loc_job['Job Title'].astype(int)):
    # Write the count just above each bar.
    ax.text(x=index - 0.1, y=count + 1, s=f"{count}", fontdict=dict(fontsize=10))
plt.show()

# **Industry**

In [None]:
# Per industry, count the postings that have a non-missing 'Revenue' value and
# keep the five industries with the highest count.
Job_Rev = (
    eng_jobs
    .groupby('Industry')['Revenue']
    .count()
    .reset_index()
    .sort_values(by='Revenue', ascending=False)
    .head(5)
)
Job_Rev

In [None]:
# Bar chart of the `Job_Rev` counts per industry, with wrapped tick labels and
# the count written above each bar.
max_width = 15  # maximum characters per line for the wrapped x tick labels
fig, ax = plt.subplots(figsize=[16, 5])
sns.barplot(data=Job_Rev, x='Industry', y='Revenue', ax=ax)
ax.set_ylabel('Count Jobs')
ax.set_title('Industry against Revenue')
# Pass a concrete list (not a one-shot generator) of wrapped labels.
wrapped_labels = [textwrap.fill(tick.get_text(), max_width) for tick in ax.get_xticklabels()]
ax.set_xticklabels(wrapped_labels)
# Use a dedicated loop variable. Reusing `Job_Rev` here replaced the DataFrame
# with an int and made the cell fail on re-run.
for index, count in enumerate(Job_Rev['Revenue'].astype(int)):
    ax.text(x=index - 0.1, y=count + 1, s=f"{count}", fontdict=dict(fontsize=10))
plt.show()

# **Rating**

In [None]:
# Highest rating seen for each job title, then the ten titles where that
# best rating is lowest.
rate_job = (
    eng_jobs
    .groupby('Job Title')['Rating']
    .max()
    .reset_index()
    .sort_values(by='Rating', ascending=True)
    .head(10)
)
rate_job

In [None]:
# Horizontal bar chart of the ten job titles in `rate_job` (lowest best rating).
fig, ax = plt.subplots(figsize=(16, 5))
sns.barplot(data=rate_job, x='Rating', y='Job Title', ax=ax)
ax.set_ylabel('Job Title')
# Title typo fixed ('Bottowm' -> 'Bottom').
ax.set_title('Bottom 10 Job Title Ratings')
plt.show()

**CHICAGO**

Let's see how the job postings from companies headquartered in Chicago are distributed across types of ownership (private, public, government, and so on).

In [None]:
# Postings from companies whose headquarters is listed as 'Chicago, IL'.
# NOTE(review): this filters on 'Headquarters', not on the job 'Location' -
# confirm that is the intended definition of "Chicago".
is_chicago_hq = eng_jobs['Headquarters'] == 'Chicago, IL'
jobs = eng_jobs[is_chicago_hq]
jobs.head()

In [None]:
# Count the postings (non-missing 'Sector' values) for each type of ownership
# in the `jobs` subset.
own_sec = jobs.groupby('Type of ownership')['Sector'].count().reset_index()
# sort_values returns a new frame; the result was previously discarded, so the
# sort had no effect. Assign it back so the largest groups come first.
own_sec = own_sec.sort_values('Sector', ascending=False)
own_sec.head(5)

In [None]:
# Bar chart of the posting count per type of ownership, with the count written
# above each bar.
fig, ax = plt.subplots(figsize=[16, 5])
sns.barplot(data=own_sec, x='Type of ownership', y='Sector', ax=ax)
ax.set_ylabel('Count ownership')
ax.set_yticks(np.arange(0, 80, step=5))
# Use a dedicated loop variable. Reusing `own_sec` here replaced the DataFrame
# with an int and made the cell fail on re-run.
for index, count in enumerate(own_sec['Sector'].astype(int)):
    ax.text(x=index - 0.1, y=count + 1, s=f"{count}", fontdict=dict(fontsize=10))
plt.show()

Let's see the average minimum and maximum salary for the top 8 and bottom 8 sectors among the Chicago postings.

In [None]:
# Average minimum / maximum salary per sector for the `jobs` subset.
sector_salary = jobs.groupby('Sector')[['Mini Salary', 'Max Salary']].mean()

# money_min holds the 8 sectors with the HIGHEST averages (descending sort);
# it is plotted under the 'Top 8' title further down.
money_min = (
    sector_salary
    .sort_values(by=['Mini Salary', 'Max Salary'], ascending=False)
    .head(8)
    .reset_index()
)

# money_max holds the 8 sectors with the LOWEST averages (ascending sort);
# it is plotted under the 'Bottom 8' title further down.
money_max = (
    sector_salary
    .sort_values(by=['Mini Salary', 'Max Salary'], ascending=True)
    .head(8)
    .reset_index()
)

print(money_max, '\n')
print(money_min)

In [None]:
# Two stacked panels: average salaries for the top 8 sectors (money_min) and
# the bottom 8 sectors (money_max).
max_width = 15  # maximum characters per line for the wrapped x tick labels
money = [money_min, money_max]
money_title = ['Top 8', 'Bottom 8']
fig, ax = plt.subplots(2, 1, figsize=(22, 14))
fig.subplots_adjust(hspace=0.5)
for panel, frame, heading in zip(ax, money, money_title):
    # Draw the Max bars first so the Mini bars are overlaid on top of them.
    for column, colour in (('Max Salary', 'orangered'), ('Mini Salary', 'darkslateblue')):
        sns.barplot(ax=panel, data=frame, x='Sector', y=column, color=colour, label=column)
    panel.legend()
    panel.set_title(heading + ' Average Salary in Each Sector', fontsize=20)
    panel.set_ylabel('Salary', fontsize=20)
    panel.set_xlabel('Sector', fontsize=20)
    # Wrap long sector names so neighbouring tick labels do not overlap.
    wrapped_labels = [textwrap.fill(tick.get_text(), max_width) for tick in panel.get_xticklabels()]
    panel.set_xticklabels(wrapped_labels)
    panel.set_yticks(np.arange(0, 300, step=50))
    panel.tick_params(labelsize=18)

plt.show()

**If you like my work, do UPVOTE**