In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About the dataset

*Amidst the pandemic many people lost their jobs. With this dataset it is possible to hone the job search so that more people in need can find employment.
This dataset was created by picklesueat and contains more than 2000 job listings for data analyst positions, with features such as:*
* *Salary Estimate*
* *Location*
* *Company Rating*
* *Job Description*
* *and more.*

# Loading the dataset and packages

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
# Read the job-listings CSV from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/data-analyst-jobs/DataAnalyst.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

# Cleaning the data
We start with cleaning the data. We drop the **Unnamed** column, check for missing values and modify the **Salary Estimate** column for further analysis.

In [None]:
# Remove the 'Unnamed: 0' column (presumably the index saved with the CSV).
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run:
# the original inplace drop raised KeyError on a second execution because
# the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Number of missing values per column (shown as the cell output).
df.isnull().sum()

In [None]:
# Show the rows that have no company name.
df.loc[df['Company Name'].isna()]

We see that there is one missing value in the data. This entry also shows that some entries make no sense, i.e. the value -1 in the Size, Revenue and other columns. We will change all these entries into NaN values.


In [None]:
# List the distinct values stored in the 'Easy Apply' column.
df['Easy Apply'].unique()

In [None]:
# Recode the '-1' placeholder in 'Easy Apply' as the string 'False'.
df['Easy Apply'] = df['Easy Apply'].replace({'-1': 'False'})

In [None]:
# Inspect the dtype of every column.
df.dtypes

In [None]:
# Negative ratings and founding years become NaN.
df['Rating'] = df['Rating'].mask(df['Rating'] < 0)
df['Founded'] = df['Founded'].mask(df['Founded'] < 0)

# Every remaining '-1' string placeholder in the frame becomes NaN as well.
df = df.replace('-1', np.nan)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Keep only the text before the first newline in each company name.
df['Company Name'] = df['Company Name'].str.split('\n', n=1).str.get(0)

In [None]:
# Split the salary text into a lower and an upper part at the FIRST '-'.
# n=1 caps the result at two columns, so a stray extra '-' in any row cannot
# make the two-column assignment fail with a shape mismatch.
df[['Salary_min', 'Salary_max']] = df['Salary Estimate'].str.split('-', n=1, expand=True)

In [None]:
# Pull the first run of digits out of each bound and multiply by 1000
# (presumably the figures are quoted in thousands - confirm against the raw text).
# The pattern is a raw string because '\d' is an invalid escape in a normal
# string literal and triggers a warning on newer Python versions.
# expand=False makes extract return a Series instead of a one-column DataFrame.
df['Salary_min'] = df['Salary_min'].str.extract(r'(\d+)', expand=False).astype('float') * 1000
df['Salary_max'] = df['Salary_max'].str.extract(r'(\d+)', expand=False).astype('float') * 1000

# Midpoint of the two bounds.
df['Salary_avg'] = (df['Salary_max'] + df['Salary_min']) / 2

Our data frame:

In [None]:
# Preview the frame after the cleaning steps above.
df.head()

# Companies

In [None]:
# Count the non-null job titles per company; result is a one-column frame.
df_comp = df.groupby('Company Name')[['Job Title']].count()

In [None]:
# Keep the 15 companies with the most listings, largest first.
df_comp = df_comp.sort_values(by='Job Title', ascending=False).head(15)

In [None]:
# Bar chart with one bar per company held in df_comp.
plt.figure(figsize=(12, 6))
sns.set(style="whitegrid")
ax = sns.barplot(x=df_comp.index, y=df_comp["Job Title"], palette='Set1')
ax.set_xlabel('Company')
ax.set_ylabel('Job offers')
# Slant the company names so long labels do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Companies with most job offers')

Next we want to check how many jobs were available in each sector. We'll sort it by type of ownership.

In [None]:
# Frequency of each ownership type.
df['Type of ownership'].value_counts()

In [None]:
# Keep the three listed ownership types as they are and bucket everything
# else (missing values included) under 'Other'.
df['New_type_owner'] = df['Type of ownership'].where(
    df['Type of ownership'].isin(['Company - Private', 'Company - Public', 'Nonprofit Organization']),
    'Other',
)
        

In [None]:
# Listings per sector for the ten most frequent sectors, split by the
# grouped ownership type.
plt.figure(figsize=(20, 6))
sns.set(style="whitegrid")
ax = sns.countplot(
    x='Sector',
    hue='New_type_owner',
    data=df,
    order=df['Sector'].value_counts().head(10).index,
    palette='Set1',
)
ax.set_xlabel('Sector')
ax.set_ylabel('Job offers')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(loc='upper right')
ax.set_title('Job offers in each sector of economy')

# Location

In [None]:
# Listings per location for the 25 most frequent locations.
plt.figure(figsize=(20, 6))
ax = sns.countplot(
    x='Location',
    data=df,
    order=df['Location'].value_counts().head(25).index,
    palette='Set1',
)
ax.set_xlabel('City')
ax.set_ylabel('Job offers')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Job offers in different cities')

In [None]:
# Take the text after the LAST comma of the location as the state.
# The original used the second comma-separated piece, which picks up the
# middle part of a three-part location (e.g. a county name) instead of the
# state. Locations without any comma still yield NaN, as before.
df['State'] = df['Location'].str.extract(r',\s*([^,]+)$', expand=False)

In [None]:
# Recode the stray 'Arapahoe' value as 'CO'.
df['State'] = df['State'].replace('Arapahoe', 'CO')

# Show the distinct state values. As the last expression it is actually
# displayed; in its original position before the assignment the result was
# silently discarded.
df['State'].unique()

In [None]:
# Listings per state, most frequent state first.
plt.figure(figsize=(20, 6))
sns.set(style="whitegrid")
ax = sns.countplot(
    x='State',
    data=df,
    order=df['State'].value_counts().index,
    palette='Set1',
)
ax.set_xlabel('State')
ax.set_ylabel('Job offers')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Job offers in different states')

# Easy Apply

In [None]:
# Count of listings for each 'Easy Apply' value, with the count written
# above every bar.
plt.figure(figsize=(4, 5))
ax = sns.countplot(x='Easy Apply', data=df, palette='Set1')
ax.set_xlabel('Easy Apply')
ax.set_ylabel('Job offers with Easy Apply')
ax.set_xticklabels(ax.get_xticklabels())
for bar in ax.patches:
    count = bar.get_height()
    # Offsets place the text roughly centred and slightly above the bar.
    ax.annotate(str(count), (bar.get_x() + 0.35, count + 20))
ax.set_title('Job offers with Easy Apply')

In [None]:
# Listings per company, restricted to rows whose 'Easy Apply' is 'True'.
plt.figure(figsize=(20, 6))
ax = sns.countplot(
    x='Company Name',
    data=df.loc[df['Easy Apply'].eq('True')],
    palette='Set1',
)
ax.set_xlabel('Company')
ax.set_ylabel('Job offers with Easy Apply')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Companies with the Easy Apply job offers')

# Does bigger mean better?

How the size of the company impacts its rating and salary.

In [None]:
# Job titles that occur more than 5 times, with their counts.
df['Job Title'].value_counts().loc[lambda counts: counts > 5]

We deal with the most obvious inconsistent data entries.

In [None]:
# Shorten each title to the text before the first comma.
df['Short Job Title'] = df['Job Title'].str.split(',').str[0]

# Normalise the two spelling variants of the senior/junior titles.
# The original called .apply() twice without assigning the result, so the
# column was never changed and these variants were dropped by the later
# filter on the standard titles.
df['Short Job Title'] = df['Short Job Title'].replace({
    'Sr. Data Analyst': 'Senior Data Analyst',
    'Data Analyst Junior': 'Junior Data Analyst',
})

In [None]:
# Restrict the frame to the three standard analyst titles.
df_1 = df.loc[df['Short Job Title'].isin(['Data Analyst', 'Senior Data Analyst', 'Junior Data Analyst'])]

In [None]:
# Swarm plot of average salary per company size for the titles kept in df_1.
# catplot is a figure-level function that creates its own figure, so the
# original leading plt.figure() only produced an extra empty figure and has
# been removed.
sns.set(style="whitegrid")
chart = sns.catplot(x="Size", y="Salary_avg", hue="Short Job Title", order= ['1 to 50 employees', '51 to 200 employees', '201 to 500 employees','501 to 1000 employees', '1001 to 5000 employees',
       '5001 to 10000 employees', '10000+ employees'], kind="swarm", data=df_1[(df_1['Size'].notnull()) & (df_1['Size']!= 'Unknown')],
        height=6, aspect = 1.5)
# Label typos fixed ('employess' -> 'employees', 'Avarage' -> 'Average').
chart.set(xlabel='Size (employees)', ylabel='Salary')
# Shorter tick labels, listed in the same order as the `order` argument above.
for axes in chart.axes.flat:
    axes.set_xticklabels(['1 to 50', '51 to 200', '201 to 500','501 to 1000', '1001 to 5000',
       '5001 to 10000', '10000+'], rotation=65, horizontalalignment='right')
plt.title('Average salary in the companies')

It seems that there is no strong correlation between the size of the companies and salaries. I'd guess that such a correlation should be visible between salary and the location of the company.

In [None]:
# Swarm plot of average salary per state for the titles kept in df_1.
# catplot is a figure-level function that creates its own figure, so the
# original leading plt.figure() only produced an extra empty figure and has
# been removed.
sns.set(style="whitegrid")
chart = sns.catplot(x="State", y="Salary_avg", hue="Short Job Title", kind="swarm", data=df_1,
                    height=6, aspect=2.5)
chart.set(xlabel='State', ylabel='Salary')
plt.title('Salary in different states')

In [None]:
# Display order for the company-size categories, smallest first, with
# 'Unknown' last.
size_order = ['1 to 50 employees', '51 to 200 employees', '201 to 500 employees','501 to 1000 employees', '1001 to 5000 employees',
       '5001 to 10000 employees', '10000+ employees', 'Unknown']

# Min / mean / max rating per company size.
# String aggregation names replace the numpy callables of the original:
# passing np.min / np.mean / np.max to agg is deprecated in recent pandas,
# and the column names they produce ('amin' vs 'min') differ by version,
# which made the follow-up rename fragile ('amean' never matched anything).
# The result has columns Size, Rating_min, Rating_mean, Rating_max.
rating_df = (
    df.groupby('Size')['Rating']
      .agg(['min', 'mean', 'max'])
      .add_prefix('Rating_')
      .reindex(size_order)
      .reset_index()
)
rating_df


In [None]:
# Line plots of average salary against rating, one facet per company size
# ('Unknown' is excluded from both the data and the facet order).
# relplot is a figure-level function that creates its own figure, so the
# original leading plt.figure() only produced an extra empty figure and has
# been removed.
chart = sns.relplot(x="Rating", y="Salary_avg", col='Size', kind="line", col_wrap=4,
                    data=df[df.Size != 'Unknown'], col_order=size_order[:-1])
# plt.title would overwrite only the last facet's title; a figure-level
# suptitle labels the whole grid. y > 1 keeps it above the facet titles.
# Title typos fixed ('Avarage', 'diffferent').
chart.fig.suptitle('Average salary VS rating for different size of the company', y=1.02)
